unicodeobject.c revision 315aa404030f425a8bf7fdb5a5275c118555bc37
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/*[clinic input] 51class str "PyUnicodeObject *" "&PyUnicode_Type" 52[clinic start generated code]*/ 53/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/ 54 55/* --- Globals ------------------------------------------------------------ 56 57NOTE: In the interpreter's initialization phase, some globals are currently 58 initialized dynamically as needed. In the process Unicode objects may 59 be created before the Unicode type is ready. 60 61*/ 62 63 64#ifdef __cplusplus 65extern "C" { 66#endif 67 68/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 69#define MAX_UNICODE 0x10ffff 70 71#ifdef Py_DEBUG 72# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 73#else 74# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 75#endif 76 77#define _PyUnicode_UTF8(op) \ 78 (((PyCompactUnicodeObject*)(op))->utf8) 79#define PyUnicode_UTF8(op) \ 80 (assert(_PyUnicode_CHECK(op)), \ 81 assert(PyUnicode_IS_READY(op)), \ 82 PyUnicode_IS_COMPACT_ASCII(op) ? \ 83 ((char*)((PyASCIIObject*)(op) + 1)) : \ 84 _PyUnicode_UTF8(op)) 85#define _PyUnicode_UTF8_LENGTH(op) \ 86 (((PyCompactUnicodeObject*)(op))->utf8_length) 87#define PyUnicode_UTF8_LENGTH(op) \ 88 (assert(_PyUnicode_CHECK(op)), \ 89 assert(PyUnicode_IS_READY(op)), \ 90 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 91 ((PyASCIIObject*)(op))->length : \ 92 _PyUnicode_UTF8_LENGTH(op)) 93#define _PyUnicode_WSTR(op) \ 94 (((PyASCIIObject*)(op))->wstr) 95#define _PyUnicode_WSTR_LENGTH(op) \ 96 (((PyCompactUnicodeObject*)(op))->wstr_length) 97#define _PyUnicode_LENGTH(op) \ 98 (((PyASCIIObject *)(op))->length) 99#define _PyUnicode_STATE(op) \ 100 (((PyASCIIObject *)(op))->state) 101#define _PyUnicode_HASH(op) \ 102 (((PyASCIIObject *)(op))->hash) 103#define _PyUnicode_KIND(op) \ 104 (assert(_PyUnicode_CHECK(op)), \ 105 ((PyASCIIObject *)(op))->state.kind) 106#define _PyUnicode_GET_LENGTH(op) \ 107 (assert(_PyUnicode_CHECK(op)), \ 108 ((PyASCIIObject *)(op))->length) 109#define _PyUnicode_DATA_ANY(op) \ 110 (((PyUnicodeObject*)(op))->data.any) 111 112#undef PyUnicode_READY 113#define PyUnicode_READY(op) \ 114 (assert(_PyUnicode_CHECK(op)), \ 115 (PyUnicode_IS_READY(op) ? \ 116 0 : \ 117 _PyUnicode_Ready(op))) 118 119#define _PyUnicode_SHARE_UTF8(op) \ 120 (assert(_PyUnicode_CHECK(op)), \ 121 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 122 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 123#define _PyUnicode_SHARE_WSTR(op) \ 124 (assert(_PyUnicode_CHECK(op)), \ 125 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 126 127/* true if the Unicode object has an allocated UTF-8 memory block 128 (not shared with other data) */ 129#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 130 ((!PyUnicode_IS_COMPACT_ASCII(op) \ 131 && _PyUnicode_UTF8(op) \ 132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 133 134/* true if the Unicode object has an allocated wstr memory block 135 (not shared with other data) */ 136#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 137 ((_PyUnicode_WSTR(op) && \ 138 (!PyUnicode_IS_READY(op) || \ 139 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 140 141/* Generic helper macro to convert characters of different types. 142 from_type and to_type have to be valid type names, begin and end 143 are pointers to the source characters which should be of type 144 "from_type *". 
to is a pointer of type "to_type *" and points to the 145 buffer where the result characters are written to. */ 146#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 147 do { \ 148 to_type *_to = (to_type *)(to); \ 149 const from_type *_iter = (from_type *)(begin); \ 150 const from_type *_end = (from_type *)(end); \ 151 Py_ssize_t n = (_end) - (_iter); \ 152 const from_type *_unrolled_end = \ 153 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 154 while (_iter < (_unrolled_end)) { \ 155 _to[0] = (to_type) _iter[0]; \ 156 _to[1] = (to_type) _iter[1]; \ 157 _to[2] = (to_type) _iter[2]; \ 158 _to[3] = (to_type) _iter[3]; \ 159 _iter += 4; _to += 4; \ 160 } \ 161 while (_iter < (_end)) \ 162 *_to++ = (to_type) *_iter++; \ 163 } while (0) 164 165/* This dictionary holds all interned unicode strings. Note that references 166 to strings in this dictionary are *not* counted in the string's ob_refcnt. 167 When the interned string reaches a refcnt of 0 the string deallocation 168 function will delete the reference from this dictionary. 169 170 Another way to look at this is that to say that the actual reference 171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 172*/ 173static PyObject *interned = NULL; 174 175/* The empty Unicode object is shared to improve performance. */ 176static PyObject *unicode_empty = NULL; 177 178#define _Py_INCREF_UNICODE_EMPTY() \ 179 do { \ 180 if (unicode_empty != NULL) \ 181 Py_INCREF(unicode_empty); \ 182 else { \ 183 unicode_empty = PyUnicode_New(0, 0); \ 184 if (unicode_empty != NULL) { \ 185 Py_INCREF(unicode_empty); \ 186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ 187 } \ 188 } \ 189 } while (0) 190 191#define _Py_RETURN_UNICODE_EMPTY() \ 192 do { \ 193 _Py_INCREF_UNICODE_EMPTY(); \ 194 return unicode_empty; \ 195 } while (0) 196 197/* Forward declaration */ 198Py_LOCAL_INLINE(int) 199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); 200 201/* List of static strings. 
*/ 202static _Py_Identifier *static_strings = NULL; 203 204/* Single character Unicode strings in the Latin-1 range are being 205 shared as well. */ 206static PyObject *unicode_latin1[256] = {NULL}; 207 208/* Fast detection of the most frequent whitespace characters */ 209const unsigned char _Py_ascii_whitespace[] = { 210 0, 0, 0, 0, 0, 0, 0, 0, 211/* case 0x0009: * CHARACTER TABULATION */ 212/* case 0x000A: * LINE FEED */ 213/* case 0x000B: * LINE TABULATION */ 214/* case 0x000C: * FORM FEED */ 215/* case 0x000D: * CARRIAGE RETURN */ 216 0, 1, 1, 1, 1, 1, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218/* case 0x001C: * FILE SEPARATOR */ 219/* case 0x001D: * GROUP SEPARATOR */ 220/* case 0x001E: * RECORD SEPARATOR */ 221/* case 0x001F: * UNIT SEPARATOR */ 222 0, 0, 0, 0, 1, 1, 1, 1, 223/* case 0x0020: * SPACE */ 224 1, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 229 0, 0, 0, 0, 0, 0, 0, 0, 230 0, 0, 0, 0, 0, 0, 0, 0, 231 0, 0, 0, 0, 0, 0, 0, 0, 232 0, 0, 0, 0, 0, 0, 0, 0, 233 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 236 0, 0, 0, 0, 0, 0, 0, 0 237}; 238 239/* forward */ 240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 241static PyObject* get_latin1_char(unsigned char ch); 242static int unicode_modifiable(PyObject *unicode); 243 244 245static PyObject * 246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size); 247static PyObject * 248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 249static PyObject * 250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 251 252static PyObject * 253unicode_encode_call_errorhandler(const char *errors, 254 PyObject **errorHandler,const char *encoding, const char *reason, 255 PyObject *unicode, PyObject **exceptionObject, 256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 257 258static void 259raise_encode_exception(PyObject **exceptionObject, 260 const char *encoding, 261 PyObject *unicode, 262 
Py_ssize_t startpos, Py_ssize_t endpos, 263 const char *reason); 264 265/* Same for linebreaks */ 266static unsigned char ascii_linebreak[] = { 267 0, 0, 0, 0, 0, 0, 0, 0, 268/* 0x000A, * LINE FEED */ 269/* 0x000B, * LINE TABULATION */ 270/* 0x000C, * FORM FEED */ 271/* 0x000D, * CARRIAGE RETURN */ 272 0, 0, 1, 1, 1, 1, 0, 0, 273 0, 0, 0, 0, 0, 0, 0, 0, 274/* 0x001C, * FILE SEPARATOR */ 275/* 0x001D, * GROUP SEPARATOR */ 276/* 0x001E, * RECORD SEPARATOR */ 277 0, 0, 0, 0, 1, 1, 1, 0, 278 0, 0, 0, 0, 0, 0, 0, 0, 279 0, 0, 0, 0, 0, 0, 0, 0, 280 0, 0, 0, 0, 0, 0, 0, 0, 281 0, 0, 0, 0, 0, 0, 0, 0, 282 283 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0, 285 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 287 0, 0, 0, 0, 0, 0, 0, 0, 288 0, 0, 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0, 0, 0, 290 0, 0, 0, 0, 0, 0, 0, 0 291}; 292 293/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 294 This function is kept for backward compatibility with the old API. */ 295Py_UNICODE 296PyUnicode_GetMax(void) 297{ 298#ifdef Py_UNICODE_WIDE 299 return 0x10FFFF; 300#else 301 /* This is actually an illegal character, so it should 302 not be passed to unichr. 
*/ 303 return 0xFFFF; 304#endif 305} 306 307#ifdef Py_DEBUG 308int 309_PyUnicode_CheckConsistency(PyObject *op, int check_content) 310{ 311 PyASCIIObject *ascii; 312 unsigned int kind; 313 314 assert(PyUnicode_Check(op)); 315 316 ascii = (PyASCIIObject *)op; 317 kind = ascii->state.kind; 318 319 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 320 assert(kind == PyUnicode_1BYTE_KIND); 321 assert(ascii->state.ready == 1); 322 } 323 else { 324 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 325 void *data; 326 327 if (ascii->state.compact == 1) { 328 data = compact + 1; 329 assert(kind == PyUnicode_1BYTE_KIND 330 || kind == PyUnicode_2BYTE_KIND 331 || kind == PyUnicode_4BYTE_KIND); 332 assert(ascii->state.ascii == 0); 333 assert(ascii->state.ready == 1); 334 assert (compact->utf8 != data); 335 } 336 else { 337 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 338 339 data = unicode->data.any; 340 if (kind == PyUnicode_WCHAR_KIND) { 341 assert(ascii->length == 0); 342 assert(ascii->hash == -1); 343 assert(ascii->state.compact == 0); 344 assert(ascii->state.ascii == 0); 345 assert(ascii->state.ready == 0); 346 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 347 assert(ascii->wstr != NULL); 348 assert(data == NULL); 349 assert(compact->utf8 == NULL); 350 } 351 else { 352 assert(kind == PyUnicode_1BYTE_KIND 353 || kind == PyUnicode_2BYTE_KIND 354 || kind == PyUnicode_4BYTE_KIND); 355 assert(ascii->state.compact == 0); 356 assert(ascii->state.ready == 1); 357 assert(data != NULL); 358 if (ascii->state.ascii) { 359 assert (compact->utf8 == data); 360 assert (compact->utf8_length == ascii->length); 361 } 362 else 363 assert (compact->utf8 != data); 364 } 365 } 366 if (kind != PyUnicode_WCHAR_KIND) { 367 if ( 368#if SIZEOF_WCHAR_T == 2 369 kind == PyUnicode_2BYTE_KIND 370#else 371 kind == PyUnicode_4BYTE_KIND 372#endif 373 ) 374 { 375 assert(ascii->wstr == data); 376 assert(compact->wstr_length == ascii->length); 377 } else 378 
assert(ascii->wstr != data); 379 } 380 381 if (compact->utf8 == NULL) 382 assert(compact->utf8_length == 0); 383 if (ascii->wstr == NULL) 384 assert(compact->wstr_length == 0); 385 } 386 /* check that the best kind is used */ 387 if (check_content && kind != PyUnicode_WCHAR_KIND) 388 { 389 Py_ssize_t i; 390 Py_UCS4 maxchar = 0; 391 void *data; 392 Py_UCS4 ch; 393 394 data = PyUnicode_DATA(ascii); 395 for (i=0; i < ascii->length; i++) 396 { 397 ch = PyUnicode_READ(kind, data, i); 398 if (ch > maxchar) 399 maxchar = ch; 400 } 401 if (kind == PyUnicode_1BYTE_KIND) { 402 if (ascii->state.ascii == 0) { 403 assert(maxchar >= 128); 404 assert(maxchar <= 255); 405 } 406 else 407 assert(maxchar < 128); 408 } 409 else if (kind == PyUnicode_2BYTE_KIND) { 410 assert(maxchar >= 0x100); 411 assert(maxchar <= 0xFFFF); 412 } 413 else { 414 assert(maxchar >= 0x10000); 415 assert(maxchar <= MAX_UNICODE); 416 } 417 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 418 } 419 return 1; 420} 421#endif 422 423static PyObject* 424unicode_result_wchar(PyObject *unicode) 425{ 426#ifndef Py_DEBUG 427 Py_ssize_t len; 428 429 len = _PyUnicode_WSTR_LENGTH(unicode); 430 if (len == 0) { 431 Py_DECREF(unicode); 432 _Py_RETURN_UNICODE_EMPTY(); 433 } 434 435 if (len == 1) { 436 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 437 if ((Py_UCS4)ch < 256) { 438 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 439 Py_DECREF(unicode); 440 return latin1_char; 441 } 442 } 443 444 if (_PyUnicode_Ready(unicode) < 0) { 445 Py_DECREF(unicode); 446 return NULL; 447 } 448#else 449 assert(Py_REFCNT(unicode) == 1); 450 451 /* don't make the result ready in debug mode to ensure that the caller 452 makes the string ready before using it */ 453 assert(_PyUnicode_CheckConsistency(unicode, 1)); 454#endif 455 return unicode; 456} 457 458static PyObject* 459unicode_result_ready(PyObject *unicode) 460{ 461 Py_ssize_t length; 462 463 length = PyUnicode_GET_LENGTH(unicode); 464 if (length == 0) { 465 if (unicode 
!= unicode_empty) { 466 Py_DECREF(unicode); 467 _Py_RETURN_UNICODE_EMPTY(); 468 } 469 return unicode_empty; 470 } 471 472 if (length == 1) { 473 void *data = PyUnicode_DATA(unicode); 474 int kind = PyUnicode_KIND(unicode); 475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 476 if (ch < 256) { 477 PyObject *latin1_char = unicode_latin1[ch]; 478 if (latin1_char != NULL) { 479 if (unicode != latin1_char) { 480 Py_INCREF(latin1_char); 481 Py_DECREF(unicode); 482 } 483 return latin1_char; 484 } 485 else { 486 assert(_PyUnicode_CheckConsistency(unicode, 1)); 487 Py_INCREF(unicode); 488 unicode_latin1[ch] = unicode; 489 return unicode; 490 } 491 } 492 } 493 494 assert(_PyUnicode_CheckConsistency(unicode, 1)); 495 return unicode; 496} 497 498static PyObject* 499unicode_result(PyObject *unicode) 500{ 501 assert(_PyUnicode_CHECK(unicode)); 502 if (PyUnicode_IS_READY(unicode)) 503 return unicode_result_ready(unicode); 504 else 505 return unicode_result_wchar(unicode); 506} 507 508static PyObject* 509unicode_result_unchanged(PyObject *unicode) 510{ 511 if (PyUnicode_CheckExact(unicode)) { 512 if (PyUnicode_READY(unicode) == -1) 513 return NULL; 514 Py_INCREF(unicode); 515 return unicode; 516 } 517 else 518 /* Subtype -- return genuine unicode string with the same value. */ 519 return _PyUnicode_Copy(unicode); 520} 521 522#ifdef HAVE_MBCS 523static OSVERSIONINFOEX winver; 524#endif 525 526/* --- Bloom Filters ----------------------------------------------------- */ 527 528/* stuff to implement simple "bloom filters" for Unicode characters. 529 to keep things simple, we use a single bitmask, using the least 5 530 bits from each unicode characters as the bit index. 
*/ 531 532/* the linebreak mask is set up by Unicode_Init below */ 533 534#if LONG_BIT >= 128 535#define BLOOM_WIDTH 128 536#elif LONG_BIT >= 64 537#define BLOOM_WIDTH 64 538#elif LONG_BIT >= 32 539#define BLOOM_WIDTH 32 540#else 541#error "LONG_BIT is smaller than 32" 542#endif 543 544#define BLOOM_MASK unsigned long 545 546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 547 548#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 549 550#define BLOOM_LINEBREAK(ch) \ 551 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 553 554Py_LOCAL_INLINE(BLOOM_MASK) 555make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 556{ 557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 558 do { \ 559 TYPE *data = (TYPE *)PTR; \ 560 TYPE *end = data + LEN; \ 561 Py_UCS4 ch; \ 562 for (; data != end; data++) { \ 563 ch = *data; \ 564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 565 } \ 566 break; \ 567 } while (0) 568 569 /* calculate simple bloom-style bitmask for a given unicode string */ 570 571 BLOOM_MASK mask; 572 573 mask = 0; 574 switch (kind) { 575 case PyUnicode_1BYTE_KIND: 576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 577 break; 578 case PyUnicode_2BYTE_KIND: 579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 580 break; 581 case PyUnicode_4BYTE_KIND: 582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 583 break; 584 default: 585 assert(0); 586 } 587 return mask; 588 589#undef BLOOM_UPDATE 590} 591 592/* Compilation of templated routines */ 593 594#include "stringlib/asciilib.h" 595#include "stringlib/fastsearch.h" 596#include "stringlib/partition.h" 597#include "stringlib/split.h" 598#include "stringlib/count.h" 599#include "stringlib/find.h" 600#include "stringlib/find_max_char.h" 601#include "stringlib/localeutil.h" 602#include "stringlib/undef.h" 603 604#include "stringlib/ucs1lib.h" 605#include "stringlib/fastsearch.h" 606#include "stringlib/partition.h" 607#include "stringlib/split.h" 608#include "stringlib/count.h" 
609#include "stringlib/find.h" 610#include "stringlib/replace.h" 611#include "stringlib/find_max_char.h" 612#include "stringlib/localeutil.h" 613#include "stringlib/undef.h" 614 615#include "stringlib/ucs2lib.h" 616#include "stringlib/fastsearch.h" 617#include "stringlib/partition.h" 618#include "stringlib/split.h" 619#include "stringlib/count.h" 620#include "stringlib/find.h" 621#include "stringlib/replace.h" 622#include "stringlib/find_max_char.h" 623#include "stringlib/localeutil.h" 624#include "stringlib/undef.h" 625 626#include "stringlib/ucs4lib.h" 627#include "stringlib/fastsearch.h" 628#include "stringlib/partition.h" 629#include "stringlib/split.h" 630#include "stringlib/count.h" 631#include "stringlib/find.h" 632#include "stringlib/replace.h" 633#include "stringlib/find_max_char.h" 634#include "stringlib/localeutil.h" 635#include "stringlib/undef.h" 636 637#include "stringlib/unicodedefs.h" 638#include "stringlib/fastsearch.h" 639#include "stringlib/count.h" 640#include "stringlib/find.h" 641#include "stringlib/undef.h" 642 643/* --- Unicode Object ----------------------------------------------------- */ 644 645static PyObject * 646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 647 648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 649 Py_ssize_t size, Py_UCS4 ch, 650 int direction) 651{ 652 int mode = (direction == 1) ? 
FAST_SEARCH : FAST_RSEARCH; 653 654 switch (kind) { 655 case PyUnicode_1BYTE_KIND: 656 { 657 Py_UCS1 ch1 = (Py_UCS1) ch; 658 if (ch1 == ch) 659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 660 else 661 return -1; 662 } 663 case PyUnicode_2BYTE_KIND: 664 { 665 Py_UCS2 ch2 = (Py_UCS2) ch; 666 if (ch2 == ch) 667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 668 else 669 return -1; 670 } 671 case PyUnicode_4BYTE_KIND: 672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 673 default: 674 assert(0); 675 return -1; 676 } 677} 678 679#ifdef Py_DEBUG 680/* Fill the data of an Unicode string with invalid characters to detect bugs 681 earlier. 682 683 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 684 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 685 invalid character in Unicode 6.0. */ 686static void 687unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 688{ 689 int kind = PyUnicode_KIND(unicode); 690 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 691 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 692 if (length <= old_length) 693 return; 694 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 695} 696#endif 697 698static PyObject* 699resize_compact(PyObject *unicode, Py_ssize_t length) 700{ 701 Py_ssize_t char_size; 702 Py_ssize_t struct_size; 703 Py_ssize_t new_size; 704 int share_wstr; 705 PyObject *new_unicode; 706#ifdef Py_DEBUG 707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 708#endif 709 710 assert(unicode_modifiable(unicode)); 711 assert(PyUnicode_IS_READY(unicode)); 712 assert(PyUnicode_IS_COMPACT(unicode)); 713 714 char_size = PyUnicode_KIND(unicode); 715 if (PyUnicode_IS_ASCII(unicode)) 716 struct_size = sizeof(PyASCIIObject); 717 else 718 struct_size = sizeof(PyCompactUnicodeObject); 719 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 720 721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 722 
PyErr_NoMemory(); 723 return NULL; 724 } 725 new_size = (struct_size + (length + 1) * char_size); 726 727 _Py_DEC_REFTOTAL; 728 _Py_ForgetReference(unicode); 729 730 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size); 731 if (new_unicode == NULL) { 732 _Py_NewReference(unicode); 733 PyErr_NoMemory(); 734 return NULL; 735 } 736 unicode = new_unicode; 737 _Py_NewReference(unicode); 738 739 _PyUnicode_LENGTH(unicode) = length; 740 if (share_wstr) { 741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 742 if (!PyUnicode_IS_ASCII(unicode)) 743 _PyUnicode_WSTR_LENGTH(unicode) = length; 744 } 745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 746 PyObject_DEL(_PyUnicode_WSTR(unicode)); 747 _PyUnicode_WSTR(unicode) = NULL; 748 } 749#ifdef Py_DEBUG 750 unicode_fill_invalid(unicode, old_length); 751#endif 752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 753 length, 0); 754 assert(_PyUnicode_CheckConsistency(unicode, 0)); 755 return unicode; 756} 757 758static int 759resize_inplace(PyObject *unicode, Py_ssize_t length) 760{ 761 wchar_t *wstr; 762 Py_ssize_t new_size; 763 assert(!PyUnicode_IS_COMPACT(unicode)); 764 assert(Py_REFCNT(unicode) == 1); 765 766 if (PyUnicode_IS_READY(unicode)) { 767 Py_ssize_t char_size; 768 int share_wstr, share_utf8; 769 void *data; 770#ifdef Py_DEBUG 771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 772#endif 773 774 data = _PyUnicode_DATA_ANY(unicode); 775 char_size = PyUnicode_KIND(unicode); 776 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 778 779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 780 PyErr_NoMemory(); 781 return -1; 782 } 783 new_size = (length + 1) * char_size; 784 785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 786 { 787 PyObject_DEL(_PyUnicode_UTF8(unicode)); 788 _PyUnicode_UTF8(unicode) = NULL; 789 _PyUnicode_UTF8_LENGTH(unicode) = 0; 790 } 791 792 data = (PyObject *)PyObject_REALLOC(data, new_size); 793 if (data == NULL) { 
794 PyErr_NoMemory(); 795 return -1; 796 } 797 _PyUnicode_DATA_ANY(unicode) = data; 798 if (share_wstr) { 799 _PyUnicode_WSTR(unicode) = data; 800 _PyUnicode_WSTR_LENGTH(unicode) = length; 801 } 802 if (share_utf8) { 803 _PyUnicode_UTF8(unicode) = data; 804 _PyUnicode_UTF8_LENGTH(unicode) = length; 805 } 806 _PyUnicode_LENGTH(unicode) = length; 807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 808#ifdef Py_DEBUG 809 unicode_fill_invalid(unicode, old_length); 810#endif 811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 812 assert(_PyUnicode_CheckConsistency(unicode, 0)); 813 return 0; 814 } 815 } 816 assert(_PyUnicode_WSTR(unicode) != NULL); 817 818 /* check for integer overflow */ 819 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) { 820 PyErr_NoMemory(); 821 return -1; 822 } 823 new_size = sizeof(wchar_t) * (length + 1); 824 wstr = _PyUnicode_WSTR(unicode); 825 wstr = PyObject_REALLOC(wstr, new_size); 826 if (!wstr) { 827 PyErr_NoMemory(); 828 return -1; 829 } 830 _PyUnicode_WSTR(unicode) = wstr; 831 _PyUnicode_WSTR(unicode)[length] = 0; 832 _PyUnicode_WSTR_LENGTH(unicode) = length; 833 assert(_PyUnicode_CheckConsistency(unicode, 0)); 834 return 0; 835} 836 837static PyObject* 838resize_copy(PyObject *unicode, Py_ssize_t length) 839{ 840 Py_ssize_t copy_length; 841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 842 PyObject *copy; 843 844 if (PyUnicode_READY(unicode) == -1) 845 return NULL; 846 847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 848 if (copy == NULL) 849 return NULL; 850 851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 853 return copy; 854 } 855 else { 856 PyObject *w; 857 858 w = (PyObject*)_PyUnicode_New(length); 859 if (w == NULL) 860 return NULL; 861 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 862 copy_length = Py_MIN(copy_length, length); 863 Py_MEMCPY(_PyUnicode_WSTR(w), 
_PyUnicode_WSTR(unicode), 864 copy_length * sizeof(wchar_t)); 865 return w; 866 } 867} 868 869/* We allocate one more byte to make sure the string is 870 Ux0000 terminated; some code (e.g. new_identifier) 871 relies on that. 872 873 XXX This allocator could further be enhanced by assuring that the 874 free list never reduces its size below 1. 875 876*/ 877 878static PyUnicodeObject * 879_PyUnicode_New(Py_ssize_t length) 880{ 881 PyUnicodeObject *unicode; 882 size_t new_size; 883 884 /* Optimization for empty strings */ 885 if (length == 0 && unicode_empty != NULL) { 886 Py_INCREF(unicode_empty); 887 return (PyUnicodeObject*)unicode_empty; 888 } 889 890 /* Ensure we won't overflow the size. */ 891 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 892 return (PyUnicodeObject *)PyErr_NoMemory(); 893 } 894 if (length < 0) { 895 PyErr_SetString(PyExc_SystemError, 896 "Negative size passed to _PyUnicode_New"); 897 return NULL; 898 } 899 900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 901 if (unicode == NULL) 902 return NULL; 903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 904 905 _PyUnicode_WSTR_LENGTH(unicode) = length; 906 _PyUnicode_HASH(unicode) = -1; 907 _PyUnicode_STATE(unicode).interned = 0; 908 _PyUnicode_STATE(unicode).kind = 0; 909 _PyUnicode_STATE(unicode).compact = 0; 910 _PyUnicode_STATE(unicode).ready = 0; 911 _PyUnicode_STATE(unicode).ascii = 0; 912 _PyUnicode_DATA_ANY(unicode) = NULL; 913 _PyUnicode_LENGTH(unicode) = 0; 914 _PyUnicode_UTF8(unicode) = NULL; 915 _PyUnicode_UTF8_LENGTH(unicode) = 0; 916 917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 918 if (!_PyUnicode_WSTR(unicode)) { 919 Py_DECREF(unicode); 920 PyErr_NoMemory(); 921 return NULL; 922 } 923 924 /* Initialize the first element to guard against cases where 925 * the caller fails before initializing str -- unicode_resize() 926 * reads str[0], and the Keep-Alive optimization can keep memory 927 * allocated for str alive 
across a call to unicode_dealloc(unicode). 928 * We don't want unicode_resize to read uninitialized memory in 929 * that case. 930 */ 931 _PyUnicode_WSTR(unicode)[0] = 0; 932 _PyUnicode_WSTR(unicode)[length] = 0; 933 934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 935 return unicode; 936} 937 938static const char* 939unicode_kind_name(PyObject *unicode) 940{ 941 /* don't check consistency: unicode_kind_name() is called from 942 _PyUnicode_Dump() */ 943 if (!PyUnicode_IS_COMPACT(unicode)) 944 { 945 if (!PyUnicode_IS_READY(unicode)) 946 return "wstr"; 947 switch (PyUnicode_KIND(unicode)) 948 { 949 case PyUnicode_1BYTE_KIND: 950 if (PyUnicode_IS_ASCII(unicode)) 951 return "legacy ascii"; 952 else 953 return "legacy latin1"; 954 case PyUnicode_2BYTE_KIND: 955 return "legacy UCS2"; 956 case PyUnicode_4BYTE_KIND: 957 return "legacy UCS4"; 958 default: 959 return "<legacy invalid kind>"; 960 } 961 } 962 assert(PyUnicode_IS_READY(unicode)); 963 switch (PyUnicode_KIND(unicode)) { 964 case PyUnicode_1BYTE_KIND: 965 if (PyUnicode_IS_ASCII(unicode)) 966 return "ascii"; 967 else 968 return "latin1"; 969 case PyUnicode_2BYTE_KIND: 970 return "UCS2"; 971 case PyUnicode_4BYTE_KIND: 972 return "UCS4"; 973 default: 974 return "<invalid compact kind>"; 975 } 976} 977 978#ifdef Py_DEBUG 979/* Functions wrapping macros for use in debugger */ 980char *_PyUnicode_utf8(void *unicode){ 981 return PyUnicode_UTF8(unicode); 982} 983 984void *_PyUnicode_compact_data(void *unicode) { 985 return _PyUnicode_COMPACT_DATA(unicode); 986} 987void *_PyUnicode_data(void *unicode){ 988 printf("obj %p\n", unicode); 989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 994 return 
PyUnicode_DATA(unicode); 995} 996 997void 998_PyUnicode_Dump(PyObject *op) 999{ 1000 PyASCIIObject *ascii = (PyASCIIObject *)op; 1001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 1003 void *data; 1004 1005 if (ascii->state.compact) 1006 { 1007 if (ascii->state.ascii) 1008 data = (ascii + 1); 1009 else 1010 data = (compact + 1); 1011 } 1012 else 1013 data = unicode->data.any; 1014 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", 1015 unicode_kind_name(op), ascii->length); 1016 1017 if (ascii->wstr == data) 1018 printf("shared "); 1019 printf("wstr=%p", ascii->wstr); 1020 1021 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 1022 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); 1023 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 1024 printf("shared "); 1025 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", 1026 compact->utf8, compact->utf8_length); 1027 } 1028 printf(", data=%p\n", data); 1029} 1030#endif 1031 1032PyObject * 1033PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 1034{ 1035 PyObject *obj; 1036 PyCompactUnicodeObject *unicode; 1037 void *data; 1038 enum PyUnicode_Kind kind; 1039 int is_sharing, is_ascii; 1040 Py_ssize_t char_size; 1041 Py_ssize_t struct_size; 1042 1043 /* Optimization for empty strings */ 1044 if (size == 0 && unicode_empty != NULL) { 1045 Py_INCREF(unicode_empty); 1046 return unicode_empty; 1047 } 1048 1049 is_ascii = 0; 1050 is_sharing = 0; 1051 struct_size = sizeof(PyCompactUnicodeObject); 1052 if (maxchar < 128) { 1053 kind = PyUnicode_1BYTE_KIND; 1054 char_size = 1; 1055 is_ascii = 1; 1056 struct_size = sizeof(PyASCIIObject); 1057 } 1058 else if (maxchar < 256) { 1059 kind = PyUnicode_1BYTE_KIND; 1060 char_size = 1; 1061 } 1062 else if (maxchar < 65536) { 1063 kind = PyUnicode_2BYTE_KIND; 1064 char_size = 2; 1065 if (sizeof(wchar_t) == 2) 1066 is_sharing = 1; 1067 } 1068 else { 1069 if (maxchar > MAX_UNICODE) { 1070 
PyErr_SetString(PyExc_SystemError, 1071 "invalid maximum character passed to PyUnicode_New"); 1072 return NULL; 1073 } 1074 kind = PyUnicode_4BYTE_KIND; 1075 char_size = 4; 1076 if (sizeof(wchar_t) == 4) 1077 is_sharing = 1; 1078 } 1079 1080 /* Ensure we won't overflow the size. */ 1081 if (size < 0) { 1082 PyErr_SetString(PyExc_SystemError, 1083 "Negative size passed to PyUnicode_New"); 1084 return NULL; 1085 } 1086 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1087 return PyErr_NoMemory(); 1088 1089 /* Duplicated allocation code from _PyObject_New() instead of a call to 1090 * PyObject_New() so we are able to allocate space for the object and 1091 * it's data buffer. 1092 */ 1093 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1094 if (obj == NULL) 1095 return PyErr_NoMemory(); 1096 obj = PyObject_INIT(obj, &PyUnicode_Type); 1097 if (obj == NULL) 1098 return NULL; 1099 1100 unicode = (PyCompactUnicodeObject *)obj; 1101 if (is_ascii) 1102 data = ((PyASCIIObject*)obj) + 1; 1103 else 1104 data = unicode + 1; 1105 _PyUnicode_LENGTH(unicode) = size; 1106 _PyUnicode_HASH(unicode) = -1; 1107 _PyUnicode_STATE(unicode).interned = 0; 1108 _PyUnicode_STATE(unicode).kind = kind; 1109 _PyUnicode_STATE(unicode).compact = 1; 1110 _PyUnicode_STATE(unicode).ready = 1; 1111 _PyUnicode_STATE(unicode).ascii = is_ascii; 1112 if (is_ascii) { 1113 ((char*)data)[size] = 0; 1114 _PyUnicode_WSTR(unicode) = NULL; 1115 } 1116 else if (kind == PyUnicode_1BYTE_KIND) { 1117 ((char*)data)[size] = 0; 1118 _PyUnicode_WSTR(unicode) = NULL; 1119 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1120 unicode->utf8 = NULL; 1121 unicode->utf8_length = 0; 1122 } 1123 else { 1124 unicode->utf8 = NULL; 1125 unicode->utf8_length = 0; 1126 if (kind == PyUnicode_2BYTE_KIND) 1127 ((Py_UCS2*)data)[size] = 0; 1128 else /* kind == PyUnicode_4BYTE_KIND */ 1129 ((Py_UCS4*)data)[size] = 0; 1130 if (is_sharing) { 1131 _PyUnicode_WSTR_LENGTH(unicode) = size; 1132 _PyUnicode_WSTR(unicode) 
= (wchar_t *)data; 1133 } 1134 else { 1135 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1136 _PyUnicode_WSTR(unicode) = NULL; 1137 } 1138 } 1139#ifdef Py_DEBUG 1140 unicode_fill_invalid((PyObject*)unicode, 0); 1141#endif 1142 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1143 return obj; 1144} 1145 1146#if SIZEOF_WCHAR_T == 2 1147/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1148 will decode surrogate pairs, the other conversions are implemented as macros 1149 for efficiency. 1150 1151 This function assumes that unicode can hold one more code point than wstr 1152 characters for a terminating null character. */ 1153static void 1154unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1155 PyObject *unicode) 1156{ 1157 const wchar_t *iter; 1158 Py_UCS4 *ucs4_out; 1159 1160 assert(unicode != NULL); 1161 assert(_PyUnicode_CHECK(unicode)); 1162 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1163 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1164 1165 for (iter = begin; iter < end; ) { 1166 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1167 _PyUnicode_GET_LENGTH(unicode))); 1168 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1169 && (iter+1) < end 1170 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1171 { 1172 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1173 iter += 2; 1174 } 1175 else { 1176 *ucs4_out++ = *iter; 1177 iter++; 1178 } 1179 } 1180 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1181 _PyUnicode_GET_LENGTH(unicode))); 1182 1183} 1184#endif 1185 1186static int 1187unicode_check_modifiable(PyObject *unicode) 1188{ 1189 if (!unicode_modifiable(unicode)) { 1190 PyErr_SetString(PyExc_SystemError, 1191 "Cannot modify a string currently used"); 1192 return -1; 1193 } 1194 return 0; 1195} 1196 1197static int 1198_copy_characters(PyObject *to, Py_ssize_t to_start, 1199 PyObject *from, Py_ssize_t from_start, 1200 Py_ssize_t how_many, int check_maxchar) 1201{ 1202 unsigned int from_kind, 
to_kind; 1203 void *from_data, *to_data; 1204 1205 assert(0 <= how_many); 1206 assert(0 <= from_start); 1207 assert(0 <= to_start); 1208 assert(PyUnicode_Check(from)); 1209 assert(PyUnicode_IS_READY(from)); 1210 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1211 1212 assert(PyUnicode_Check(to)); 1213 assert(PyUnicode_IS_READY(to)); 1214 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1215 1216 if (how_many == 0) 1217 return 0; 1218 1219 from_kind = PyUnicode_KIND(from); 1220 from_data = PyUnicode_DATA(from); 1221 to_kind = PyUnicode_KIND(to); 1222 to_data = PyUnicode_DATA(to); 1223 1224#ifdef Py_DEBUG 1225 if (!check_maxchar 1226 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1227 { 1228 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1229 Py_UCS4 ch; 1230 Py_ssize_t i; 1231 for (i=0; i < how_many; i++) { 1232 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1233 assert(ch <= to_maxchar); 1234 } 1235 } 1236#endif 1237 1238 if (from_kind == to_kind) { 1239 if (check_maxchar 1240 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1241 { 1242 /* Writing Latin-1 characters into an ASCII string requires to 1243 check that all written characters are pure ASCII */ 1244 Py_UCS4 max_char; 1245 max_char = ucs1lib_find_max_char(from_data, 1246 (Py_UCS1*)from_data + how_many); 1247 if (max_char >= 128) 1248 return -1; 1249 } 1250 Py_MEMCPY((char*)to_data + to_kind * to_start, 1251 (char*)from_data + from_kind * from_start, 1252 to_kind * how_many); 1253 } 1254 else if (from_kind == PyUnicode_1BYTE_KIND 1255 && to_kind == PyUnicode_2BYTE_KIND) 1256 { 1257 _PyUnicode_CONVERT_BYTES( 1258 Py_UCS1, Py_UCS2, 1259 PyUnicode_1BYTE_DATA(from) + from_start, 1260 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1261 PyUnicode_2BYTE_DATA(to) + to_start 1262 ); 1263 } 1264 else if (from_kind == PyUnicode_1BYTE_KIND 1265 && to_kind == PyUnicode_4BYTE_KIND) 1266 { 1267 _PyUnicode_CONVERT_BYTES( 1268 Py_UCS1, Py_UCS4, 1269 
PyUnicode_1BYTE_DATA(from) + from_start, 1270 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1271 PyUnicode_4BYTE_DATA(to) + to_start 1272 ); 1273 } 1274 else if (from_kind == PyUnicode_2BYTE_KIND 1275 && to_kind == PyUnicode_4BYTE_KIND) 1276 { 1277 _PyUnicode_CONVERT_BYTES( 1278 Py_UCS2, Py_UCS4, 1279 PyUnicode_2BYTE_DATA(from) + from_start, 1280 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1281 PyUnicode_4BYTE_DATA(to) + to_start 1282 ); 1283 } 1284 else { 1285 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1286 1287 if (!check_maxchar) { 1288 if (from_kind == PyUnicode_2BYTE_KIND 1289 && to_kind == PyUnicode_1BYTE_KIND) 1290 { 1291 _PyUnicode_CONVERT_BYTES( 1292 Py_UCS2, Py_UCS1, 1293 PyUnicode_2BYTE_DATA(from) + from_start, 1294 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1295 PyUnicode_1BYTE_DATA(to) + to_start 1296 ); 1297 } 1298 else if (from_kind == PyUnicode_4BYTE_KIND 1299 && to_kind == PyUnicode_1BYTE_KIND) 1300 { 1301 _PyUnicode_CONVERT_BYTES( 1302 Py_UCS4, Py_UCS1, 1303 PyUnicode_4BYTE_DATA(from) + from_start, 1304 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1305 PyUnicode_1BYTE_DATA(to) + to_start 1306 ); 1307 } 1308 else if (from_kind == PyUnicode_4BYTE_KIND 1309 && to_kind == PyUnicode_2BYTE_KIND) 1310 { 1311 _PyUnicode_CONVERT_BYTES( 1312 Py_UCS4, Py_UCS2, 1313 PyUnicode_4BYTE_DATA(from) + from_start, 1314 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1315 PyUnicode_2BYTE_DATA(to) + to_start 1316 ); 1317 } 1318 else { 1319 assert(0); 1320 return -1; 1321 } 1322 } 1323 else { 1324 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1325 Py_UCS4 ch; 1326 Py_ssize_t i; 1327 1328 for (i=0; i < how_many; i++) { 1329 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1330 if (ch > to_maxchar) 1331 return -1; 1332 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1333 } 1334 } 1335 } 1336 return 0; 1337} 1338 1339void 1340_PyUnicode_FastCopyCharacters( 1341 PyObject *to, Py_ssize_t 
to_start, 1342 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1343{ 1344 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1345} 1346 1347Py_ssize_t 1348PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1349 PyObject *from, Py_ssize_t from_start, 1350 Py_ssize_t how_many) 1351{ 1352 int err; 1353 1354 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1355 PyErr_BadInternalCall(); 1356 return -1; 1357 } 1358 1359 if (PyUnicode_READY(from) == -1) 1360 return -1; 1361 if (PyUnicode_READY(to) == -1) 1362 return -1; 1363 1364 if (from_start < 0) { 1365 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1366 return -1; 1367 } 1368 if (to_start < 0) { 1369 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1370 return -1; 1371 } 1372 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1373 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1374 PyErr_Format(PyExc_SystemError, 1375 "Cannot write %zi characters at %zi " 1376 "in a string of %zi characters", 1377 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1378 return -1; 1379 } 1380 1381 if (how_many == 0) 1382 return 0; 1383 1384 if (unicode_check_modifiable(to)) 1385 return -1; 1386 1387 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1388 if (err) { 1389 PyErr_Format(PyExc_SystemError, 1390 "Cannot copy %s characters " 1391 "into a string of %s characters", 1392 unicode_kind_name(from), 1393 unicode_kind_name(to)); 1394 return -1; 1395 } 1396 return how_many; 1397} 1398 1399/* Find the maximum code point and count the number of surrogate pairs so a 1400 correct string length can be computed before converting a string to UCS4. 1401 This function counts single surrogates as a character and not as a pair. 1402 1403 Return 0 on success, or -1 on error. 
*/ 1404static int 1405find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1406 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1407{ 1408 const wchar_t *iter; 1409 Py_UCS4 ch; 1410 1411 assert(num_surrogates != NULL && maxchar != NULL); 1412 *num_surrogates = 0; 1413 *maxchar = 0; 1414 1415 for (iter = begin; iter < end; ) { 1416#if SIZEOF_WCHAR_T == 2 1417 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1418 && (iter+1) < end 1419 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1420 { 1421 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1422 ++(*num_surrogates); 1423 iter += 2; 1424 } 1425 else 1426#endif 1427 { 1428 ch = *iter; 1429 iter++; 1430 } 1431 if (ch > *maxchar) { 1432 *maxchar = ch; 1433 if (*maxchar > MAX_UNICODE) { 1434 PyErr_Format(PyExc_ValueError, 1435 "character U+%x is not in range [U+0000; U+10ffff]", 1436 ch); 1437 return -1; 1438 } 1439 } 1440 } 1441 return 0; 1442} 1443 1444int 1445_PyUnicode_Ready(PyObject *unicode) 1446{ 1447 wchar_t *end; 1448 Py_UCS4 maxchar = 0; 1449 Py_ssize_t num_surrogates; 1450#if SIZEOF_WCHAR_T == 2 1451 Py_ssize_t length_wo_surrogates; 1452#endif 1453 1454 /* _PyUnicode_Ready() is only intended for old-style API usage where 1455 strings were created using _PyObject_New() and where no canonical 1456 representation (the str field) has been set yet aka strings 1457 which are not yet ready. 
*/ 1458 assert(_PyUnicode_CHECK(unicode)); 1459 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1460 assert(_PyUnicode_WSTR(unicode) != NULL); 1461 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1462 assert(_PyUnicode_UTF8(unicode) == NULL); 1463 /* Actually, it should neither be interned nor be anything else: */ 1464 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1465 1466 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1467 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1468 &maxchar, &num_surrogates) == -1) 1469 return -1; 1470 1471 if (maxchar < 256) { 1472 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1473 if (!_PyUnicode_DATA_ANY(unicode)) { 1474 PyErr_NoMemory(); 1475 return -1; 1476 } 1477 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1478 _PyUnicode_WSTR(unicode), end, 1479 PyUnicode_1BYTE_DATA(unicode)); 1480 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1481 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1482 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1483 if (maxchar < 128) { 1484 _PyUnicode_STATE(unicode).ascii = 1; 1485 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1486 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1487 } 1488 else { 1489 _PyUnicode_STATE(unicode).ascii = 0; 1490 _PyUnicode_UTF8(unicode) = NULL; 1491 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1492 } 1493 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1494 _PyUnicode_WSTR(unicode) = NULL; 1495 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1496 } 1497 /* In this case we might have to convert down from 4-byte native 1498 wchar_t to 2-byte unicode. */ 1499 else if (maxchar < 65536) { 1500 assert(num_surrogates == 0 && 1501 "FindMaxCharAndNumSurrogatePairs() messed up"); 1502 1503#if SIZEOF_WCHAR_T == 2 1504 /* We can share representations and are done. 
*/ 1505 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1506 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1507 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1508 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1509 _PyUnicode_UTF8(unicode) = NULL; 1510 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1511#else 1512 /* sizeof(wchar_t) == 4 */ 1513 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1514 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1515 if (!_PyUnicode_DATA_ANY(unicode)) { 1516 PyErr_NoMemory(); 1517 return -1; 1518 } 1519 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1520 _PyUnicode_WSTR(unicode), end, 1521 PyUnicode_2BYTE_DATA(unicode)); 1522 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1523 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1524 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1525 _PyUnicode_UTF8(unicode) = NULL; 1526 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1527 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1528 _PyUnicode_WSTR(unicode) = NULL; 1529 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1530#endif 1531 } 1532 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1533 else { 1534#if SIZEOF_WCHAR_T == 2 1535 /* in case the native representation is 2-bytes, we need to allocate a 1536 new normalized 4-byte version. 
*/ 1537 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1538 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1539 if (!_PyUnicode_DATA_ANY(unicode)) { 1540 PyErr_NoMemory(); 1541 return -1; 1542 } 1543 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1544 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1545 _PyUnicode_UTF8(unicode) = NULL; 1546 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1547 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1548 _PyUnicode_STATE(unicode).ready = 1; 1549 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1550 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1551 _PyUnicode_WSTR(unicode) = NULL; 1552 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1553#else 1554 assert(num_surrogates == 0); 1555 1556 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1557 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1558 _PyUnicode_UTF8(unicode) = NULL; 1559 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1560 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1561#endif 1562 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1563 } 1564 _PyUnicode_STATE(unicode).ready = 1; 1565 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1566 return 0; 1567} 1568 1569static void 1570unicode_dealloc(PyObject *unicode) 1571{ 1572 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1573 case SSTATE_NOT_INTERNED: 1574 break; 1575 1576 case SSTATE_INTERNED_MORTAL: 1577 /* revive dead object temporarily for DelItem */ 1578 Py_REFCNT(unicode) = 3; 1579 if (PyDict_DelItem(interned, unicode) != 0) 1580 Py_FatalError( 1581 "deletion of interned string failed"); 1582 break; 1583 1584 case SSTATE_INTERNED_IMMORTAL: 1585 Py_FatalError("Immortal interned string died."); 1586 1587 default: 1588 Py_FatalError("Inconsistent interned string state."); 1589 } 1590 1591 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1592 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1593 if 
(_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1594 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1595 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1596 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1597 1598 Py_TYPE(unicode)->tp_free(unicode); 1599} 1600 1601#ifdef Py_DEBUG 1602static int 1603unicode_is_singleton(PyObject *unicode) 1604{ 1605 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1606 if (unicode == unicode_empty) 1607 return 1; 1608 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1609 { 1610 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1611 if (ch < 256 && unicode_latin1[ch] == unicode) 1612 return 1; 1613 } 1614 return 0; 1615} 1616#endif 1617 1618static int 1619unicode_modifiable(PyObject *unicode) 1620{ 1621 assert(_PyUnicode_CHECK(unicode)); 1622 if (Py_REFCNT(unicode) != 1) 1623 return 0; 1624 if (_PyUnicode_HASH(unicode) != -1) 1625 return 0; 1626 if (PyUnicode_CHECK_INTERNED(unicode)) 1627 return 0; 1628 if (!PyUnicode_CheckExact(unicode)) 1629 return 0; 1630#ifdef Py_DEBUG 1631 /* singleton refcount is greater than 1 */ 1632 assert(!unicode_is_singleton(unicode)); 1633#endif 1634 return 1; 1635} 1636 1637static int 1638unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1639{ 1640 PyObject *unicode; 1641 Py_ssize_t old_length; 1642 1643 assert(p_unicode != NULL); 1644 unicode = *p_unicode; 1645 1646 assert(unicode != NULL); 1647 assert(PyUnicode_Check(unicode)); 1648 assert(0 <= length); 1649 1650 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1651 old_length = PyUnicode_WSTR_LENGTH(unicode); 1652 else 1653 old_length = PyUnicode_GET_LENGTH(unicode); 1654 if (old_length == length) 1655 return 0; 1656 1657 if (length == 0) { 1658 _Py_INCREF_UNICODE_EMPTY(); 1659 if (!unicode_empty) 1660 return -1; 1661 Py_DECREF(*p_unicode); 1662 *p_unicode = unicode_empty; 1663 return 0; 1664 } 1665 1666 if (!unicode_modifiable(unicode)) { 1667 PyObject *copy = resize_copy(unicode, length); 1668 if (copy == NULL) 1669 
return -1; 1670 Py_DECREF(*p_unicode); 1671 *p_unicode = copy; 1672 return 0; 1673 } 1674 1675 if (PyUnicode_IS_COMPACT(unicode)) { 1676 PyObject *new_unicode = resize_compact(unicode, length); 1677 if (new_unicode == NULL) 1678 return -1; 1679 *p_unicode = new_unicode; 1680 return 0; 1681 } 1682 return resize_inplace(unicode, length); 1683} 1684 1685int 1686PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1687{ 1688 PyObject *unicode; 1689 if (p_unicode == NULL) { 1690 PyErr_BadInternalCall(); 1691 return -1; 1692 } 1693 unicode = *p_unicode; 1694 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1695 { 1696 PyErr_BadInternalCall(); 1697 return -1; 1698 } 1699 return unicode_resize(p_unicode, length); 1700} 1701 1702/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1703 1704 WARNING: The function doesn't copy the terminating null character and 1705 doesn't check the maximum character (may write a latin1 character in an 1706 ASCII string). */ 1707static void 1708unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1709 const char *str, Py_ssize_t len) 1710{ 1711 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1712 void *data = PyUnicode_DATA(unicode); 1713 const char *end = str + len; 1714 1715 switch (kind) { 1716 case PyUnicode_1BYTE_KIND: { 1717 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1718#ifdef Py_DEBUG 1719 if (PyUnicode_IS_ASCII(unicode)) { 1720 Py_UCS4 maxchar = ucs1lib_find_max_char( 1721 (const Py_UCS1*)str, 1722 (const Py_UCS1*)str + len); 1723 assert(maxchar < 128); 1724 } 1725#endif 1726 memcpy((char *) data + index, str, len); 1727 break; 1728 } 1729 case PyUnicode_2BYTE_KIND: { 1730 Py_UCS2 *start = (Py_UCS2 *)data + index; 1731 Py_UCS2 *ucs2 = start; 1732 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1733 1734 for (; str < end; ++ucs2, ++str) 1735 *ucs2 = (Py_UCS2)*str; 1736 1737 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1738 break; 1739 } 1740 default: { 1741 Py_UCS4 
*start = (Py_UCS4 *)data + index; 1742 Py_UCS4 *ucs4 = start; 1743 assert(kind == PyUnicode_4BYTE_KIND); 1744 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1745 1746 for (; str < end; ++ucs4, ++str) 1747 *ucs4 = (Py_UCS4)*str; 1748 1749 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1750 } 1751 } 1752} 1753 1754static PyObject* 1755get_latin1_char(unsigned char ch) 1756{ 1757 PyObject *unicode = unicode_latin1[ch]; 1758 if (!unicode) { 1759 unicode = PyUnicode_New(1, ch); 1760 if (!unicode) 1761 return NULL; 1762 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1763 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1764 unicode_latin1[ch] = unicode; 1765 } 1766 Py_INCREF(unicode); 1767 return unicode; 1768} 1769 1770static PyObject* 1771unicode_char(Py_UCS4 ch) 1772{ 1773 PyObject *unicode; 1774 1775 assert(ch <= MAX_UNICODE); 1776 1777 if (ch < 256) 1778 return get_latin1_char(ch); 1779 1780 unicode = PyUnicode_New(1, ch); 1781 if (unicode == NULL) 1782 return NULL; 1783 switch (PyUnicode_KIND(unicode)) { 1784 case PyUnicode_1BYTE_KIND: 1785 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch; 1786 break; 1787 case PyUnicode_2BYTE_KIND: 1788 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; 1789 break; 1790 default: 1791 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1792 PyUnicode_4BYTE_DATA(unicode)[0] = ch; 1793 } 1794 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1795 return unicode; 1796} 1797 1798PyObject * 1799PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1800{ 1801 PyObject *unicode; 1802 Py_UCS4 maxchar = 0; 1803 Py_ssize_t num_surrogates; 1804 1805 if (u == NULL) 1806 return (PyObject*)_PyUnicode_New(size); 1807 1808 /* If the Unicode data is known at construction time, we can apply 1809 some optimizations which share commonly used objects. 
*/ 1810 1811 /* Optimization for empty strings */ 1812 if (size == 0) 1813 _Py_RETURN_UNICODE_EMPTY(); 1814 1815 /* Single character Unicode objects in the Latin-1 range are 1816 shared when using this constructor */ 1817 if (size == 1 && (Py_UCS4)*u < 256) 1818 return get_latin1_char((unsigned char)*u); 1819 1820 /* If not empty and not single character, copy the Unicode data 1821 into the new object */ 1822 if (find_maxchar_surrogates(u, u + size, 1823 &maxchar, &num_surrogates) == -1) 1824 return NULL; 1825 1826 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1827 if (!unicode) 1828 return NULL; 1829 1830 switch (PyUnicode_KIND(unicode)) { 1831 case PyUnicode_1BYTE_KIND: 1832 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1833 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1834 break; 1835 case PyUnicode_2BYTE_KIND: 1836#if Py_UNICODE_SIZE == 2 1837 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1838#else 1839 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1840 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1841#endif 1842 break; 1843 case PyUnicode_4BYTE_KIND: 1844#if SIZEOF_WCHAR_T == 2 1845 /* This is the only case which has to process surrogates, thus 1846 a simple copy loop is not enough and we need a function. 
*/ 1847 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1848#else 1849 assert(num_surrogates == 0); 1850 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1851#endif 1852 break; 1853 default: 1854 assert(0 && "Impossible state"); 1855 } 1856 1857 return unicode_result(unicode); 1858} 1859 1860PyObject * 1861PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1862{ 1863 if (size < 0) { 1864 PyErr_SetString(PyExc_SystemError, 1865 "Negative size passed to PyUnicode_FromStringAndSize"); 1866 return NULL; 1867 } 1868 if (u != NULL) 1869 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1870 else 1871 return (PyObject *)_PyUnicode_New(size); 1872} 1873 1874PyObject * 1875PyUnicode_FromString(const char *u) 1876{ 1877 size_t size = strlen(u); 1878 if (size > PY_SSIZE_T_MAX) { 1879 PyErr_SetString(PyExc_OverflowError, "input too long"); 1880 return NULL; 1881 } 1882 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1883} 1884 1885PyObject * 1886_PyUnicode_FromId(_Py_Identifier *id) 1887{ 1888 if (!id->object) { 1889 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1890 strlen(id->string), 1891 NULL, NULL); 1892 if (!id->object) 1893 return NULL; 1894 PyUnicode_InternInPlace(&id->object); 1895 assert(!id->next); 1896 id->next = static_strings; 1897 static_strings = id; 1898 } 1899 return id->object; 1900} 1901 1902void 1903_PyUnicode_ClearStaticStrings() 1904{ 1905 _Py_Identifier *tmp, *s = static_strings; 1906 while (s) { 1907 Py_CLEAR(s->object); 1908 tmp = s->next; 1909 s->next = NULL; 1910 s = tmp; 1911 } 1912 static_strings = NULL; 1913} 1914 1915/* Internal function, doesn't check maximum character */ 1916 1917PyObject* 1918_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1919{ 1920 const unsigned char *s = (const unsigned char *)buffer; 1921 PyObject *unicode; 1922 if (size == 1) { 1923#ifdef Py_DEBUG 1924 assert((unsigned char)s[0] < 128); 1925#endif 1926 return get_latin1_char(s[0]); 1927 } 1928 unicode = 
PyUnicode_New(size, 127); 1929 if (!unicode) 1930 return NULL; 1931 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1932 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1933 return unicode; 1934} 1935 1936static Py_UCS4 1937kind_maxchar_limit(unsigned int kind) 1938{ 1939 switch (kind) { 1940 case PyUnicode_1BYTE_KIND: 1941 return 0x80; 1942 case PyUnicode_2BYTE_KIND: 1943 return 0x100; 1944 case PyUnicode_4BYTE_KIND: 1945 return 0x10000; 1946 default: 1947 assert(0 && "invalid kind"); 1948 return MAX_UNICODE; 1949 } 1950} 1951 1952Py_LOCAL_INLINE(Py_UCS4) 1953align_maxchar(Py_UCS4 maxchar) 1954{ 1955 if (maxchar <= 127) 1956 return 127; 1957 else if (maxchar <= 255) 1958 return 255; 1959 else if (maxchar <= 65535) 1960 return 65535; 1961 else 1962 return MAX_UNICODE; 1963} 1964 1965static PyObject* 1966_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) 1967{ 1968 PyObject *res; 1969 unsigned char max_char; 1970 1971 if (size == 0) 1972 _Py_RETURN_UNICODE_EMPTY(); 1973 assert(size > 0); 1974 if (size == 1) 1975 return get_latin1_char(u[0]); 1976 1977 max_char = ucs1lib_find_max_char(u, u + size); 1978 res = PyUnicode_New(size, max_char); 1979 if (!res) 1980 return NULL; 1981 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1982 assert(_PyUnicode_CheckConsistency(res, 1)); 1983 return res; 1984} 1985 1986static PyObject* 1987_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1988{ 1989 PyObject *res; 1990 Py_UCS2 max_char; 1991 1992 if (size == 0) 1993 _Py_RETURN_UNICODE_EMPTY(); 1994 assert(size > 0); 1995 if (size == 1) 1996 return unicode_char(u[0]); 1997 1998 max_char = ucs2lib_find_max_char(u, u + size); 1999 res = PyUnicode_New(size, max_char); 2000 if (!res) 2001 return NULL; 2002 if (max_char >= 256) 2003 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 2004 else { 2005 _PyUnicode_CONVERT_BYTES( 2006 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 2007 } 2008 assert(_PyUnicode_CheckConsistency(res, 1)); 2009 return res; 2010} 2011 
2012static PyObject* 2013_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 2014{ 2015 PyObject *res; 2016 Py_UCS4 max_char; 2017 2018 if (size == 0) 2019 _Py_RETURN_UNICODE_EMPTY(); 2020 assert(size > 0); 2021 if (size == 1) 2022 return unicode_char(u[0]); 2023 2024 max_char = ucs4lib_find_max_char(u, u + size); 2025 res = PyUnicode_New(size, max_char); 2026 if (!res) 2027 return NULL; 2028 if (max_char < 256) 2029 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2030 PyUnicode_1BYTE_DATA(res)); 2031 else if (max_char < 0x10000) 2032 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2033 PyUnicode_2BYTE_DATA(res)); 2034 else 2035 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2036 assert(_PyUnicode_CheckConsistency(res, 1)); 2037 return res; 2038} 2039 2040PyObject* 2041PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2042{ 2043 if (size < 0) { 2044 PyErr_SetString(PyExc_ValueError, "size must be positive"); 2045 return NULL; 2046 } 2047 switch (kind) { 2048 case PyUnicode_1BYTE_KIND: 2049 return _PyUnicode_FromUCS1(buffer, size); 2050 case PyUnicode_2BYTE_KIND: 2051 return _PyUnicode_FromUCS2(buffer, size); 2052 case PyUnicode_4BYTE_KIND: 2053 return _PyUnicode_FromUCS4(buffer, size); 2054 default: 2055 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2056 return NULL; 2057 } 2058} 2059 2060Py_UCS4 2061_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2062{ 2063 enum PyUnicode_Kind kind; 2064 void *startptr, *endptr; 2065 2066 assert(PyUnicode_IS_READY(unicode)); 2067 assert(0 <= start); 2068 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2069 assert(start <= end); 2070 2071 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2072 return PyUnicode_MAX_CHAR_VALUE(unicode); 2073 2074 if (start == end) 2075 return 127; 2076 2077 if (PyUnicode_IS_ASCII(unicode)) 2078 return 127; 2079 2080 kind = PyUnicode_KIND(unicode); 2081 startptr = PyUnicode_DATA(unicode); 2082 endptr = (char 
*)startptr + end * kind; 2083 startptr = (char *)startptr + start * kind; 2084 switch(kind) { 2085 case PyUnicode_1BYTE_KIND: 2086 return ucs1lib_find_max_char(startptr, endptr); 2087 case PyUnicode_2BYTE_KIND: 2088 return ucs2lib_find_max_char(startptr, endptr); 2089 case PyUnicode_4BYTE_KIND: 2090 return ucs4lib_find_max_char(startptr, endptr); 2091 default: 2092 assert(0); 2093 return 0; 2094 } 2095} 2096 2097/* Ensure that a string uses the most efficient storage, if it is not the 2098 case: create a new string with of the right kind. Write NULL into *p_unicode 2099 on error. */ 2100static void 2101unicode_adjust_maxchar(PyObject **p_unicode) 2102{ 2103 PyObject *unicode, *copy; 2104 Py_UCS4 max_char; 2105 Py_ssize_t len; 2106 unsigned int kind; 2107 2108 assert(p_unicode != NULL); 2109 unicode = *p_unicode; 2110 assert(PyUnicode_IS_READY(unicode)); 2111 if (PyUnicode_IS_ASCII(unicode)) 2112 return; 2113 2114 len = PyUnicode_GET_LENGTH(unicode); 2115 kind = PyUnicode_KIND(unicode); 2116 if (kind == PyUnicode_1BYTE_KIND) { 2117 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2118 max_char = ucs1lib_find_max_char(u, u + len); 2119 if (max_char >= 128) 2120 return; 2121 } 2122 else if (kind == PyUnicode_2BYTE_KIND) { 2123 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2124 max_char = ucs2lib_find_max_char(u, u + len); 2125 if (max_char >= 256) 2126 return; 2127 } 2128 else { 2129 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2130 assert(kind == PyUnicode_4BYTE_KIND); 2131 max_char = ucs4lib_find_max_char(u, u + len); 2132 if (max_char >= 0x10000) 2133 return; 2134 } 2135 copy = PyUnicode_New(len, max_char); 2136 if (copy != NULL) 2137 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2138 Py_DECREF(unicode); 2139 *p_unicode = copy; 2140} 2141 2142PyObject* 2143_PyUnicode_Copy(PyObject *unicode) 2144{ 2145 Py_ssize_t length; 2146 PyObject *copy; 2147 2148 if (!PyUnicode_Check(unicode)) { 2149 PyErr_BadInternalCall(); 2150 return NULL; 2151 } 2152 if 
(PyUnicode_READY(unicode) == -1) 2153 return NULL; 2154 2155 length = PyUnicode_GET_LENGTH(unicode); 2156 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2157 if (!copy) 2158 return NULL; 2159 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2160 2161 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2162 length * PyUnicode_KIND(unicode)); 2163 assert(_PyUnicode_CheckConsistency(copy, 1)); 2164 return copy; 2165} 2166 2167 2168/* Widen Unicode objects to larger buffers. Don't write terminating null 2169 character. Return NULL on error. */ 2170 2171void* 2172_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2173{ 2174 Py_ssize_t len; 2175 void *result; 2176 unsigned int skind; 2177 2178 if (PyUnicode_READY(s) == -1) 2179 return NULL; 2180 2181 len = PyUnicode_GET_LENGTH(s); 2182 skind = PyUnicode_KIND(s); 2183 if (skind >= kind) { 2184 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2185 return NULL; 2186 } 2187 switch (kind) { 2188 case PyUnicode_2BYTE_KIND: 2189 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 2190 if (!result) 2191 return PyErr_NoMemory(); 2192 assert(skind == PyUnicode_1BYTE_KIND); 2193 _PyUnicode_CONVERT_BYTES( 2194 Py_UCS1, Py_UCS2, 2195 PyUnicode_1BYTE_DATA(s), 2196 PyUnicode_1BYTE_DATA(s) + len, 2197 result); 2198 return result; 2199 case PyUnicode_4BYTE_KIND: 2200 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 2201 if (!result) 2202 return PyErr_NoMemory(); 2203 if (skind == PyUnicode_2BYTE_KIND) { 2204 _PyUnicode_CONVERT_BYTES( 2205 Py_UCS2, Py_UCS4, 2206 PyUnicode_2BYTE_DATA(s), 2207 PyUnicode_2BYTE_DATA(s) + len, 2208 result); 2209 } 2210 else { 2211 assert(skind == PyUnicode_1BYTE_KIND); 2212 _PyUnicode_CONVERT_BYTES( 2213 Py_UCS1, Py_UCS4, 2214 PyUnicode_1BYTE_DATA(s), 2215 PyUnicode_1BYTE_DATA(s) + len, 2216 result); 2217 } 2218 return result; 2219 default: 2220 break; 2221 } 2222 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2223 return NULL; 2224} 2225 2226static Py_UCS4* 
2227as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2228 int copy_null) 2229{ 2230 int kind; 2231 void *data; 2232 Py_ssize_t len, targetlen; 2233 if (PyUnicode_READY(string) == -1) 2234 return NULL; 2235 kind = PyUnicode_KIND(string); 2236 data = PyUnicode_DATA(string); 2237 len = PyUnicode_GET_LENGTH(string); 2238 targetlen = len; 2239 if (copy_null) 2240 targetlen++; 2241 if (!target) { 2242 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UCS4) < targetlen) { 2243 PyErr_NoMemory(); 2244 return NULL; 2245 } 2246 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 2247 if (!target) { 2248 PyErr_NoMemory(); 2249 return NULL; 2250 } 2251 } 2252 else { 2253 if (targetsize < targetlen) { 2254 PyErr_Format(PyExc_SystemError, 2255 "string is longer than the buffer"); 2256 if (copy_null && 0 < targetsize) 2257 target[0] = 0; 2258 return NULL; 2259 } 2260 } 2261 if (kind == PyUnicode_1BYTE_KIND) { 2262 Py_UCS1 *start = (Py_UCS1 *) data; 2263 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2264 } 2265 else if (kind == PyUnicode_2BYTE_KIND) { 2266 Py_UCS2 *start = (Py_UCS2 *) data; 2267 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2268 } 2269 else { 2270 assert(kind == PyUnicode_4BYTE_KIND); 2271 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2272 } 2273 if (copy_null) 2274 target[len] = 0; 2275 return target; 2276} 2277 2278Py_UCS4* 2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2280 int copy_null) 2281{ 2282 if (target == NULL || targetsize < 0) { 2283 PyErr_BadInternalCall(); 2284 return NULL; 2285 } 2286 return as_ucs4(string, target, targetsize, copy_null); 2287} 2288 2289Py_UCS4* 2290PyUnicode_AsUCS4Copy(PyObject *string) 2291{ 2292 return as_ucs4(string, NULL, 0, 1); 2293} 2294 2295#ifdef HAVE_WCHAR_H 2296 2297PyObject * 2298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size) 2299{ 2300 if (w == NULL) { 2301 if (size == 0) 2302 _Py_RETURN_UNICODE_EMPTY(); 2303 
PyErr_BadInternalCall(); 2304 return NULL; 2305 } 2306 2307 if (size == -1) { 2308 size = wcslen(w); 2309 } 2310 2311 return PyUnicode_FromUnicode(w, size); 2312} 2313 2314#endif /* HAVE_WCHAR_H */ 2315 2316static void 2317makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2318 char c) 2319{ 2320 *fmt++ = '%'; 2321 if (longflag) 2322 *fmt++ = 'l'; 2323 else if (longlongflag) { 2324 /* longlongflag should only ever be nonzero on machines with 2325 HAVE_LONG_LONG defined */ 2326#ifdef HAVE_LONG_LONG 2327 char *f = PY_FORMAT_LONG_LONG; 2328 while (*f) 2329 *fmt++ = *f++; 2330#else 2331 /* we shouldn't ever get here */ 2332 assert(0); 2333 *fmt++ = 'l'; 2334#endif 2335 } 2336 else if (size_tflag) { 2337 char *f = PY_FORMAT_SIZE_T; 2338 while (*f) 2339 *fmt++ = *f++; 2340 } 2341 *fmt++ = c; 2342 *fmt = '\0'; 2343} 2344 2345/* maximum number of characters required for output of %lld or %p. 2346 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2347 plus 1 for the sign. 53/22 is an upper bound for log10(256). 
*/ 2348#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2349 2350static int 2351unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, 2352 Py_ssize_t width, Py_ssize_t precision) 2353{ 2354 Py_ssize_t length, fill, arglen; 2355 Py_UCS4 maxchar; 2356 2357 if (PyUnicode_READY(str) == -1) 2358 return -1; 2359 2360 length = PyUnicode_GET_LENGTH(str); 2361 if ((precision == -1 || precision >= length) 2362 && width <= length) 2363 return _PyUnicodeWriter_WriteStr(writer, str); 2364 2365 if (precision != -1) 2366 length = Py_MIN(precision, length); 2367 2368 arglen = Py_MAX(length, width); 2369 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 2370 maxchar = _PyUnicode_FindMaxChar(str, 0, length); 2371 else 2372 maxchar = writer->maxchar; 2373 2374 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) 2375 return -1; 2376 2377 if (width > length) { 2378 fill = width - length; 2379 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) 2380 return -1; 2381 writer->pos += fill; 2382 } 2383 2384 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 2385 str, 0, length); 2386 writer->pos += length; 2387 return 0; 2388} 2389 2390static int 2391unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, 2392 Py_ssize_t width, Py_ssize_t precision) 2393{ 2394 /* UTF-8 */ 2395 Py_ssize_t length; 2396 PyObject *unicode; 2397 int res; 2398 2399 length = strlen(str); 2400 if (precision != -1) 2401 length = Py_MIN(length, precision); 2402 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); 2403 if (unicode == NULL) 2404 return -1; 2405 2406 res = unicode_fromformat_write_str(writer, unicode, width, -1); 2407 Py_DECREF(unicode); 2408 return res; 2409} 2410 2411static const char* 2412unicode_fromformat_arg(_PyUnicodeWriter *writer, 2413 const char *f, va_list *vargs) 2414{ 2415 const char *p; 2416 Py_ssize_t len; 2417 int zeropad; 2418 Py_ssize_t width; 2419 Py_ssize_t precision; 2420 int longflag; 
/* Process one '%' directive of a PyUnicode_FromFormatV() format string.

   writer -- output writer that receives the formatted text.
   f      -- points at the '%' that starts the directive.
   vargs  -- pointer to the argument list; arguments are consumed from it
             according to the conversion.

   The directive is parsed as  %[0][width][.precision][l|ll|z]<conv>.
   The length modifiers are only recognised when directly followed by
   'd', 'u' or 'i'.

   Return a pointer to the first character after the consumed directive,
   or NULL on error.  For an unknown conversion the rest of the format
   string, starting at the '%', is copied verbatim to the output and a
   pointer to the end of the format string is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember where the directive starts: needed by the default case */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* check before multiplying so that width cannot overflow */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow guard as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* this directive ends the format string: no further output follows,
       so stop over-allocating the writer buffer */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single code point, passed as int */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char fmt[10]; /* should be enough for "%0lld\0" */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* The number is first formatted by the C library without width
           or precision; padding is applied afterwards, below. */
        if (*f == 'u') {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* %x always reads a plain int: no length modifier applies */
            makefmt(fmt, 0, 0, 0, 'x');
            len = sprintf(buffer, fmt, va_arg(*vargs, int));
        }
        else {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* from here on, precision is the minimum number of characters
           taken by the number itself (also covers precision == -1) */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* left padding up to width: '0' if the "0" flag was given,
           spaces otherwise */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* zeros needed to reach the requested precision */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined: ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* no prefix at all: shift the digits (and the terminating
               null character) right by two and insert "0x" */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* a Unicode object, written as is */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* consumes two arguments: a Unicode object (may be NULL) and a C
           string that is used when the object is NULL */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* result of PyObject_Str() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* result of PyObject_Repr() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* result of PyObject_ASCII() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* literal percent sign: no argument is consumed */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* skip the conversion character */
    f++;
    return f;
}
*/ 2734 Py_VA_COPY(vargs2, vargs); 2735 2736 for (f = format; *f; ) { 2737 if (*f == '%') { 2738 f = unicode_fromformat_arg(&writer, f, &vargs2); 2739 if (f == NULL) 2740 goto fail; 2741 } 2742 else { 2743 const char *p; 2744 Py_ssize_t len; 2745 2746 p = f; 2747 do 2748 { 2749 if ((unsigned char)*p > 127) { 2750 PyErr_Format(PyExc_ValueError, 2751 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2752 "string, got a non-ASCII byte: 0x%02x", 2753 (unsigned char)*p); 2754 return NULL; 2755 } 2756 p++; 2757 } 2758 while (*p != '\0' && *p != '%'); 2759 len = p - f; 2760 2761 if (*p == '\0') 2762 writer.overallocate = 0; 2763 2764 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) 2765 goto fail; 2766 2767 f = p; 2768 } 2769 } 2770 return _PyUnicodeWriter_Finish(&writer); 2771 2772 fail: 2773 _PyUnicodeWriter_Dealloc(&writer); 2774 return NULL; 2775} 2776 2777PyObject * 2778PyUnicode_FromFormat(const char *format, ...) 2779{ 2780 PyObject* ret; 2781 va_list vargs; 2782 2783#ifdef HAVE_STDARG_PROTOTYPES 2784 va_start(vargs, format); 2785#else 2786 va_start(vargs); 2787#endif 2788 ret = PyUnicode_FromFormatV(format, vargs); 2789 va_end(vargs); 2790 return ret; 2791} 2792 2793#ifdef HAVE_WCHAR_H 2794 2795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2796 convert a Unicode object to a wide character string. 2797 2798 - If w is NULL: return the number of wide characters (including the null 2799 character) required to convert the unicode object. Ignore size argument. 2800 2801 - Otherwise: return the number of wide characters (excluding the null 2802 character) written into w. Write at most size wide characters (including 2803 the null character). 
*/ 2804static Py_ssize_t 2805unicode_aswidechar(PyObject *unicode, 2806 wchar_t *w, 2807 Py_ssize_t size) 2808{ 2809 Py_ssize_t res; 2810 const wchar_t *wstr; 2811 2812 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2813 if (wstr == NULL) 2814 return -1; 2815 2816 if (w != NULL) { 2817 if (size > res) 2818 size = res + 1; 2819 else 2820 res = size; 2821 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2822 return res; 2823 } 2824 else 2825 return res + 1; 2826} 2827 2828Py_ssize_t 2829PyUnicode_AsWideChar(PyObject *unicode, 2830 wchar_t *w, 2831 Py_ssize_t size) 2832{ 2833 if (unicode == NULL) { 2834 PyErr_BadInternalCall(); 2835 return -1; 2836 } 2837 return unicode_aswidechar(unicode, w, size); 2838} 2839 2840wchar_t* 2841PyUnicode_AsWideCharString(PyObject *unicode, 2842 Py_ssize_t *size) 2843{ 2844 wchar_t* buffer; 2845 Py_ssize_t buflen; 2846 2847 if (unicode == NULL) { 2848 PyErr_BadInternalCall(); 2849 return NULL; 2850 } 2851 2852 buflen = unicode_aswidechar(unicode, NULL, 0); 2853 if (buflen == -1) 2854 return NULL; 2855 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < buflen) { 2856 PyErr_NoMemory(); 2857 return NULL; 2858 } 2859 2860 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2861 if (buffer == NULL) { 2862 PyErr_NoMemory(); 2863 return NULL; 2864 } 2865 buflen = unicode_aswidechar(unicode, buffer, buflen); 2866 if (buflen == -1) { 2867 PyMem_FREE(buffer); 2868 return NULL; 2869 } 2870 if (size != NULL) 2871 *size = buflen; 2872 return buffer; 2873} 2874 2875#endif /* HAVE_WCHAR_H */ 2876 2877PyObject * 2878PyUnicode_FromOrdinal(int ordinal) 2879{ 2880 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2881 PyErr_SetString(PyExc_ValueError, 2882 "chr() arg not in range(0x110000)"); 2883 return NULL; 2884 } 2885 2886 return unicode_char((Py_UCS4)ordinal); 2887} 2888 2889PyObject * 2890PyUnicode_FromObject(PyObject *obj) 2891{ 2892 /* XXX Perhaps we should make this API an alias of 2893 PyObject_Str() instead ?! 
*/ 2894 if (PyUnicode_CheckExact(obj)) { 2895 if (PyUnicode_READY(obj) == -1) 2896 return NULL; 2897 Py_INCREF(obj); 2898 return obj; 2899 } 2900 if (PyUnicode_Check(obj)) { 2901 /* For a Unicode subtype that's not a Unicode object, 2902 return a true Unicode object with the same data. */ 2903 return _PyUnicode_Copy(obj); 2904 } 2905 PyErr_Format(PyExc_TypeError, 2906 "Can't convert '%.100s' object to str implicitly", 2907 Py_TYPE(obj)->tp_name); 2908 return NULL; 2909} 2910 2911PyObject * 2912PyUnicode_FromEncodedObject(PyObject *obj, 2913 const char *encoding, 2914 const char *errors) 2915{ 2916 Py_buffer buffer; 2917 PyObject *v; 2918 2919 if (obj == NULL) { 2920 PyErr_BadInternalCall(); 2921 return NULL; 2922 } 2923 2924 /* Decoding bytes objects is the most common case and should be fast */ 2925 if (PyBytes_Check(obj)) { 2926 if (PyBytes_GET_SIZE(obj) == 0) 2927 _Py_RETURN_UNICODE_EMPTY(); 2928 v = PyUnicode_Decode( 2929 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2930 encoding, errors); 2931 return v; 2932 } 2933 2934 if (PyUnicode_Check(obj)) { 2935 PyErr_SetString(PyExc_TypeError, 2936 "decoding str is not supported"); 2937 return NULL; 2938 } 2939 2940 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2941 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2942 PyErr_Format(PyExc_TypeError, 2943 "coercing to str: need bytes, bytearray " 2944 "or buffer-like object, %.80s found", 2945 Py_TYPE(obj)->tp_name); 2946 return NULL; 2947 } 2948 2949 if (buffer.len == 0) { 2950 PyBuffer_Release(&buffer); 2951 _Py_RETURN_UNICODE_EMPTY(); 2952 } 2953 2954 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2955 PyBuffer_Release(&buffer); 2956 return v; 2957} 2958 2959/* Convert encoding to lower case and replace '_' with '-' in order to 2960 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2961 1 on success. 
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding  -- encoding name, or NULL which is normalized to "utf-8".
   lower     -- output buffer receiving the null-terminated result.
   lower_len -- size of the output buffer in bytes, including the room
                for the terminating null character.

   Return 1 on success, 0 on error (the normalized name is longer than
   lower_len-1 characters, or the buffer is empty).  On error the content
   of lower is unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* An empty buffer cannot even hold the terminator.  Without this
       check, lower_len - 1 below would wrap around and the bound test in
       the loop would never trigger. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* last writable position: reserved for the terminating null char */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
PyMemoryView_FromBuffer(&info); 3036 if (buffer == NULL) 3037 goto onError; 3038 unicode = _PyCodec_DecodeText(buffer, encoding, errors); 3039 if (unicode == NULL) 3040 goto onError; 3041 if (!PyUnicode_Check(unicode)) { 3042 PyErr_Format(PyExc_TypeError, 3043 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3044 "use codecs.decode() to decode to arbitrary types", 3045 encoding, 3046 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3047 Py_DECREF(unicode); 3048 goto onError; 3049 } 3050 Py_DECREF(buffer); 3051 return unicode_result(unicode); 3052 3053 onError: 3054 Py_XDECREF(buffer); 3055 return NULL; 3056} 3057 3058PyObject * 3059PyUnicode_AsDecodedObject(PyObject *unicode, 3060 const char *encoding, 3061 const char *errors) 3062{ 3063 PyObject *v; 3064 3065 if (!PyUnicode_Check(unicode)) { 3066 PyErr_BadArgument(); 3067 goto onError; 3068 } 3069 3070 if (encoding == NULL) 3071 encoding = PyUnicode_GetDefaultEncoding(); 3072 3073 /* Decode via the codec registry */ 3074 v = PyCodec_Decode(unicode, encoding, errors); 3075 if (v == NULL) 3076 goto onError; 3077 return unicode_result(v); 3078 3079 onError: 3080 return NULL; 3081} 3082 3083PyObject * 3084PyUnicode_AsDecodedUnicode(PyObject *unicode, 3085 const char *encoding, 3086 const char *errors) 3087{ 3088 PyObject *v; 3089 3090 if (!PyUnicode_Check(unicode)) { 3091 PyErr_BadArgument(); 3092 goto onError; 3093 } 3094 3095 if (encoding == NULL) 3096 encoding = PyUnicode_GetDefaultEncoding(); 3097 3098 /* Decode via the codec registry */ 3099 v = PyCodec_Decode(unicode, encoding, errors); 3100 if (v == NULL) 3101 goto onError; 3102 if (!PyUnicode_Check(v)) { 3103 PyErr_Format(PyExc_TypeError, 3104 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3105 "use codecs.decode() to decode to arbitrary types", 3106 encoding, 3107 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3108 Py_DECREF(v); 3109 goto onError; 3110 } 3111 return unicode_result(v); 3112 3113 onError: 3114 return NULL; 3115} 
/* Locate the first wide character of wstr that wcstombs() cannot encode
   by converting the string one character at a time (one surrogate pair at
   a time when wchar_t is 16 bits wide).

   Return the index of that character in wstr, or 0 if every character
   could be encoded. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* unit: the single character under test, always null-terminated */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *mark = cursor;

        unit[0] = *cursor++;
#if SIZEOF_WCHAR_T == 2
        /* keep a surrogate pair together */
        if (Py_UNICODE_IS_HIGH_SURROGATE(mark[0])
            && Py_UNICODE_IS_LOW_SURROGATE(mark[1]))
            unit[1] = *cursor++;
        else
            unit[1] = 0;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return mark - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
/* Encode a Unicode object to bytes using the current locale encoding.

   unicode -- the object to encode.
   errors  -- error handler name; only the values accepted by
              locale_error_handler() are supported.

   Return a new bytes object, or NULL with an exception set.  ValueError
   is raised if the string contains an embedded null character; a
   character that cannot be encoded is reported through a
   UnicodeEncodeError. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason = NULL;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* the C functions used below stop at the first null character:
       refuse strings that would be silently truncated */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = Py_EncodeLocale(wstr, &error_pos);
        if (str == NULL) {
            /* error_pos == (size_t)-1 is treated as a memory error; any
               other value is the index of the offending character */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* first pass: compute the size of the encoded string */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            /* position unknown: it is searched at encode_error */
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* second pass: encode into the bytes object (len+1 leaves room
           for the terminating null byte) */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* wstr is still owned here; bytes may or may not have been created */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    /* build the reason from the C library error message, decoded from the
       locale encoding; fall back to a fixed message if that fails */
    if (errmsg != NULL) {
        size_t errlen;
        wstr = Py_DecodeLocale(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    /* UnicodeEncodeError("locale", unicode, error_pos, error_pos+1, reason) */
    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* NOTE(review): the strict error handler presumably sets exc as
           the current exception -- confirm against PyCodec_StrictErrors() */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
*/ 3356 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3357 return PyUnicode_AsEncodedString(unicode, 3358 Py_FileSystemDefaultEncoding, 3359 "surrogateescape"); 3360 } 3361 else { 3362 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3363 } 3364#endif 3365} 3366 3367PyObject * 3368PyUnicode_AsEncodedString(PyObject *unicode, 3369 const char *encoding, 3370 const char *errors) 3371{ 3372 PyObject *v; 3373 char lower[11]; /* Enough for any encoding shortcut */ 3374 3375 if (!PyUnicode_Check(unicode)) { 3376 PyErr_BadArgument(); 3377 return NULL; 3378 } 3379 3380 /* Shortcuts for common default encodings */ 3381 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 3382 if ((strcmp(lower, "utf-8") == 0) || 3383 (strcmp(lower, "utf8") == 0)) 3384 { 3385 if (errors == NULL || strcmp(errors, "strict") == 0) 3386 return _PyUnicode_AsUTF8String(unicode, NULL); 3387 else 3388 return _PyUnicode_AsUTF8String(unicode, errors); 3389 } 3390 else if ((strcmp(lower, "latin-1") == 0) || 3391 (strcmp(lower, "latin1") == 0) || 3392 (strcmp(lower, "iso-8859-1") == 0) || 3393 (strcmp(lower, "iso8859-1") == 0)) 3394 return _PyUnicode_AsLatin1String(unicode, errors); 3395#ifdef HAVE_MBCS 3396 else if (strcmp(lower, "mbcs") == 0) 3397 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3398#endif 3399 else if (strcmp(lower, "ascii") == 0) 3400 return _PyUnicode_AsASCIIString(unicode, errors); 3401 } 3402 3403 /* Encode via the codec registry */ 3404 v = _PyCodec_EncodeText(unicode, encoding, errors); 3405 if (v == NULL) 3406 return NULL; 3407 3408 /* The normal path */ 3409 if (PyBytes_Check(v)) 3410 return v; 3411 3412 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3413 if (PyByteArray_Check(v)) { 3414 int error; 3415 PyObject *b; 3416 3417 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3418 "encoder %s returned bytearray instead of bytes; " 3419 "use codecs.encode() to encode to arbitrary types", 3420 encoding); 
/* Locate the first byte sequence of str (len bytes) that cannot be
   decoded from the locale encoding.

   Return the byte offset of that sequence, or 0 if none was found.  When
   HAVE_MBRTOWC is not defined the function always returns 0. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor;
    size_t remaining;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    for (cursor = str, remaining = len; remaining != 0; ) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        /* Reached end of string */
        if (consumed == 0)
            break;
        /* Conversion error or incomplete character */
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            return cursor - str;

        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
/* Decode a byte string from the current locale encoding.

   str    -- the bytes to decode; str[len] must be a null byte and the
             string must not contain any other null byte.
   len    -- number of bytes in str.
   errors -- error handler name; only the values accepted by
             locale_error_handler() are supported.

   Return a new Unicode object, or NULL with an exception set.
   ValueError is raised for an embedded null byte; an undecodable
   sequence in strict mode is reported through a UnicodeDecodeError. */
PyObject*
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                              const char *errors)
{
    /* small strings are decoded into this stack buffer to avoid a heap
       allocation */
    wchar_t smallbuf[256];
    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
    wchar_t *wstr;
    size_t wlen, wlen2;
    PyObject *unicode;
    int surrogateescape;
    size_t error_pos;
    char *errmsg;
    PyObject *reason, *exc;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    /* the C functions used below work on null-terminated strings */
    if (str[len] != '\0' || (size_t)len != strlen(str)) {
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        wstr = Py_DecodeLocale(str, &wlen);
        if (wstr == NULL) {
            /* wlen == (size_t)-1 is treated as a memory error, anything
               else as an OS error described by errno */
            if (wlen == (size_t)-1)
                PyErr_NoMemory();
            else
                PyErr_SetFromErrno(PyExc_OSError);
            return NULL;
        }

        unicode = PyUnicode_FromWideChar(wstr, wlen);
        PyMem_RawFree(wstr);
    }
    else {
        /* strict mode */
        /* first pass: number of wide characters needed */
#ifndef HAVE_BROKEN_MBSTOWCS
        wlen = mbstowcs(NULL, str, 0);
#else
        wlen = len;
#endif
        if (wlen == (size_t)-1)
            goto decode_error;
        if (wlen+1 <= smallbuf_len) {
            wstr = smallbuf;
        }
        else {
            /* make sure (wlen+1) * sizeof(wchar_t) cannot overflow */
            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
                return PyErr_NoMemory();

            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
            if (!wstr)
                return PyErr_NoMemory();
        }

        /* second pass: the actual conversion */
        wlen2 = mbstowcs(wstr, str, wlen+1);
        if (wlen2 == (size_t)-1) {
            if (wstr != smallbuf)
                PyMem_Free(wstr);
            goto decode_error;
        }
#ifdef HAVE_BROKEN_MBSTOWCS
        assert(wlen2 == wlen);
#endif
        unicode = PyUnicode_FromWideChar(wstr, wlen2);
        if (wstr != smallbuf)
            PyMem_Free(wstr);
    }
    return unicode;

decode_error:
    /* no buffer is owned when this label is reached */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    error_pos = mbstowcs_errorpos(str, len);
    /* build the reason from the C library error message, decoded from the
       locale encoding; fall back to a fixed message if that fails */
    if (errmsg != NULL) {
        size_t errlen;
        wstr = Py_DecodeLocale(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "mbstowcs() encountered an invalid multibyte sequence");
    if (reason == NULL)
        return NULL;

    /* UnicodeDecodeError("locale", str[:len], error_pos, error_pos+1, reason) */
    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
                                "locale", str, len,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* NOTE(review): the strict error handler presumably sets exc as
           the current exception -- confirm against PyCodec_StrictErrors() */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
if (wstr != NULL) { 3586 reason = PyUnicode_FromWideChar(wstr, errlen); 3587 PyMem_RawFree(wstr); 3588 } else 3589 errmsg = NULL; 3590 } 3591 if (errmsg == NULL) 3592 reason = PyUnicode_FromString( 3593 "mbstowcs() encountered an invalid multibyte sequence"); 3594 if (reason == NULL) 3595 return NULL; 3596 3597 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3598 "locale", str, len, 3599 (Py_ssize_t)error_pos, 3600 (Py_ssize_t)(error_pos+1), 3601 reason); 3602 Py_DECREF(reason); 3603 if (exc != NULL) { 3604 PyCodec_StrictErrors(exc); 3605 Py_XDECREF(exc); 3606 } 3607 return NULL; 3608} 3609 3610PyObject* 3611PyUnicode_DecodeLocale(const char *str, const char *errors) 3612{ 3613 Py_ssize_t size = (Py_ssize_t)strlen(str); 3614 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3615} 3616 3617 3618PyObject* 3619PyUnicode_DecodeFSDefault(const char *s) { 3620 Py_ssize_t size = (Py_ssize_t)strlen(s); 3621 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3622} 3623 3624PyObject* 3625PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3626{ 3627#ifdef HAVE_MBCS 3628 return PyUnicode_DecodeMBCS(s, size, NULL); 3629#elif defined(__APPLE__) 3630 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3631#else 3632 PyInterpreterState *interp = PyThreadState_GET()->interp; 3633 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3634 cannot use it to encode and decode filenames before it is loaded. Load 3635 the Python codec requires to encode at least its own filename. Use the C 3636 version of the locale codec until the codec registry is initialized and 3637 the Python codec is loaded. 3638 3639 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3640 cannot only rely on it: check also interp->fscodec_initialized for 3641 subinterpreters. 
*/ 3642 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3643 return PyUnicode_Decode(s, size, 3644 Py_FileSystemDefaultEncoding, 3645 "surrogateescape"); 3646 } 3647 else { 3648 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3649 } 3650#endif 3651} 3652 3653 3654int 3655_PyUnicode_HasNULChars(PyObject* str) 3656{ 3657 Py_ssize_t pos; 3658 3659 if (PyUnicode_READY(str) == -1) 3660 return -1; 3661 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str), 3662 PyUnicode_GET_LENGTH(str), '\0', 1); 3663 if (pos == -1) 3664 return 0; 3665 else 3666 return 1; 3667} 3668 3669int 3670PyUnicode_FSConverter(PyObject* arg, void* addr) 3671{ 3672 PyObject *output = NULL; 3673 Py_ssize_t size; 3674 void *data; 3675 if (arg == NULL) { 3676 Py_DECREF(*(PyObject**)addr); 3677 return 1; 3678 } 3679 if (PyBytes_Check(arg)) { 3680 output = arg; 3681 Py_INCREF(output); 3682 } 3683 else { 3684 arg = PyUnicode_FromObject(arg); 3685 if (!arg) 3686 return 0; 3687 output = PyUnicode_EncodeFSDefault(arg); 3688 Py_DECREF(arg); 3689 if (!output) 3690 return 0; 3691 if (!PyBytes_Check(output)) { 3692 Py_DECREF(output); 3693 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3694 return 0; 3695 } 3696 } 3697 size = PyBytes_GET_SIZE(output); 3698 data = PyBytes_AS_STRING(output); 3699 if ((size_t)size != strlen(data)) { 3700 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3701 Py_DECREF(output); 3702 return 0; 3703 } 3704 *(PyObject**)addr = output; 3705 return Py_CLEANUP_SUPPORTED; 3706} 3707 3708 3709int 3710PyUnicode_FSDecoder(PyObject* arg, void* addr) 3711{ 3712 PyObject *output = NULL; 3713 if (arg == NULL) { 3714 Py_DECREF(*(PyObject**)addr); 3715 return 1; 3716 } 3717 if (PyUnicode_Check(arg)) { 3718 if (PyUnicode_READY(arg) == -1) 3719 return 0; 3720 output = arg; 3721 Py_INCREF(output); 3722 } 3723 else { 3724 arg = PyBytes_FromObject(arg); 3725 if (!arg) 3726 return 0; 3727 output = 
PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3728 PyBytes_GET_SIZE(arg)); 3729 Py_DECREF(arg); 3730 if (!output) 3731 return 0; 3732 if (!PyUnicode_Check(output)) { 3733 Py_DECREF(output); 3734 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3735 return 0; 3736 } 3737 } 3738 if (PyUnicode_READY(output) == -1) { 3739 Py_DECREF(output); 3740 return 0; 3741 } 3742 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3743 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3744 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3745 Py_DECREF(output); 3746 return 0; 3747 } 3748 *(PyObject**)addr = output; 3749 return Py_CLEANUP_SUPPORTED; 3750} 3751 3752 3753char* 3754PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3755{ 3756 PyObject *bytes; 3757 3758 if (!PyUnicode_Check(unicode)) { 3759 PyErr_BadArgument(); 3760 return NULL; 3761 } 3762 if (PyUnicode_READY(unicode) == -1) 3763 return NULL; 3764 3765 if (PyUnicode_UTF8(unicode) == NULL) { 3766 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3767 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3768 if (bytes == NULL) 3769 return NULL; 3770 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3771 if (_PyUnicode_UTF8(unicode) == NULL) { 3772 PyErr_NoMemory(); 3773 Py_DECREF(bytes); 3774 return NULL; 3775 } 3776 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3777 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3778 PyBytes_AS_STRING(bytes), 3779 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3780 Py_DECREF(bytes); 3781 } 3782 3783 if (psize) 3784 *psize = PyUnicode_UTF8_LENGTH(unicode); 3785 return PyUnicode_UTF8(unicode); 3786} 3787 3788char* 3789PyUnicode_AsUTF8(PyObject *unicode) 3790{ 3791 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3792} 3793 3794Py_UNICODE * 3795PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3796{ 3797 const unsigned char *one_byte; 3798#if SIZEOF_WCHAR_T == 4 3799 const Py_UCS2 *two_bytes; 3800#else 3801 
const Py_UCS4 *four_bytes; 3802 const Py_UCS4 *ucs4_end; 3803 Py_ssize_t num_surrogates; 3804#endif 3805 wchar_t *w; 3806 wchar_t *wchar_end; 3807 3808 if (!PyUnicode_Check(unicode)) { 3809 PyErr_BadArgument(); 3810 return NULL; 3811 } 3812 if (_PyUnicode_WSTR(unicode) == NULL) { 3813 /* Non-ASCII compact unicode object */ 3814 assert(_PyUnicode_KIND(unicode) != 0); 3815 assert(PyUnicode_IS_READY(unicode)); 3816 3817 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3818#if SIZEOF_WCHAR_T == 2 3819 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3820 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3821 num_surrogates = 0; 3822 3823 for (; four_bytes < ucs4_end; ++four_bytes) { 3824 if (*four_bytes > 0xFFFF) 3825 ++num_surrogates; 3826 } 3827 3828 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3829 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3830 if (!_PyUnicode_WSTR(unicode)) { 3831 PyErr_NoMemory(); 3832 return NULL; 3833 } 3834 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3835 3836 w = _PyUnicode_WSTR(unicode); 3837 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3838 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3839 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3840 if (*four_bytes > 0xFFFF) { 3841 assert(*four_bytes <= MAX_UNICODE); 3842 /* encode surrogate pair in this case */ 3843 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3844 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3845 } 3846 else 3847 *w = *four_bytes; 3848 3849 if (w > wchar_end) { 3850 assert(0 && "Miscalculated string end"); 3851 } 3852 } 3853 *w = 0; 3854#else 3855 /* sizeof(wchar_t) == 4 */ 3856 Py_FatalError("Impossible unicode object state, wstr and str " 3857 "should share memory already."); 3858 return NULL; 3859#endif 3860 } 3861 else { 3862 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3863 (_PyUnicode_LENGTH(unicode) + 1)); 3864 if (!_PyUnicode_WSTR(unicode)) { 3865 
PyErr_NoMemory(); 3866 return NULL; 3867 } 3868 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3869 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3870 w = _PyUnicode_WSTR(unicode); 3871 wchar_end = w + _PyUnicode_LENGTH(unicode); 3872 3873 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3874 one_byte = PyUnicode_1BYTE_DATA(unicode); 3875 for (; w < wchar_end; ++one_byte, ++w) 3876 *w = *one_byte; 3877 /* null-terminate the wstr */ 3878 *w = 0; 3879 } 3880 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3881#if SIZEOF_WCHAR_T == 4 3882 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3883 for (; w < wchar_end; ++two_bytes, ++w) 3884 *w = *two_bytes; 3885 /* null-terminate the wstr */ 3886 *w = 0; 3887#else 3888 /* sizeof(wchar_t) == 2 */ 3889 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3890 _PyUnicode_WSTR(unicode) = NULL; 3891 Py_FatalError("Impossible unicode object state, wstr " 3892 "and str should share memory already."); 3893 return NULL; 3894#endif 3895 } 3896 else { 3897 assert(0 && "This should never happen."); 3898 } 3899 } 3900 } 3901 if (size != NULL) 3902 *size = PyUnicode_WSTR_LENGTH(unicode); 3903 return _PyUnicode_WSTR(unicode); 3904} 3905 3906Py_UNICODE * 3907PyUnicode_AsUnicode(PyObject *unicode) 3908{ 3909 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3910} 3911 3912 3913Py_ssize_t 3914PyUnicode_GetSize(PyObject *unicode) 3915{ 3916 if (!PyUnicode_Check(unicode)) { 3917 PyErr_BadArgument(); 3918 goto onError; 3919 } 3920 return PyUnicode_GET_SIZE(unicode); 3921 3922 onError: 3923 return -1; 3924} 3925 3926Py_ssize_t 3927PyUnicode_GetLength(PyObject *unicode) 3928{ 3929 if (!PyUnicode_Check(unicode)) { 3930 PyErr_BadArgument(); 3931 return -1; 3932 } 3933 if (PyUnicode_READY(unicode) == -1) 3934 return -1; 3935 return PyUnicode_GET_LENGTH(unicode); 3936} 3937 3938Py_UCS4 3939PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3940{ 3941 void *data; 3942 int kind; 3943 3944 if (!PyUnicode_Check(unicode) || 
PyUnicode_READY(unicode) == -1) { 3945 PyErr_BadArgument(); 3946 return (Py_UCS4)-1; 3947 } 3948 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3949 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3950 return (Py_UCS4)-1; 3951 } 3952 data = PyUnicode_DATA(unicode); 3953 kind = PyUnicode_KIND(unicode); 3954 return PyUnicode_READ(kind, data, index); 3955} 3956 3957int 3958PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3959{ 3960 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3961 PyErr_BadArgument(); 3962 return -1; 3963 } 3964 assert(PyUnicode_IS_READY(unicode)); 3965 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3966 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3967 return -1; 3968 } 3969 if (unicode_check_modifiable(unicode)) 3970 return -1; 3971 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 3972 PyErr_SetString(PyExc_ValueError, "character out of range"); 3973 return -1; 3974 } 3975 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3976 index, ch); 3977 return 0; 3978} 3979 3980const char * 3981PyUnicode_GetDefaultEncoding(void) 3982{ 3983 return "utf-8"; 3984} 3985 3986/* create or adjust a UnicodeDecodeError */ 3987static void 3988make_decode_exception(PyObject **exceptionObject, 3989 const char *encoding, 3990 const char *input, Py_ssize_t length, 3991 Py_ssize_t startpos, Py_ssize_t endpos, 3992 const char *reason) 3993{ 3994 if (*exceptionObject == NULL) { 3995 *exceptionObject = PyUnicodeDecodeError_Create( 3996 encoding, input, length, startpos, endpos, reason); 3997 } 3998 else { 3999 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 4000 goto onError; 4001 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 4002 goto onError; 4003 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4004 goto onError; 4005 } 4006 return; 4007 4008onError: 4009 Py_CLEAR(*exceptionObject); 4010} 4011 4012#ifdef HAVE_MBCS 4013/* 
error handling callback helper: 4014 build arguments, call the callback and check the arguments, 4015 if no exception occurred, copy the replacement to the output 4016 and adjust various state variables. 4017 return 0 on success, -1 on error 4018*/ 4019 4020static int 4021unicode_decode_call_errorhandler_wchar( 4022 const char *errors, PyObject **errorHandler, 4023 const char *encoding, const char *reason, 4024 const char **input, const char **inend, Py_ssize_t *startinpos, 4025 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4026 PyObject **output, Py_ssize_t *outpos) 4027{ 4028 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4029 4030 PyObject *restuple = NULL; 4031 PyObject *repunicode = NULL; 4032 Py_ssize_t outsize; 4033 Py_ssize_t insize; 4034 Py_ssize_t requiredsize; 4035 Py_ssize_t newpos; 4036 PyObject *inputobj = NULL; 4037 wchar_t *repwstr; 4038 Py_ssize_t repwlen; 4039 4040 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND); 4041 outsize = _PyUnicode_WSTR_LENGTH(*output); 4042 4043 if (*errorHandler == NULL) { 4044 *errorHandler = PyCodec_LookupError(errors); 4045 if (*errorHandler == NULL) 4046 goto onError; 4047 } 4048 4049 make_decode_exception(exceptionObject, 4050 encoding, 4051 *input, *inend - *input, 4052 *startinpos, *endinpos, 4053 reason); 4054 if (*exceptionObject == NULL) 4055 goto onError; 4056 4057 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4058 if (restuple == NULL) 4059 goto onError; 4060 if (!PyTuple_Check(restuple)) { 4061 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4062 goto onError; 4063 } 4064 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4065 goto onError; 4066 4067 /* Copy back the bytes variables, which might have been modified by the 4068 callback */ 4069 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4070 if (!inputobj) 4071 goto onError; 4072 if (!PyBytes_Check(inputobj)) { 
4073 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4074 } 4075 *input = PyBytes_AS_STRING(inputobj); 4076 insize = PyBytes_GET_SIZE(inputobj); 4077 *inend = *input + insize; 4078 /* we can DECREF safely, as the exception has another reference, 4079 so the object won't go away. */ 4080 Py_DECREF(inputobj); 4081 4082 if (newpos<0) 4083 newpos = insize+newpos; 4084 if (newpos<0 || newpos>insize) { 4085 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4086 goto onError; 4087 } 4088 4089 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4090 if (repwstr == NULL) 4091 goto onError; 4092 /* need more space? (at least enough for what we 4093 have+the replacement+the rest of the string (starting 4094 at the new input position), so we won't have to check space 4095 when there are no errors in the rest of the string) */ 4096 requiredsize = *outpos; 4097 if (requiredsize > PY_SSIZE_T_MAX - repwlen) 4098 goto overflow; 4099 requiredsize += repwlen; 4100 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 4101 goto overflow; 4102 requiredsize += insize - newpos; 4103 if (requiredsize > outsize) { 4104 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 4105 requiredsize = 2*outsize; 4106 if (unicode_resize(output, requiredsize) < 0) 4107 goto onError; 4108 } 4109 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4110 *outpos += repwlen; 4111 *endinpos = newpos; 4112 *inptr = *input + newpos; 4113 4114 /* we made it! 
*/ 4115 Py_XDECREF(restuple); 4116 return 0; 4117 4118 overflow: 4119 PyErr_SetString(PyExc_OverflowError, 4120 "decoded result is too long for a Python string"); 4121 4122 onError: 4123 Py_XDECREF(restuple); 4124 return -1; 4125} 4126#endif /* HAVE_MBCS */ 4127 4128static int 4129unicode_decode_call_errorhandler_writer( 4130 const char *errors, PyObject **errorHandler, 4131 const char *encoding, const char *reason, 4132 const char **input, const char **inend, Py_ssize_t *startinpos, 4133 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4134 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4135{ 4136 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4137 4138 PyObject *restuple = NULL; 4139 PyObject *repunicode = NULL; 4140 Py_ssize_t insize; 4141 Py_ssize_t newpos; 4142 Py_ssize_t replen; 4143 PyObject *inputobj = NULL; 4144 4145 if (*errorHandler == NULL) { 4146 *errorHandler = PyCodec_LookupError(errors); 4147 if (*errorHandler == NULL) 4148 goto onError; 4149 } 4150 4151 make_decode_exception(exceptionObject, 4152 encoding, 4153 *input, *inend - *input, 4154 *startinpos, *endinpos, 4155 reason); 4156 if (*exceptionObject == NULL) 4157 goto onError; 4158 4159 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4160 if (restuple == NULL) 4161 goto onError; 4162 if (!PyTuple_Check(restuple)) { 4163 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4164 goto onError; 4165 } 4166 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4167 goto onError; 4168 4169 /* Copy back the bytes variables, which might have been modified by the 4170 callback */ 4171 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4172 if (!inputobj) 4173 goto onError; 4174 if (!PyBytes_Check(inputobj)) { 4175 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4176 } 4177 *input = PyBytes_AS_STRING(inputobj); 4178 insize = 
PyBytes_GET_SIZE(inputobj); 4179 *inend = *input + insize; 4180 /* we can DECREF safely, as the exception has another reference, 4181 so the object won't go away. */ 4182 Py_DECREF(inputobj); 4183 4184 if (newpos<0) 4185 newpos = insize+newpos; 4186 if (newpos<0 || newpos>insize) { 4187 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4188 goto onError; 4189 } 4190 4191 if (PyUnicode_READY(repunicode) < 0) 4192 goto onError; 4193 replen = PyUnicode_GET_LENGTH(repunicode); 4194 writer->min_length += replen; 4195 if (replen > 1) 4196 writer->overallocate = 1; 4197 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4198 goto onError; 4199 4200 *endinpos = newpos; 4201 *inptr = *input + newpos; 4202 4203 /* we made it! */ 4204 Py_XDECREF(restuple); 4205 return 0; 4206 4207 onError: 4208 Py_XDECREF(restuple); 4209 return -1; 4210} 4211 4212/* --- UTF-7 Codec -------------------------------------------------------- */ 4213 4214/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4215 4216/* Three simple macros defining base-64. */ 4217 4218/* Is c a base-64 character? */ 4219 4220#define IS_BASE64(c) \ 4221 (((c) >= 'A' && (c) <= 'Z') || \ 4222 ((c) >= 'a' && (c) <= 'z') || \ 4223 ((c) >= '0' && (c) <= '9') || \ 4224 (c) == '+' || (c) == '/') 4225 4226/* given that c is a base-64 character, what is its base-64 value? */ 4227 4228#define FROM_BASE64(c) \ 4229 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4230 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4231 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4232 (c) == '+' ? 62 : 63) 4233 4234/* What is the base-64 character of the bottom 6 bits of n? */ 4235 4236#define TO_BASE64(n) \ 4237 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4238 4239/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4240 * decoded as itself. 
We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Indexed by ASCII code (0-127), 16 entries per row; the comment above
 * each row names the characters that row covers. */
static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  
*/

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* Stateless UTF-7 decode: PyUnicode_DecodeUTF7Stateful() with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate). */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;               /* non-zero while inside a '+...' section */
    Py_ssize_t shiftOutStart;      /* writer.pos when the current shift began */
    unsigned int base64bits = 0;   /* number of valid bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;         /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* also entered via goto from the end-of-string error handling
           below, when the error handler moved s back inside the input */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as-is */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* report everything from the opening '+' as unconsumed */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* build the result from only the characters written
                   before the shift sequence started */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}


/* Encode `str` as UTF-7 and return a new bytes object, or NULL on error.
   Set O characters are written directly when base64SetO is zero, and
   whitespace is written directly when base64WhiteSpace is zero (see the
   ENCODE_DIRECT calls).  `errors` is not referenced in this function. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;
    Py_ssize_t i;
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    char * out;
    char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* flush any remaining bits and close an open shift sequence */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    /* shrink the worst-case allocation to the bytes actually written */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
/* Py_UNICODE-buffer variant: builds a temporary str object from (s, size)
   and encodes it with _PyUnicode_EncodeUTF7(). */
PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                     Py_ssize_t size,
                     int base64SetO,
                     int base64WhiteSpace,
                     const char *errors)
{
    PyObject *result;
    PyObject *tmp = PyUnicode_FromUnicode(s, size);
    if (tmp == NULL)
        return NULL;
    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
                                   base64WhiteSpace, errors);
    Py_DECREF(tmp);
    return result;
}

#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT

/* --- UTF-8 Codec -------------------------------------------------------- */

/* Stateless UTF-8 decode: PyUnicode_DecodeUTF8Stateful() with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}

/* stringlib/codecs.h is included once per character width, between the
   matching *lib.h header and stringlib/undef.h. */
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

/* Copy the leading run of bytes without the high bit set from
   [start, end) into dest, stopping at the first byte that has it set.
   Returns the number of bytes copied. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* test and copy one long at a time while source and destination
           are both aligned */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* general path: scan for the end of the ASCII run, then copy it with
       a single memcpy() */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}

/* Decode `size` bytes of UTF-8 from `s` into a new str object.  Invalid
   input is passed to the error handler named by `errors`.  When
   `consumed` is not NULL, an incomplete trailing sequence is left
   undecoded and the number of bytes decoded is stored in *consumed.
   Returns NULL with an exception set on error. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* decode the leading ASCII run in one go */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;
        /* use the decoder matching the writer's current buffer kind */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* Return values 0-4 are status codes handled below; any other
           value is written to the output as a character. */
        switch (ch) {
        case 0:
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}

#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            assert(Py_UNICODE_IS_SURROGATE(ch));
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */

/* Primary internal function which creates utf8 encoded bytes objects.

   Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space needed at the end.  
Else allocate the 4883 maximum possible needed (4 result bytes per Unicode character), and return 4884 the excess memory at the end. 4885*/ 4886PyObject * 4887_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4888{ 4889 enum PyUnicode_Kind kind; 4890 void *data; 4891 Py_ssize_t size; 4892 4893 if (!PyUnicode_Check(unicode)) { 4894 PyErr_BadArgument(); 4895 return NULL; 4896 } 4897 4898 if (PyUnicode_READY(unicode) == -1) 4899 return NULL; 4900 4901 if (PyUnicode_UTF8(unicode)) 4902 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4903 PyUnicode_UTF8_LENGTH(unicode)); 4904 4905 kind = PyUnicode_KIND(unicode); 4906 data = PyUnicode_DATA(unicode); 4907 size = PyUnicode_GET_LENGTH(unicode); 4908 4909 switch (kind) { 4910 default: 4911 assert(0); 4912 case PyUnicode_1BYTE_KIND: 4913 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4914 assert(!PyUnicode_IS_ASCII(unicode)); 4915 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4916 case PyUnicode_2BYTE_KIND: 4917 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4918 case PyUnicode_4BYTE_KIND: 4919 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4920 } 4921} 4922 4923PyObject * 4924PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4925 Py_ssize_t size, 4926 const char *errors) 4927{ 4928 PyObject *v, *unicode; 4929 4930 unicode = PyUnicode_FromUnicode(s, size); 4931 if (unicode == NULL) 4932 return NULL; 4933 v = _PyUnicode_AsUTF8String(unicode, errors); 4934 Py_DECREF(unicode); 4935 return v; 4936} 4937 4938PyObject * 4939PyUnicode_AsUTF8String(PyObject *unicode) 4940{ 4941 return _PyUnicode_AsUTF8String(unicode, NULL); 4942} 4943 4944/* --- UTF-32 Codec ------------------------------------------------------- */ 4945 4946PyObject * 4947PyUnicode_DecodeUTF32(const char *s, 4948 Py_ssize_t size, 4949 const char *errors, 4950 int *byteorder) 4951{ 4952 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4953} 4954 4955PyObject * 
4956PyUnicode_DecodeUTF32Stateful(const char *s, 4957 Py_ssize_t size, 4958 const char *errors, 4959 int *byteorder, 4960 Py_ssize_t *consumed) 4961{ 4962 const char *starts = s; 4963 Py_ssize_t startinpos; 4964 Py_ssize_t endinpos; 4965 _PyUnicodeWriter writer; 4966 const unsigned char *q, *e; 4967 int le, bo = 0; /* assume native ordering by default */ 4968 const char *encoding; 4969 const char *errmsg = ""; 4970 PyObject *errorHandler = NULL; 4971 PyObject *exc = NULL; 4972 4973 q = (unsigned char *)s; 4974 e = q + size; 4975 4976 if (byteorder) 4977 bo = *byteorder; 4978 4979 /* Check for BOM marks (U+FEFF) in the input and adjust current 4980 byte order setting accordingly. In native mode, the leading BOM 4981 mark is skipped, in all other modes, it is copied to the output 4982 stream as-is (giving a ZWNBSP character). */ 4983 if (bo == 0 && size >= 4) { 4984 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 4985 if (bom == 0x0000FEFF) { 4986 bo = -1; 4987 q += 4; 4988 } 4989 else if (bom == 0xFFFE0000) { 4990 bo = 1; 4991 q += 4; 4992 } 4993 if (byteorder) 4994 *byteorder = bo; 4995 } 4996 4997 if (q == e) { 4998 if (consumed) 4999 *consumed = size; 5000 _Py_RETURN_UNICODE_EMPTY(); 5001 } 5002 5003#ifdef WORDS_BIGENDIAN 5004 le = bo < 0; 5005#else 5006 le = bo <= 0; 5007#endif 5008 encoding = le ? 
"utf-32-le" : "utf-32-be"; 5009 5010 _PyUnicodeWriter_Init(&writer); 5011 writer.min_length = (e - q + 3) / 4; 5012 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5013 goto onError; 5014 5015 while (1) { 5016 Py_UCS4 ch = 0; 5017 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 5018 5019 if (e - q >= 4) { 5020 enum PyUnicode_Kind kind = writer.kind; 5021 void *data = writer.data; 5022 const unsigned char *last = e - 4; 5023 Py_ssize_t pos = writer.pos; 5024 if (le) { 5025 do { 5026 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5027 if (ch > maxch) 5028 break; 5029 if (kind != PyUnicode_1BYTE_KIND && 5030 Py_UNICODE_IS_SURROGATE(ch)) 5031 break; 5032 PyUnicode_WRITE(kind, data, pos++, ch); 5033 q += 4; 5034 } while (q <= last); 5035 } 5036 else { 5037 do { 5038 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 5039 if (ch > maxch) 5040 break; 5041 if (kind != PyUnicode_1BYTE_KIND && 5042 Py_UNICODE_IS_SURROGATE(ch)) 5043 break; 5044 PyUnicode_WRITE(kind, data, pos++, ch); 5045 q += 4; 5046 } while (q <= last); 5047 } 5048 writer.pos = pos; 5049 } 5050 5051 if (Py_UNICODE_IS_SURROGATE(ch)) { 5052 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)"; 5053 startinpos = ((const char *)q) - starts; 5054 endinpos = startinpos + 4; 5055 } 5056 else if (ch <= maxch) { 5057 if (q == e || consumed) 5058 break; 5059 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5060 errmsg = "truncated data"; 5061 startinpos = ((const char *)q) - starts; 5062 endinpos = ((const char *)e) - starts; 5063 } 5064 else { 5065 if (ch < 0x110000) { 5066 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5067 goto onError; 5068 q += 4; 5069 continue; 5070 } 5071 errmsg = "codepoint not in range(0x110000)"; 5072 startinpos = ((const char *)q) - starts; 5073 endinpos = startinpos + 4; 5074 } 5075 5076 /* The remaining input chars are ignored if the callback 5077 chooses to skip the input */ 5078 if (unicode_decode_call_errorhandler_writer( 5079 errors, &errorHandler, 5080 encoding, errmsg, 5081 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5082 &writer)) 5083 goto onError; 5084 } 5085 5086 if (consumed) 5087 *consumed = (const char *)q-starts; 5088 5089 Py_XDECREF(errorHandler); 5090 Py_XDECREF(exc); 5091 return _PyUnicodeWriter_Finish(&writer); 5092 5093 onError: 5094 _PyUnicodeWriter_Dealloc(&writer); 5095 Py_XDECREF(errorHandler); 5096 Py_XDECREF(exc); 5097 return NULL; 5098} 5099 5100PyObject * 5101_PyUnicode_EncodeUTF32(PyObject *str, 5102 const char *errors, 5103 int byteorder) 5104{ 5105 int kind; 5106 void *data; 5107 Py_ssize_t len; 5108 PyObject *v; 5109 unsigned char *p; 5110 Py_ssize_t nsize, i; 5111 /* Offsets from p for storing byte pairs in the right order. 
*/ 5112#if PY_LITTLE_ENDIAN 5113 int iorder[] = {0, 1, 2, 3}; 5114#else 5115 int iorder[] = {3, 2, 1, 0}; 5116#endif 5117 const char *encoding; 5118 PyObject *errorHandler = NULL; 5119 PyObject *exc = NULL; 5120 PyObject *rep = NULL; 5121 5122#define STORECHAR(CH) \ 5123 do { \ 5124 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5125 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5126 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5127 p[iorder[0]] = (CH) & 0xff; \ 5128 p += 4; \ 5129 } while(0) 5130 5131 if (!PyUnicode_Check(str)) { 5132 PyErr_BadArgument(); 5133 return NULL; 5134 } 5135 if (PyUnicode_READY(str) == -1) 5136 return NULL; 5137 kind = PyUnicode_KIND(str); 5138 data = PyUnicode_DATA(str); 5139 len = PyUnicode_GET_LENGTH(str); 5140 5141 nsize = len + (byteorder == 0); 5142 if (nsize > PY_SSIZE_T_MAX / 4) 5143 return PyErr_NoMemory(); 5144 v = PyBytes_FromStringAndSize(NULL, nsize * 4); 5145 if (v == NULL) 5146 return NULL; 5147 5148 p = (unsigned char *)PyBytes_AS_STRING(v); 5149 if (byteorder == 0) 5150 STORECHAR(0xFEFF); 5151 if (len == 0) 5152 return v; 5153 5154 if (byteorder == -1) { 5155 /* force LE */ 5156 iorder[0] = 0; 5157 iorder[1] = 1; 5158 iorder[2] = 2; 5159 iorder[3] = 3; 5160 encoding = "utf-32-le"; 5161 } 5162 else if (byteorder == 1) { 5163 /* force BE */ 5164 iorder[0] = 3; 5165 iorder[1] = 2; 5166 iorder[2] = 1; 5167 iorder[3] = 0; 5168 encoding = "utf-32-be"; 5169 } 5170 else 5171 encoding = "utf-32"; 5172 5173 if (kind == PyUnicode_1BYTE_KIND) { 5174 for (i = 0; i < len; i++) 5175 STORECHAR(PyUnicode_READ(kind, data, i)); 5176 return v; 5177 } 5178 5179 for (i = 0; i < len;) { 5180 Py_ssize_t repsize, moreunits; 5181 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5182 i++; 5183 assert(ch <= MAX_UNICODE); 5184 if (!Py_UNICODE_IS_SURROGATE(ch)) { 5185 STORECHAR(ch); 5186 continue; 5187 } 5188 5189 rep = unicode_encode_call_errorhandler( 5190 errors, &errorHandler, 5191 encoding, "surrogates not allowed", 5192 str, &exc, i-1, i, &i); 5193 5194 if (!rep) 5195 goto 
error; 5196 5197 if (PyBytes_Check(rep)) { 5198 repsize = PyBytes_GET_SIZE(rep); 5199 if (repsize & 3) { 5200 raise_encode_exception(&exc, encoding, 5201 str, i - 1, i, 5202 "surrogates not allowed"); 5203 goto error; 5204 } 5205 moreunits = repsize / 4; 5206 } 5207 else { 5208 assert(PyUnicode_Check(rep)); 5209 if (PyUnicode_READY(rep) < 0) 5210 goto error; 5211 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5212 if (!PyUnicode_IS_ASCII(rep)) { 5213 raise_encode_exception(&exc, encoding, 5214 str, i - 1, i, 5215 "surrogates not allowed"); 5216 goto error; 5217 } 5218 } 5219 5220 /* four bytes are reserved for each surrogate */ 5221 if (moreunits > 1) { 5222 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v); 5223 Py_ssize_t morebytes = 4 * (moreunits - 1); 5224 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5225 /* integer overflow */ 5226 PyErr_NoMemory(); 5227 goto error; 5228 } 5229 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5230 goto error; 5231 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos; 5232 } 5233 5234 if (PyBytes_Check(rep)) { 5235 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize); 5236 p += repsize; 5237 } else /* rep is unicode */ { 5238 const Py_UCS1 *repdata; 5239 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5240 repdata = PyUnicode_1BYTE_DATA(rep); 5241 while (repsize--) { 5242 Py_UCS4 ch = *repdata++; 5243 STORECHAR(ch); 5244 } 5245 } 5246 5247 Py_CLEAR(rep); 5248 } 5249 5250 /* Cut back to size actually needed. This is necessary for, for example, 5251 encoding of a string containing isolated surrogates and the 'ignore' 5252 handler is used. 
*/ 5253 nsize = p - (unsigned char*) PyBytes_AS_STRING(v); 5254 if (nsize != PyBytes_GET_SIZE(v)) 5255 _PyBytes_Resize(&v, nsize); 5256 Py_XDECREF(errorHandler); 5257 Py_XDECREF(exc); 5258 return v; 5259 error: 5260 Py_XDECREF(rep); 5261 Py_XDECREF(errorHandler); 5262 Py_XDECREF(exc); 5263 Py_XDECREF(v); 5264 return NULL; 5265#undef STORECHAR 5266} 5267 5268PyObject * 5269PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5270 Py_ssize_t size, 5271 const char *errors, 5272 int byteorder) 5273{ 5274 PyObject *result; 5275 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5276 if (tmp == NULL) 5277 return NULL; 5278 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5279 Py_DECREF(tmp); 5280 return result; 5281} 5282 5283PyObject * 5284PyUnicode_AsUTF32String(PyObject *unicode) 5285{ 5286 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5287} 5288 5289/* --- UTF-16 Codec ------------------------------------------------------- */ 5290 5291PyObject * 5292PyUnicode_DecodeUTF16(const char *s, 5293 Py_ssize_t size, 5294 const char *errors, 5295 int *byteorder) 5296{ 5297 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5298} 5299 5300PyObject * 5301PyUnicode_DecodeUTF16Stateful(const char *s, 5302 Py_ssize_t size, 5303 const char *errors, 5304 int *byteorder, 5305 Py_ssize_t *consumed) 5306{ 5307 const char *starts = s; 5308 Py_ssize_t startinpos; 5309 Py_ssize_t endinpos; 5310 _PyUnicodeWriter writer; 5311 const unsigned char *q, *e; 5312 int bo = 0; /* assume native ordering by default */ 5313 int native_ordering; 5314 const char *errmsg = ""; 5315 PyObject *errorHandler = NULL; 5316 PyObject *exc = NULL; 5317 const char *encoding; 5318 5319 q = (unsigned char *)s; 5320 e = q + size; 5321 5322 if (byteorder) 5323 bo = *byteorder; 5324 5325 /* Check for BOM marks (U+FEFF) in the input and adjust current 5326 byte order setting accordingly. 
In native mode, the leading BOM 5327 mark is skipped, in all other modes, it is copied to the output 5328 stream as-is (giving a ZWNBSP character). */ 5329 if (bo == 0 && size >= 2) { 5330 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5331 if (bom == 0xFEFF) { 5332 q += 2; 5333 bo = -1; 5334 } 5335 else if (bom == 0xFFFE) { 5336 q += 2; 5337 bo = 1; 5338 } 5339 if (byteorder) 5340 *byteorder = bo; 5341 } 5342 5343 if (q == e) { 5344 if (consumed) 5345 *consumed = size; 5346 _Py_RETURN_UNICODE_EMPTY(); 5347 } 5348 5349#if PY_LITTLE_ENDIAN 5350 native_ordering = bo <= 0; 5351 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; 5352#else 5353 native_ordering = bo >= 0; 5354 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le"; 5355#endif 5356 5357 /* Note: size will always be longer than the resulting Unicode 5358 character count */ 5359 _PyUnicodeWriter_Init(&writer); 5360 writer.min_length = (e - q + 1) / 2; 5361 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5362 goto onError; 5363 5364 while (1) { 5365 Py_UCS4 ch = 0; 5366 if (e - q >= 2) { 5367 int kind = writer.kind; 5368 if (kind == PyUnicode_1BYTE_KIND) { 5369 if (PyUnicode_IS_ASCII(writer.buffer)) 5370 ch = asciilib_utf16_decode(&q, e, 5371 (Py_UCS1*)writer.data, &writer.pos, 5372 native_ordering); 5373 else 5374 ch = ucs1lib_utf16_decode(&q, e, 5375 (Py_UCS1*)writer.data, &writer.pos, 5376 native_ordering); 5377 } else if (kind == PyUnicode_2BYTE_KIND) { 5378 ch = ucs2lib_utf16_decode(&q, e, 5379 (Py_UCS2*)writer.data, &writer.pos, 5380 native_ordering); 5381 } else { 5382 assert(kind == PyUnicode_4BYTE_KIND); 5383 ch = ucs4lib_utf16_decode(&q, e, 5384 (Py_UCS4*)writer.data, &writer.pos, 5385 native_ordering); 5386 } 5387 } 5388 5389 switch (ch) 5390 { 5391 case 0: 5392 /* remaining byte at the end? 
(size should be even) */ 5393 if (q == e || consumed) 5394 goto End; 5395 errmsg = "truncated data"; 5396 startinpos = ((const char *)q) - starts; 5397 endinpos = ((const char *)e) - starts; 5398 break; 5399 /* The remaining input chars are ignored if the callback 5400 chooses to skip the input */ 5401 case 1: 5402 q -= 2; 5403 if (consumed) 5404 goto End; 5405 errmsg = "unexpected end of data"; 5406 startinpos = ((const char *)q) - starts; 5407 endinpos = ((const char *)e) - starts; 5408 break; 5409 case 2: 5410 errmsg = "illegal encoding"; 5411 startinpos = ((const char *)q) - 2 - starts; 5412 endinpos = startinpos + 2; 5413 break; 5414 case 3: 5415 errmsg = "illegal UTF-16 surrogate"; 5416 startinpos = ((const char *)q) - 4 - starts; 5417 endinpos = startinpos + 2; 5418 break; 5419 default: 5420 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5421 goto onError; 5422 continue; 5423 } 5424 5425 if (unicode_decode_call_errorhandler_writer( 5426 errors, 5427 &errorHandler, 5428 encoding, errmsg, 5429 &starts, 5430 (const char **)&e, 5431 &startinpos, 5432 &endinpos, 5433 &exc, 5434 (const char **)&q, 5435 &writer)) 5436 goto onError; 5437 } 5438 5439End: 5440 if (consumed) 5441 *consumed = (const char *)q-starts; 5442 5443 Py_XDECREF(errorHandler); 5444 Py_XDECREF(exc); 5445 return _PyUnicodeWriter_Finish(&writer); 5446 5447 onError: 5448 _PyUnicodeWriter_Dealloc(&writer); 5449 Py_XDECREF(errorHandler); 5450 Py_XDECREF(exc); 5451 return NULL; 5452} 5453 5454PyObject * 5455_PyUnicode_EncodeUTF16(PyObject *str, 5456 const char *errors, 5457 int byteorder) 5458{ 5459 enum PyUnicode_Kind kind; 5460 const void *data; 5461 Py_ssize_t len; 5462 PyObject *v; 5463 unsigned short *out; 5464 Py_ssize_t pairs; 5465#if PY_BIG_ENDIAN 5466 int native_ordering = byteorder >= 0; 5467#else 5468 int native_ordering = byteorder <= 0; 5469#endif 5470 const char *encoding; 5471 Py_ssize_t nsize, pos; 5472 PyObject *errorHandler = NULL; 5473 PyObject *exc = NULL; 5474 PyObject *rep 
= NULL; 5475 5476 if (!PyUnicode_Check(str)) { 5477 PyErr_BadArgument(); 5478 return NULL; 5479 } 5480 if (PyUnicode_READY(str) == -1) 5481 return NULL; 5482 kind = PyUnicode_KIND(str); 5483 data = PyUnicode_DATA(str); 5484 len = PyUnicode_GET_LENGTH(str); 5485 5486 pairs = 0; 5487 if (kind == PyUnicode_4BYTE_KIND) { 5488 const Py_UCS4 *in = (const Py_UCS4 *)data; 5489 const Py_UCS4 *end = in + len; 5490 while (in < end) 5491 if (*in++ >= 0x10000) 5492 pairs++; 5493 } 5494 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5495 return PyErr_NoMemory(); 5496 nsize = len + pairs + (byteorder == 0); 5497 v = PyBytes_FromStringAndSize(NULL, nsize * 2); 5498 if (v == NULL) 5499 return NULL; 5500 5501 /* output buffer is 2-bytes aligned */ 5502 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5503 out = (unsigned short *)PyBytes_AS_STRING(v); 5504 if (byteorder == 0) 5505 *out++ = 0xFEFF; 5506 if (len == 0) 5507 goto done; 5508 5509 if (kind == PyUnicode_1BYTE_KIND) { 5510 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5511 goto done; 5512 } 5513 5514 if (byteorder < 0) 5515 encoding = "utf-16-le"; 5516 else if (byteorder > 0) 5517 encoding = "utf-16-be"; 5518 else 5519 encoding = "utf-16"; 5520 5521 pos = 0; 5522 while (pos < len) { 5523 Py_ssize_t repsize, moreunits; 5524 5525 if (kind == PyUnicode_2BYTE_KIND) { 5526 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, 5527 &out, native_ordering); 5528 } 5529 else { 5530 assert(kind == PyUnicode_4BYTE_KIND); 5531 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, 5532 &out, native_ordering); 5533 } 5534 if (pos == len) 5535 break; 5536 5537 rep = unicode_encode_call_errorhandler( 5538 errors, &errorHandler, 5539 encoding, "surrogates not allowed", 5540 str, &exc, pos, pos + 1, &pos); 5541 if (!rep) 5542 goto error; 5543 5544 if (PyBytes_Check(rep)) { 5545 repsize = PyBytes_GET_SIZE(rep); 5546 if (repsize & 1) { 5547 raise_encode_exception(&exc, encoding, 
5548 str, pos - 1, pos, 5549 "surrogates not allowed"); 5550 goto error; 5551 } 5552 moreunits = repsize / 2; 5553 } 5554 else { 5555 assert(PyUnicode_Check(rep)); 5556 if (PyUnicode_READY(rep) < 0) 5557 goto error; 5558 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5559 if (!PyUnicode_IS_ASCII(rep)) { 5560 raise_encode_exception(&exc, encoding, 5561 str, pos - 1, pos, 5562 "surrogates not allowed"); 5563 goto error; 5564 } 5565 } 5566 5567 /* two bytes are reserved for each surrogate */ 5568 if (moreunits > 1) { 5569 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); 5570 Py_ssize_t morebytes = 2 * (moreunits - 1); 5571 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5572 /* integer overflow */ 5573 PyErr_NoMemory(); 5574 goto error; 5575 } 5576 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5577 goto error; 5578 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; 5579 } 5580 5581 if (PyBytes_Check(rep)) { 5582 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); 5583 out += moreunits; 5584 } else /* rep is unicode */ { 5585 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5586 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5587 &out, native_ordering); 5588 } 5589 5590 Py_CLEAR(rep); 5591 } 5592 5593 /* Cut back to size actually needed. This is necessary for, for example, 5594 encoding of a string containing isolated surrogates and the 'ignore' handler 5595 is used. 
*/ 5596 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5597 if (nsize != PyBytes_GET_SIZE(v)) 5598 _PyBytes_Resize(&v, nsize); 5599 Py_XDECREF(errorHandler); 5600 Py_XDECREF(exc); 5601 done: 5602 return v; 5603 error: 5604 Py_XDECREF(rep); 5605 Py_XDECREF(errorHandler); 5606 Py_XDECREF(exc); 5607 Py_XDECREF(v); 5608 return NULL; 5609#undef STORECHAR 5610} 5611 5612PyObject * 5613PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5614 Py_ssize_t size, 5615 const char *errors, 5616 int byteorder) 5617{ 5618 PyObject *result; 5619 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5620 if (tmp == NULL) 5621 return NULL; 5622 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5623 Py_DECREF(tmp); 5624 return result; 5625} 5626 5627PyObject * 5628PyUnicode_AsUTF16String(PyObject *unicode) 5629{ 5630 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5631} 5632 5633/* --- Unicode Escape Codec ----------------------------------------------- */ 5634 5635/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5636 if all the escapes in the string make it still a valid ASCII string. 5637 Returns -1 if any escapes were found which cause the string to 5638 pop out of ASCII range. Otherwise returns the length of the 5639 required buffer to hold the string. 5640 */ 5641static Py_ssize_t 5642length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5643{ 5644 const unsigned char *p = (const unsigned char *)s; 5645 const unsigned char *end = p + size; 5646 Py_ssize_t length = 0; 5647 5648 if (size < 0) 5649 return -1; 5650 5651 for (; p < end; ++p) { 5652 if (*p > 127) { 5653 /* Non-ASCII */ 5654 return -1; 5655 } 5656 else if (*p != '\\') { 5657 /* Normal character */ 5658 ++length; 5659 } 5660 else { 5661 /* Backslash-escape, check next char */ 5662 ++p; 5663 /* Escape sequence reaches till end of string or 5664 non-ASCII follow-up. 
*/ 5665 if (p >= end || *p > 127) 5666 return -1; 5667 switch (*p) { 5668 case '\n': 5669 /* backslash + \n result in zero characters */ 5670 break; 5671 case '\\': case '\'': case '\"': 5672 case 'b': case 'f': case 't': 5673 case 'n': case 'r': case 'v': case 'a': 5674 ++length; 5675 break; 5676 case '0': case '1': case '2': case '3': 5677 case '4': case '5': case '6': case '7': 5678 case 'x': case 'u': case 'U': case 'N': 5679 /* these do not guarantee ASCII characters */ 5680 return -1; 5681 default: 5682 /* count the backslash + the other character */ 5683 length += 2; 5684 } 5685 } 5686 } 5687 return length; 5688} 5689 5690static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5691 5692PyObject * 5693PyUnicode_DecodeUnicodeEscape(const char *s, 5694 Py_ssize_t size, 5695 const char *errors) 5696{ 5697 const char *starts = s; 5698 Py_ssize_t startinpos; 5699 Py_ssize_t endinpos; 5700 _PyUnicodeWriter writer; 5701 const char *end; 5702 char* message; 5703 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5704 PyObject *errorHandler = NULL; 5705 PyObject *exc = NULL; 5706 Py_ssize_t len; 5707 5708 len = length_of_escaped_ascii_string(s, size); 5709 if (len == 0) 5710 _Py_RETURN_UNICODE_EMPTY(); 5711 5712 /* After length_of_escaped_ascii_string() there are two alternatives, 5713 either the string is pure ASCII with named escapes like \n, etc. 5714 and we determined it's exact size (common case) 5715 or it contains \x, \u, ... escape sequences. then we create a 5716 legacy wchar string and resize it at the end of this function. */ 5717 _PyUnicodeWriter_Init(&writer); 5718 if (len > 0) { 5719 writer.min_length = len; 5720 } 5721 else { 5722 /* Escaped strings will always be longer than the resulting 5723 Unicode string, so we start with size here and then reduce the 5724 length after conversion to the true value. 
5725 (but if the error callback returns a long replacement string 5726 we'll have to allocate more space) */ 5727 writer.min_length = size; 5728 } 5729 5730 if (size == 0) 5731 return _PyUnicodeWriter_Finish(&writer); 5732 end = s + size; 5733 5734 while (s < end) { 5735 unsigned char c; 5736 Py_UCS4 x; 5737 int digits; 5738 5739 /* Non-escape characters are interpreted as Unicode ordinals */ 5740 if (*s != '\\') { 5741 x = (unsigned char)*s; 5742 s++; 5743 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 5744 goto onError; 5745 continue; 5746 } 5747 5748 startinpos = s-starts; 5749 /* \ - Escapes */ 5750 s++; 5751 c = *s++; 5752 if (s > end) 5753 c = '\0'; /* Invalid after \ */ 5754 5755 switch (c) { 5756 5757 /* \x escapes */ 5758#define WRITECHAR(ch) \ 5759 do { \ 5760 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \ 5761 goto onError; \ 5762 } while(0) 5763 5764 case '\n': break; 5765 case '\\': WRITECHAR('\\'); break; 5766 case '\'': WRITECHAR('\''); break; 5767 case '\"': WRITECHAR('\"'); break; 5768 case 'b': WRITECHAR('\b'); break; 5769 /* FF */ 5770 case 'f': WRITECHAR('\014'); break; 5771 case 't': WRITECHAR('\t'); break; 5772 case 'n': WRITECHAR('\n'); break; 5773 case 'r': WRITECHAR('\r'); break; 5774 /* VT */ 5775 case 'v': WRITECHAR('\013'); break; 5776 /* BEL, not classic C */ 5777 case 'a': WRITECHAR('\007'); break; 5778 5779 /* \OOO (octal) escapes */ 5780 case '0': case '1': case '2': case '3': 5781 case '4': case '5': case '6': case '7': 5782 x = s[-1] - '0'; 5783 if (s < end && '0' <= *s && *s <= '7') { 5784 x = (x<<3) + *s++ - '0'; 5785 if (s < end && '0' <= *s && *s <= '7') 5786 x = (x<<3) + *s++ - '0'; 5787 } 5788 WRITECHAR(x); 5789 break; 5790 5791 /* hex escapes */ 5792 /* \xXX */ 5793 case 'x': 5794 digits = 2; 5795 message = "truncated \\xXX escape"; 5796 goto hexescape; 5797 5798 /* \uXXXX */ 5799 case 'u': 5800 digits = 4; 5801 message = "truncated \\uXXXX escape"; 5802 goto hexescape; 5803 5804 /* \UXXXXXXXX */ 5805 case 
'U': 5806 digits = 8; 5807 message = "truncated \\UXXXXXXXX escape"; 5808 hexescape: 5809 chr = 0; 5810 if (end - s < digits) { 5811 /* count only hex digits */ 5812 for (; s < end; ++s) { 5813 c = (unsigned char)*s; 5814 if (!Py_ISXDIGIT(c)) 5815 goto error; 5816 } 5817 goto error; 5818 } 5819 for (; digits--; ++s) { 5820 c = (unsigned char)*s; 5821 if (!Py_ISXDIGIT(c)) 5822 goto error; 5823 chr = (chr<<4) & ~0xF; 5824 if (c >= '0' && c <= '9') 5825 chr += c - '0'; 5826 else if (c >= 'a' && c <= 'f') 5827 chr += 10 + c - 'a'; 5828 else 5829 chr += 10 + c - 'A'; 5830 } 5831 if (chr == 0xffffffff && PyErr_Occurred()) 5832 /* _decoding_error will have already written into the 5833 target buffer. */ 5834 break; 5835 store: 5836 /* when we get here, chr is a 32-bit unicode character */ 5837 message = "illegal Unicode character"; 5838 if (chr > MAX_UNICODE) 5839 goto error; 5840 WRITECHAR(chr); 5841 break; 5842 5843 /* \N{name} */ 5844 case 'N': 5845 message = "malformed \\N character escape"; 5846 if (ucnhash_CAPI == NULL) { 5847 /* load the unicode data module */ 5848 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5849 PyUnicodeData_CAPSULE_NAME, 1); 5850 if (ucnhash_CAPI == NULL) 5851 goto ucnhashError; 5852 } 5853 if (*s == '{') { 5854 const char *start = s+1; 5855 /* look for the closing brace */ 5856 while (*s != '}' && s < end) 5857 s++; 5858 if (s > start && s < end && *s == '}') { 5859 /* found a name. 
look it up in the unicode database */ 5860 message = "unknown Unicode character name"; 5861 s++; 5862 if (s - start - 1 <= INT_MAX && 5863 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5864 &chr, 0)) 5865 goto store; 5866 } 5867 } 5868 goto error; 5869 5870 default: 5871 if (s > end) { 5872 message = "\\ at end of string"; 5873 s--; 5874 goto error; 5875 } 5876 else { 5877 WRITECHAR('\\'); 5878 WRITECHAR((unsigned char)s[-1]); 5879 } 5880 break; 5881 } 5882 continue; 5883 5884 error: 5885 endinpos = s-starts; 5886 if (unicode_decode_call_errorhandler_writer( 5887 errors, &errorHandler, 5888 "unicodeescape", message, 5889 &starts, &end, &startinpos, &endinpos, &exc, &s, 5890 &writer)) 5891 goto onError; 5892 continue; 5893 } 5894#undef WRITECHAR 5895 5896 Py_XDECREF(errorHandler); 5897 Py_XDECREF(exc); 5898 return _PyUnicodeWriter_Finish(&writer); 5899 5900 ucnhashError: 5901 PyErr_SetString( 5902 PyExc_UnicodeError, 5903 "\\N escapes not supported (can't load unicodedata module)" 5904 ); 5905 _PyUnicodeWriter_Dealloc(&writer); 5906 Py_XDECREF(errorHandler); 5907 Py_XDECREF(exc); 5908 return NULL; 5909 5910 onError: 5911 _PyUnicodeWriter_Dealloc(&writer); 5912 Py_XDECREF(errorHandler); 5913 Py_XDECREF(exc); 5914 return NULL; 5915} 5916 5917/* Return a Unicode-Escape string version of the Unicode object. 5918 5919 If quotes is true, the string is enclosed in u"" or u'' quotes as 5920 appropriate. 5921 5922*/ 5923 5924PyObject * 5925PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5926{ 5927 Py_ssize_t i, len; 5928 PyObject *repr; 5929 char *p; 5930 int kind; 5931 void *data; 5932 Py_ssize_t expandsize = 0; 5933 5934 /* Initial allocation is based on the longest-possible character 5935 escape. 5936 5937 For UCS1 strings it's '\xxx', 4 bytes per source character. 5938 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 5939 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 
5940 */ 5941 5942 if (!PyUnicode_Check(unicode)) { 5943 PyErr_BadArgument(); 5944 return NULL; 5945 } 5946 if (PyUnicode_READY(unicode) == -1) 5947 return NULL; 5948 len = PyUnicode_GET_LENGTH(unicode); 5949 kind = PyUnicode_KIND(unicode); 5950 data = PyUnicode_DATA(unicode); 5951 switch (kind) { 5952 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5953 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5954 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5955 } 5956 5957 if (len == 0) 5958 return PyBytes_FromStringAndSize(NULL, 0); 5959 5960 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5961 return PyErr_NoMemory(); 5962 5963 repr = PyBytes_FromStringAndSize(NULL, 5964 2 5965 + expandsize*len 5966 + 1); 5967 if (repr == NULL) 5968 return NULL; 5969 5970 p = PyBytes_AS_STRING(repr); 5971 5972 for (i = 0; i < len; i++) { 5973 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5974 5975 /* Escape backslashes */ 5976 if (ch == '\\') { 5977 *p++ = '\\'; 5978 *p++ = (char) ch; 5979 continue; 5980 } 5981 5982 /* Map 21-bit characters to '\U00xxxxxx' */ 5983 else if (ch >= 0x10000) { 5984 assert(ch <= MAX_UNICODE); 5985 *p++ = '\\'; 5986 *p++ = 'U'; 5987 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5988 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5989 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5990 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5991 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5992 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5993 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5994 *p++ = Py_hexdigits[ch & 0x0000000F]; 5995 continue; 5996 } 5997 5998 /* Map 16-bit characters to '\uxxxx' */ 5999 if (ch >= 256) { 6000 *p++ = '\\'; 6001 *p++ = 'u'; 6002 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 6003 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 6004 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6005 *p++ = Py_hexdigits[ch & 0x000F]; 6006 } 6007 6008 /* Map special whitespace to '\t', \n', '\r' */ 6009 else if (ch == '\t') { 6010 *p++ = '\\'; 6011 *p++ = 't'; 6012 
} 6013 else if (ch == '\n') { 6014 *p++ = '\\'; 6015 *p++ = 'n'; 6016 } 6017 else if (ch == '\r') { 6018 *p++ = '\\'; 6019 *p++ = 'r'; 6020 } 6021 6022 /* Map non-printable US ASCII to '\xhh' */ 6023 else if (ch < ' ' || ch >= 0x7F) { 6024 *p++ = '\\'; 6025 *p++ = 'x'; 6026 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6027 *p++ = Py_hexdigits[ch & 0x000F]; 6028 } 6029 6030 /* Copy everything else as-is */ 6031 else 6032 *p++ = (char) ch; 6033 } 6034 6035 assert(p - PyBytes_AS_STRING(repr) > 0); 6036 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 6037 return NULL; 6038 return repr; 6039} 6040

/* Encode the `size` Py_UNICODE units at `s` by wrapping them in a temporary
   str object and handing that to PyUnicode_AsUnicodeEscapeString().
   Returns whatever that call returns, or NULL if the temporary object
   could not be created. */
PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
                              Py_ssize_t size)
{
    PyObject *result;
    PyObject *tmp = PyUnicode_FromUnicode(s, size);
    if (tmp == NULL)
        return NULL;
    result = PyUnicode_AsUnicodeEscapeString(tmp);
    Py_DECREF(tmp);
    return result;
}

/* --- Raw Unicode Escape Codec ------------------------------------------- */

/* Decode `size` bytes at `s` using the raw-unicode-escape rules:
   every byte maps to the code point of the same value, except that
   \uXXXX (4 hex digits) and \UXXXXXXXX (8 hex digits) are decoded when
   they are preceded by an odd number of backslashes.

   Malformed or out-of-range escapes are reported through the decoding
   error handler selected by `errors`.

   Returns the object produced by _PyUnicodeWriter_Finish(), or NULL on
   failure. */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const char *end;
    const char *bs;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (But decoding error handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    end = s + size;
    while (s < end) {
        unsigned char c;
        Py_UCS4 x;
        int i;
        int count;

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (*s != '\\') {
            x = (unsigned char)*s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
                goto onError;
            continue;
        }
        startinpos = s-starts;

        /* \u-escapes are only interpreted iff the number of leading
           backslashes is odd.  All backslashes are copied to the output
           first; the last one is taken back below if an escape follows. */
        bs = s;
        for (;s < end;) {
            if (*s != '\\')
                break;
            x = (unsigned char)*s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
                goto onError;
        }
        if (((s - bs) & 1) == 0 ||
            s >= end ||
            (*s != 'u' && *s != 'U')) {
            continue;
        }
        /* Drop the backslash that introduces the escape. */
        writer.pos--;
        count = *s=='u' ? 4 : 8;
        s++;

        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
        for (x = 0, i = 0; i < count; ++i, ++s) {
            /* Never read at or beyond `end`: the caller's buffer is only
               guaranteed to hold `size` bytes.  Running out of input is
               handled exactly like meeting a non-hex byte (0 is not a hex
               digit). */
            c = (s < end) ? (unsigned char)*s : 0;
            if (!Py_ISXDIGIT(c)) {
                endinpos = s-starts;
                if (unicode_decode_call_errorhandler_writer(
                        errors, &errorHandler,
                        "rawunicodeescape", "truncated \\uXXXX",
                        &starts, &end, &startinpos, &endinpos, &exc, &s,
                        &writer))
                    goto onError;
                goto nextByte;
            }
            x = (x<<4) & ~0xF;
            if (c >= '0' && c <= '9')
                x += c - '0';
            else if (c >= 'a' && c <= 'f')
                x += 10 + c - 'a';
            else
                x += 10 + c - 'A';
        }
        if (x <= MAX_UNICODE) {
            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
                goto onError;
        }
        else {
            endinpos = s-starts;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
      nextByte:
        ;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}


6164PyObject * 6165PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6166{ 6167 PyObject *repr; 6168 char *p; 6169 char *q; 6170 Py_ssize_t expandsize, pos; 6171 int kind; 6172 void *data; 6173 Py_ssize_t len; 6174 6175 if (!PyUnicode_Check(unicode)) { 6176 PyErr_BadArgument(); 6177 return NULL; 6178 } 6179 if (PyUnicode_READY(unicode) == -1) 6180 return NULL; 6181 kind = PyUnicode_KIND(unicode); 6182 data = PyUnicode_DATA(unicode); 6183 len = PyUnicode_GET_LENGTH(unicode); 6184 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6185 bytes, and 1 byte characters 4. */ 6186 expandsize = kind * 2 + 2; 6187 6188 if (len > PY_SSIZE_T_MAX / expandsize) 6189 return PyErr_NoMemory(); 6190 6191 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6192 if (repr == NULL) 6193 return NULL; 6194 if (len == 0) 6195 return repr; 6196 6197 p = q = PyBytes_AS_STRING(repr); 6198 for (pos = 0; pos < len; pos++) { 6199 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6200 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6201 if (ch >= 0x10000) { 6202 assert(ch <= MAX_UNICODE); 6203 *p++ = '\\'; 6204 *p++ = 'U'; 6205 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6206 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6207 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6208 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6209 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6210 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6211 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6212 *p++ = Py_hexdigits[ch & 15]; 6213 } 6214 /* Map 16-bit characters to '\uxxxx' */ 6215 else if (ch >= 256) { 6216 *p++ = '\\'; 6217 *p++ = 'u'; 6218 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6219 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6220 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6221 *p++ = Py_hexdigits[ch & 15]; 6222 } 6223 /* Copy everything else
as-is */ 6224 else 6225 *p++ = (char) ch; 6226 } 6227 6228 assert(p > q); 6229 if (_PyBytes_Resize(&repr, p - q) < 0) 6230 return NULL; 6231 return repr; 6232} 6233 6234PyObject * 6235PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6236 Py_ssize_t size) 6237{ 6238 PyObject *result; 6239 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6240 if (tmp == NULL) 6241 return NULL; 6242 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6243 Py_DECREF(tmp); 6244 return result; 6245} 6246 6247/* --- Unicode Internal Codec ------------------------------------------- */ 6248 6249PyObject * 6250_PyUnicode_DecodeUnicodeInternal(const char *s, 6251 Py_ssize_t size, 6252 const char *errors) 6253{ 6254 const char *starts = s; 6255 Py_ssize_t startinpos; 6256 Py_ssize_t endinpos; 6257 _PyUnicodeWriter writer; 6258 const char *end; 6259 const char *reason; 6260 PyObject *errorHandler = NULL; 6261 PyObject *exc = NULL; 6262 6263 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6264 "unicode_internal codec has been deprecated", 6265 1)) 6266 return NULL; 6267 6268 if (size == 0) 6269 _Py_RETURN_UNICODE_EMPTY(); 6270 6271 _PyUnicodeWriter_Init(&writer); 6272 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { 6273 PyErr_NoMemory(); 6274 goto onError; 6275 } 6276 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; 6277 6278 end = s + size; 6279 while (s < end) { 6280 Py_UNICODE uch; 6281 Py_UCS4 ch; 6282 if (end - s < Py_UNICODE_SIZE) { 6283 endinpos = end-starts; 6284 reason = "truncated input"; 6285 goto error; 6286 } 6287 /* We copy the raw representation one byte at a time because the 6288 pointer may be unaligned (see test_codeccallbacks). */ 6289 ((char *) &uch)[0] = s[0]; 6290 ((char *) &uch)[1] = s[1]; 6291#ifdef Py_UNICODE_WIDE 6292 ((char *) &uch)[2] = s[2]; 6293 ((char *) &uch)[3] = s[3]; 6294#endif 6295 ch = uch; 6296#ifdef Py_UNICODE_WIDE 6297 /* We have to sanity check the raw data, otherwise doom looms for 6298 some malformed UCS-4 data. 
*/ 6299 if (ch > 0x10ffff) { 6300 endinpos = s - starts + Py_UNICODE_SIZE; 6301 reason = "illegal code point (> 0x10FFFF)"; 6302 goto error; 6303 } 6304#endif 6305 s += Py_UNICODE_SIZE; 6306#ifndef Py_UNICODE_WIDE 6307 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE) 6308 { 6309 Py_UNICODE uch2; 6310 ((char *) &uch2)[0] = s[0]; 6311 ((char *) &uch2)[1] = s[1]; 6312 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6313 { 6314 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6315 s += Py_UNICODE_SIZE; 6316 } 6317 } 6318#endif 6319 6320 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 6321 goto onError; 6322 continue; 6323 6324 error: 6325 startinpos = s - starts; 6326 if (unicode_decode_call_errorhandler_writer( 6327 errors, &errorHandler, 6328 "unicode_internal", reason, 6329 &starts, &end, &startinpos, &endinpos, &exc, &s, 6330 &writer)) 6331 goto onError; 6332 } 6333 6334 Py_XDECREF(errorHandler); 6335 Py_XDECREF(exc); 6336 return _PyUnicodeWriter_Finish(&writer); 6337 6338 onError: 6339 _PyUnicodeWriter_Dealloc(&writer); 6340 Py_XDECREF(errorHandler); 6341 Py_XDECREF(exc); 6342 return NULL; 6343} 6344 6345/* --- Latin-1 Codec ------------------------------------------------------ */ 6346 6347PyObject * 6348PyUnicode_DecodeLatin1(const char *s, 6349 Py_ssize_t size, 6350 const char *errors) 6351{ 6352 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6353 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6354} 6355 6356/* create or adjust a UnicodeEncodeError */ 6357static void 6358make_encode_exception(PyObject **exceptionObject, 6359 const char *encoding, 6360 PyObject *unicode, 6361 Py_ssize_t startpos, Py_ssize_t endpos, 6362 const char *reason) 6363{ 6364 if (*exceptionObject == NULL) { 6365 *exceptionObject = PyObject_CallFunction( 6366 PyExc_UnicodeEncodeError, "sOnns", 6367 encoding, unicode, startpos, endpos, reason); 6368 } 6369 else { 6370 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6371 goto onError; 6372 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6373 goto onError; 6374 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6375 goto onError; 6376 return; 6377 onError: 6378 Py_CLEAR(*exceptionObject); 6379 } 6380} 6381 6382/* raises a UnicodeEncodeError */ 6383static void 6384raise_encode_exception(PyObject **exceptionObject, 6385 const char *encoding, 6386 PyObject *unicode, 6387 Py_ssize_t startpos, Py_ssize_t endpos, 6388 const char *reason) 6389{ 6390 make_encode_exception(exceptionObject, 6391 encoding, unicode, startpos, endpos, reason); 6392 if (*exceptionObject != NULL) 6393 PyCodec_StrictErrors(*exceptionObject); 6394} 6395 6396/* error handling callback helper: 6397 build arguments, call the callback and check the arguments, 6398 put the result into newpos and return the replacement string, which 6399 has to be freed by the caller */ 6400static PyObject * 6401unicode_encode_call_errorhandler(const char *errors, 6402 PyObject **errorHandler, 6403 const char *encoding, const char *reason, 6404 PyObject *unicode, PyObject **exceptionObject, 6405 Py_ssize_t startpos, Py_ssize_t endpos, 6406 Py_ssize_t *newpos) 6407{ 6408 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6409 Py_ssize_t len; 6410 PyObject *restuple; 6411 PyObject *resunicode; 6412 6413 if (*errorHandler == NULL) { 6414 *errorHandler = 
PyCodec_LookupError(errors); 6415 if (*errorHandler == NULL) 6416 return NULL; 6417 } 6418 6419 if (PyUnicode_READY(unicode) == -1) 6420 return NULL; 6421 len = PyUnicode_GET_LENGTH(unicode); 6422 6423 make_encode_exception(exceptionObject, 6424 encoding, unicode, startpos, endpos, reason); 6425 if (*exceptionObject == NULL) 6426 return NULL; 6427 6428 restuple = PyObject_CallFunctionObjArgs( 6429 *errorHandler, *exceptionObject, NULL); 6430 if (restuple == NULL) 6431 return NULL; 6432 if (!PyTuple_Check(restuple)) { 6433 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6434 Py_DECREF(restuple); 6435 return NULL; 6436 } 6437 if (!PyArg_ParseTuple(restuple, argparse, 6438 &resunicode, newpos)) { 6439 Py_DECREF(restuple); 6440 return NULL; 6441 } 6442 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6443 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6444 Py_DECREF(restuple); 6445 return NULL; 6446 } 6447 if (*newpos<0) 6448 *newpos = len + *newpos; 6449 if (*newpos<0 || *newpos>len) { 6450 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6451 Py_DECREF(restuple); 6452 return NULL; 6453 } 6454 Py_INCREF(resunicode); 6455 Py_DECREF(restuple); 6456 return resunicode; 6457} 6458 6459static PyObject * 6460unicode_encode_ucs1(PyObject *unicode, 6461 const char *errors, 6462 unsigned int limit) 6463{ 6464 /* input state */ 6465 Py_ssize_t pos=0, size; 6466 int kind; 6467 void *data; 6468 /* output object */ 6469 PyObject *res; 6470 /* pointer into the output */ 6471 char *str; 6472 /* current output position */ 6473 Py_ssize_t ressize; 6474 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6475 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6476 PyObject *errorHandler = NULL; 6477 PyObject *exc = NULL; 6478 /* the following variable is used for caching string comparisons 6479 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6480 int known_errorHandler = -1; 6481 6482 if (PyUnicode_READY(unicode) == -1) 6483 return NULL; 6484 size = PyUnicode_GET_LENGTH(unicode); 6485 kind = PyUnicode_KIND(unicode); 6486 data = PyUnicode_DATA(unicode); 6487 /* allocate enough for a simple encoding without 6488 replacements, if we need more, we'll resize */ 6489 if (size == 0) 6490 return PyBytes_FromStringAndSize(NULL, 0); 6491 res = PyBytes_FromStringAndSize(NULL, size); 6492 if (res == NULL) 6493 return NULL; 6494 str = PyBytes_AS_STRING(res); 6495 ressize = size; 6496 6497 while (pos < size) { 6498 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6499 6500 /* can we encode this? */ 6501 if (c<limit) { 6502 /* no overflow check, because we know that the space is enough */ 6503 *str++ = (char)c; 6504 ++pos; 6505 } 6506 else { 6507 Py_ssize_t requiredsize; 6508 PyObject *repunicode; 6509 Py_ssize_t repsize, newpos, respos, i; 6510 /* startpos for collecting unencodable chars */ 6511 Py_ssize_t collstart = pos; 6512 Py_ssize_t collend = pos; 6513 /* find all unecodable characters */ 6514 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) 6515 ++collend; 6516 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6517 if (known_errorHandler==-1) { 6518 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6519 known_errorHandler = 1; 6520 else if (!strcmp(errors, "replace")) 6521 known_errorHandler = 2; 6522 else if (!strcmp(errors, "ignore")) 6523 known_errorHandler = 3; 6524 else if (!strcmp(errors, "xmlcharrefreplace")) 6525 known_errorHandler = 4; 6526 else 6527 known_errorHandler = 0; 6528 } 6529 switch (known_errorHandler) { 6530 case 1: /* strict */ 6531 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6532 goto onError; 6533 case 2: /* replace */ 6534 while (collstart++ < collend) 6535 *str++ = '?'; /* fall through */ 6536 case 3: /* ignore */ 6537 pos = collend; 6538 break; 6539 case 4: /* xmlcharrefreplace */ 6540 respos = str - PyBytes_AS_STRING(res); 6541 requiredsize = respos; 6542 /* determine replacement size */ 6543 for (i = collstart; i < collend; ++i) { 6544 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6545 Py_ssize_t incr; 6546 if (ch < 10) 6547 incr = 2+1+1; 6548 else if (ch < 100) 6549 incr = 2+2+1; 6550 else if (ch < 1000) 6551 incr = 2+3+1; 6552 else if (ch < 10000) 6553 incr = 2+4+1; 6554 else if (ch < 100000) 6555 incr = 2+5+1; 6556 else if (ch < 1000000) 6557 incr = 2+6+1; 6558 else { 6559 assert(ch <= MAX_UNICODE); 6560 incr = 2+7+1; 6561 } 6562 if (requiredsize > PY_SSIZE_T_MAX - incr) 6563 goto overflow; 6564 requiredsize += incr; 6565 } 6566 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6567 goto overflow; 6568 requiredsize += size - collend; 6569 if (requiredsize > ressize) { 6570 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6571 requiredsize = 2*ressize; 6572 if (_PyBytes_Resize(&res, requiredsize)) 6573 goto onError; 6574 str = PyBytes_AS_STRING(res) + respos; 6575 ressize = requiredsize; 6576 } 6577 /* generate replacement */ 6578 for (i = collstart; i < collend; ++i) { 6579 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6580 } 6581 pos = collend; 6582 
break; 6583 default: 6584 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6585 encoding, reason, unicode, &exc, 6586 collstart, collend, &newpos); 6587 if (repunicode == NULL || (PyUnicode_Check(repunicode) && 6588 PyUnicode_READY(repunicode) == -1)) 6589 goto onError; 6590 if (PyBytes_Check(repunicode)) { 6591 /* Directly copy bytes result to output. */ 6592 repsize = PyBytes_Size(repunicode); 6593 if (repsize > 1) { 6594 /* Make room for all additional bytes. */ 6595 respos = str - PyBytes_AS_STRING(res); 6596 if (ressize > PY_SSIZE_T_MAX - repsize - 1) { 6597 Py_DECREF(repunicode); 6598 goto overflow; 6599 } 6600 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6601 Py_DECREF(repunicode); 6602 goto onError; 6603 } 6604 str = PyBytes_AS_STRING(res) + respos; 6605 ressize += repsize-1; 6606 } 6607 memcpy(str, PyBytes_AsString(repunicode), repsize); 6608 str += repsize; 6609 pos = newpos; 6610 Py_DECREF(repunicode); 6611 break; 6612 } 6613 /* need more space? (at least enough for what we 6614 have+the replacement+the rest of the string, so 6615 we won't have to check space for encodable characters) */ 6616 respos = str - PyBytes_AS_STRING(res); 6617 repsize = PyUnicode_GET_LENGTH(repunicode); 6618 requiredsize = respos; 6619 if (requiredsize > PY_SSIZE_T_MAX - repsize) 6620 goto overflow; 6621 requiredsize += repsize; 6622 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6623 goto overflow; 6624 requiredsize += size - collend; 6625 if (requiredsize > ressize) { 6626 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6627 requiredsize = 2*ressize; 6628 if (_PyBytes_Resize(&res, requiredsize)) { 6629 Py_DECREF(repunicode); 6630 goto onError; 6631 } 6632 str = PyBytes_AS_STRING(res) + respos; 6633 ressize = requiredsize; 6634 } 6635 /* check if there is anything unencodable in the replacement 6636 and copy it to the output */ 6637 for (i = 0; repsize-->0; ++i, ++str) { 6638 c = PyUnicode_READ_CHAR(repunicode, i); 6639 if (c >= limit) 
{ 6640 raise_encode_exception(&exc, encoding, unicode, 6641 pos, pos+1, reason); 6642 Py_DECREF(repunicode); 6643 goto onError; 6644 } 6645 *str = (char)c; 6646 } 6647 pos = newpos; 6648 Py_DECREF(repunicode); 6649 } 6650 } 6651 } 6652 /* Resize if we allocated to much */ 6653 size = str - PyBytes_AS_STRING(res); 6654 if (size < ressize) { /* If this falls res will be NULL */ 6655 assert(size >= 0); 6656 if (_PyBytes_Resize(&res, size) < 0) 6657 goto onError; 6658 } 6659 6660 Py_XDECREF(errorHandler); 6661 Py_XDECREF(exc); 6662 return res; 6663 6664 overflow: 6665 PyErr_SetString(PyExc_OverflowError, 6666 "encoded result is too long for a Python string"); 6667 6668 onError: 6669 Py_XDECREF(res); 6670 Py_XDECREF(errorHandler); 6671 Py_XDECREF(exc); 6672 return NULL; 6673} 6674 6675/* Deprecated */ 6676PyObject * 6677PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6678 Py_ssize_t size, 6679 const char *errors) 6680{ 6681 PyObject *result; 6682 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6683 if (unicode == NULL) 6684 return NULL; 6685 result = unicode_encode_ucs1(unicode, errors, 256); 6686 Py_DECREF(unicode); 6687 return result; 6688} 6689 6690PyObject * 6691_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6692{ 6693 if (!PyUnicode_Check(unicode)) { 6694 PyErr_BadArgument(); 6695 return NULL; 6696 } 6697 if (PyUnicode_READY(unicode) == -1) 6698 return NULL; 6699 /* Fast path: if it is a one-byte string, construct 6700 bytes object directly. */ 6701 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6702 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6703 PyUnicode_GET_LENGTH(unicode)); 6704 /* Non-Latin-1 characters present. Defer to above function to 6705 raise the exception. 
*/ 6706 return unicode_encode_ucs1(unicode, errors, 256); 6707} 6708 6709PyObject* 6710PyUnicode_AsLatin1String(PyObject *unicode) 6711{ 6712 return _PyUnicode_AsLatin1String(unicode, NULL); 6713} 6714 6715/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6716 6717PyObject * 6718PyUnicode_DecodeASCII(const char *s, 6719 Py_ssize_t size, 6720 const char *errors) 6721{ 6722 const char *starts = s; 6723 _PyUnicodeWriter writer; 6724 int kind; 6725 void *data; 6726 Py_ssize_t startinpos; 6727 Py_ssize_t endinpos; 6728 Py_ssize_t outpos; 6729 const char *e; 6730 PyObject *errorHandler = NULL; 6731 PyObject *exc = NULL; 6732 6733 if (size == 0) 6734 _Py_RETURN_UNICODE_EMPTY(); 6735 6736 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6737 if (size == 1 && (unsigned char)s[0] < 128) 6738 return get_latin1_char((unsigned char)s[0]); 6739 6740 _PyUnicodeWriter_Init(&writer); 6741 writer.min_length = size; 6742 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) 6743 return NULL; 6744 6745 e = s + size; 6746 data = writer.data; 6747 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6748 writer.pos = outpos; 6749 if (writer.pos == size) 6750 return _PyUnicodeWriter_Finish(&writer); 6751 6752 s += writer.pos; 6753 kind = writer.kind; 6754 while (s < e) { 6755 unsigned char c = (unsigned char)*s; 6756 if (c < 128) { 6757 PyUnicode_WRITE(kind, data, writer.pos, c); 6758 writer.pos++; 6759 ++s; 6760 } 6761 else { 6762 startinpos = s-starts; 6763 endinpos = startinpos + 1; 6764 if (unicode_decode_call_errorhandler_writer( 6765 errors, &errorHandler, 6766 "ascii", "ordinal not in range(128)", 6767 &starts, &e, &startinpos, &endinpos, &exc, &s, 6768 &writer)) 6769 goto onError; 6770 kind = writer.kind; 6771 data = writer.data; 6772 } 6773 } 6774 Py_XDECREF(errorHandler); 6775 Py_XDECREF(exc); 6776 return _PyUnicodeWriter_Finish(&writer); 6777 6778 onError: 6779 _PyUnicodeWriter_Dealloc(&writer); 6780 Py_XDECREF(errorHandler); 
6781 Py_XDECREF(exc); 6782 return NULL; 6783} 6784 6785/* Deprecated */ 6786PyObject * 6787PyUnicode_EncodeASCII(const Py_UNICODE *p, 6788 Py_ssize_t size, 6789 const char *errors) 6790{ 6791 PyObject *result; 6792 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6793 if (unicode == NULL) 6794 return NULL; 6795 result = unicode_encode_ucs1(unicode, errors, 128); 6796 Py_DECREF(unicode); 6797 return result; 6798} 6799 6800PyObject * 6801_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6802{ 6803 if (!PyUnicode_Check(unicode)) { 6804 PyErr_BadArgument(); 6805 return NULL; 6806 } 6807 if (PyUnicode_READY(unicode) == -1) 6808 return NULL; 6809 /* Fast path: if it is an ASCII-only string, construct bytes object 6810 directly. Else defer to above function to raise the exception. */ 6811 if (PyUnicode_IS_ASCII(unicode)) 6812 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6813 PyUnicode_GET_LENGTH(unicode)); 6814 return unicode_encode_ucs1(unicode, errors, 128); 6815} 6816 6817PyObject * 6818PyUnicode_AsASCIIString(PyObject *unicode) 6819{ 6820 return _PyUnicode_AsASCIIString(unicode, NULL); 6821} 6822 6823#ifdef HAVE_MBCS 6824 6825/* --- MBCS codecs for Windows -------------------------------------------- */ 6826 6827#if SIZEOF_INT < SIZEOF_SIZE_T 6828#define NEED_RETRY 6829#endif 6830 6831#ifndef WC_ERR_INVALID_CHARS 6832# define WC_ERR_INVALID_CHARS 0x0080 6833#endif 6834 6835static char* 6836code_page_name(UINT code_page, PyObject **obj) 6837{ 6838 *obj = NULL; 6839 if (code_page == CP_ACP) 6840 return "mbcs"; 6841 if (code_page == CP_UTF7) 6842 return "CP_UTF7"; 6843 if (code_page == CP_UTF8) 6844 return "CP_UTF8"; 6845 6846 *obj = PyBytes_FromFormat("cp%u", code_page); 6847 if (*obj == NULL) 6848 return NULL; 6849 return PyBytes_AS_STRING(*obj); 6850} 6851 6852static DWORD 6853decode_code_page_flags(UINT code_page) 6854{ 6855 if (code_page == CP_UTF7) { 6856 /* The CP_UTF7 decoder only supports flags=0 */ 6857 return 0; 6858 } 6859 else 
6860 return MB_ERR_INVALID_CHARS; 6861} 6862 6863/* 6864 * Decode a byte string from a Windows code page into unicode object in strict 6865 * mode. 6866 * 6867 * Returns consumed size if succeed, returns -2 on decode error, or raise an 6868 * OSError and returns -1 on other error. 6869 */ 6870static int 6871decode_code_page_strict(UINT code_page, 6872 PyObject **v, 6873 const char *in, 6874 int insize) 6875{ 6876 const DWORD flags = decode_code_page_flags(code_page); 6877 wchar_t *out; 6878 DWORD outsize; 6879 6880 /* First get the size of the result */ 6881 assert(insize > 0); 6882 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6883 if (outsize <= 0) 6884 goto error; 6885 6886 if (*v == NULL) { 6887 /* Create unicode object */ 6888 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6889 *v = (PyObject*)_PyUnicode_New(outsize); 6890 if (*v == NULL) 6891 return -1; 6892 out = PyUnicode_AS_UNICODE(*v); 6893 } 6894 else { 6895 /* Extend unicode object */ 6896 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6897 if (unicode_resize(v, n + outsize) < 0) 6898 return -1; 6899 out = PyUnicode_AS_UNICODE(*v) + n; 6900 } 6901 6902 /* Do the conversion */ 6903 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6904 if (outsize <= 0) 6905 goto error; 6906 return insize; 6907 6908error: 6909 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6910 return -2; 6911 PyErr_SetFromWindowsErr(0); 6912 return -1; 6913} 6914 6915/* 6916 * Decode a byte string from a code page into unicode object with an error 6917 * handler. 6918 * 6919 * Returns consumed size if succeed, or raise an OSError or 6920 * UnicodeDecodeError exception and returns -1 on error. 
6921 */ 6922static int 6923decode_code_page_errors(UINT code_page, 6924 PyObject **v, 6925 const char *in, const int size, 6926 const char *errors, int final) 6927{ 6928 const char *startin = in; 6929 const char *endin = in + size; 6930 const DWORD flags = decode_code_page_flags(code_page); 6931 /* Ideally, we should get reason from FormatMessage. This is the Windows 6932 2000 English version of the message. */ 6933 const char *reason = "No mapping for the Unicode character exists " 6934 "in the target code page."; 6935 /* each step cannot decode more than 1 character, but a character can be 6936 represented as a surrogate pair */ 6937 wchar_t buffer[2], *startout, *out; 6938 int insize; 6939 Py_ssize_t outsize; 6940 PyObject *errorHandler = NULL; 6941 PyObject *exc = NULL; 6942 PyObject *encoding_obj = NULL; 6943 char *encoding; 6944 DWORD err; 6945 int ret = -1; 6946 6947 assert(size > 0); 6948 6949 encoding = code_page_name(code_page, &encoding_obj); 6950 if (encoding == NULL) 6951 return -1; 6952 6953 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { 6954 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6955 UnicodeDecodeError. 
*/ 6956 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6957 if (exc != NULL) { 6958 PyCodec_StrictErrors(exc); 6959 Py_CLEAR(exc); 6960 } 6961 goto error; 6962 } 6963 6964 if (*v == NULL) { 6965 /* Create unicode object */ 6966 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6967 PyErr_NoMemory(); 6968 goto error; 6969 } 6970 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6971 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6972 if (*v == NULL) 6973 goto error; 6974 startout = PyUnicode_AS_UNICODE(*v); 6975 } 6976 else { 6977 /* Extend unicode object */ 6978 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6979 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6980 PyErr_NoMemory(); 6981 goto error; 6982 } 6983 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6984 goto error; 6985 startout = PyUnicode_AS_UNICODE(*v) + n; 6986 } 6987 6988 /* Decode the byte string character per character */ 6989 out = startout; 6990 while (in < endin) 6991 { 6992 /* Decode a character */ 6993 insize = 1; 6994 do 6995 { 6996 outsize = MultiByteToWideChar(code_page, flags, 6997 in, insize, 6998 buffer, Py_ARRAY_LENGTH(buffer)); 6999 if (outsize > 0) 7000 break; 7001 err = GetLastError(); 7002 if (err != ERROR_NO_UNICODE_TRANSLATION 7003 && err != ERROR_INSUFFICIENT_BUFFER) 7004 { 7005 PyErr_SetFromWindowsErr(0); 7006 goto error; 7007 } 7008 insize++; 7009 } 7010 /* 4=maximum length of a UTF-8 sequence */ 7011 while (insize <= 4 && (in + insize) <= endin); 7012 7013 if (outsize <= 0) { 7014 Py_ssize_t startinpos, endinpos, outpos; 7015 7016 /* last character in partial decode? 
*/ 7017 if (in + insize >= endin && !final) 7018 break; 7019 7020 startinpos = in - startin; 7021 endinpos = startinpos + 1; 7022 outpos = out - PyUnicode_AS_UNICODE(*v); 7023 if (unicode_decode_call_errorhandler_wchar( 7024 errors, &errorHandler, 7025 encoding, reason, 7026 &startin, &endin, &startinpos, &endinpos, &exc, &in, 7027 v, &outpos)) 7028 { 7029 goto error; 7030 } 7031 out = PyUnicode_AS_UNICODE(*v) + outpos; 7032 } 7033 else { 7034 in += insize; 7035 memcpy(out, buffer, outsize * sizeof(wchar_t)); 7036 out += outsize; 7037 } 7038 } 7039 7040 /* write a NUL character at the end */ 7041 *out = 0; 7042 7043 /* Extend unicode object */ 7044 outsize = out - startout; 7045 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7046 if (unicode_resize(v, outsize) < 0) 7047 goto error; 7048 /* (in - startin) <= size and size is an int */ 7049 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int); 7050 7051error: 7052 Py_XDECREF(encoding_obj); 7053 Py_XDECREF(errorHandler); 7054 Py_XDECREF(exc); 7055 return ret; 7056} 7057 7058static PyObject * 7059decode_code_page_stateful(int code_page, 7060 const char *s, Py_ssize_t size, 7061 const char *errors, Py_ssize_t *consumed) 7062{ 7063 PyObject *v = NULL; 7064 int chunk_size, final, converted, done; 7065 7066 if (code_page < 0) { 7067 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7068 return NULL; 7069 } 7070 7071 if (consumed) 7072 *consumed = 0; 7073 7074 do 7075 { 7076#ifdef NEED_RETRY 7077 if (size > INT_MAX) { 7078 chunk_size = INT_MAX; 7079 final = 0; 7080 done = 0; 7081 } 7082 else 7083#endif 7084 { 7085 chunk_size = (int)size; 7086 final = (consumed == NULL); 7087 done = 1; 7088 } 7089 7090 if (chunk_size == 0 && done) { 7091 if (v != NULL) 7092 break; 7093 _Py_RETURN_UNICODE_EMPTY(); 7094 } 7095 7096 converted = decode_code_page_strict(code_page, &v, 7097 s, chunk_size); 7098 if (converted == -2) 7099 converted = decode_code_page_errors(code_page, &v, 7100 s, chunk_size, 7101 errors, final); 7102 
assert(converted != 0 || done); 7103 7104 if (converted < 0) { 7105 Py_XDECREF(v); 7106 return NULL; 7107 } 7108 7109 if (consumed) 7110 *consumed += converted; 7111 7112 s += converted; 7113 size -= converted; 7114 } while (!done); 7115 7116 return unicode_result(v); 7117} 7118 7119PyObject * 7120PyUnicode_DecodeCodePageStateful(int code_page, 7121 const char *s, 7122 Py_ssize_t size, 7123 const char *errors, 7124 Py_ssize_t *consumed) 7125{ 7126 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7127} 7128 7129PyObject * 7130PyUnicode_DecodeMBCSStateful(const char *s, 7131 Py_ssize_t size, 7132 const char *errors, 7133 Py_ssize_t *consumed) 7134{ 7135 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7136} 7137 7138PyObject * 7139PyUnicode_DecodeMBCS(const char *s, 7140 Py_ssize_t size, 7141 const char *errors) 7142{ 7143 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7144} 7145 7146static DWORD 7147encode_code_page_flags(UINT code_page, const char *errors) 7148{ 7149 if (code_page == CP_UTF8) { 7150 if (winver.dwMajorVersion >= 6) 7151 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 7152 and later */ 7153 return WC_ERR_INVALID_CHARS; 7154 else 7155 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 7156 return 0; 7157 } 7158 else if (code_page == CP_UTF7) { 7159 /* CP_UTF7 only supports flags=0 */ 7160 return 0; 7161 } 7162 else { 7163 if (errors != NULL && strcmp(errors, "replace") == 0) 7164 return 0; 7165 else 7166 return WC_NO_BEST_FIT_CHARS; 7167 } 7168} 7169 7170/* 7171 * Encode a Unicode string to a Windows code page into a byte string in strict 7172 * mode. 7173 * 7174 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7175 * an OSError and returns -1 on other error. 
7176 */ 7177static int 7178encode_code_page_strict(UINT code_page, PyObject **outbytes, 7179 PyObject *unicode, Py_ssize_t offset, int len, 7180 const char* errors) 7181{ 7182 BOOL usedDefaultChar = FALSE; 7183 BOOL *pusedDefaultChar = &usedDefaultChar; 7184 int outsize; 7185 PyObject *exc = NULL; 7186 wchar_t *p; 7187 Py_ssize_t size; 7188 const DWORD flags = encode_code_page_flags(code_page, NULL); 7189 char *out; 7190 /* Create a substring so that we can get the UTF-16 representation 7191 of just the slice under consideration. */ 7192 PyObject *substring; 7193 7194 assert(len > 0); 7195 7196 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7197 pusedDefaultChar = &usedDefaultChar; 7198 else 7199 pusedDefaultChar = NULL; 7200 7201 substring = PyUnicode_Substring(unicode, offset, offset+len); 7202 if (substring == NULL) 7203 return -1; 7204 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7205 if (p == NULL) { 7206 Py_DECREF(substring); 7207 return -1; 7208 } 7209 assert(size <= INT_MAX); 7210 7211 /* First get the size of the result */ 7212 outsize = WideCharToMultiByte(code_page, flags, 7213 p, (int)size, 7214 NULL, 0, 7215 NULL, pusedDefaultChar); 7216 if (outsize <= 0) 7217 goto error; 7218 /* If we used a default char, then we failed! 
*/ 7219 if (pusedDefaultChar && *pusedDefaultChar) { 7220 Py_DECREF(substring); 7221 return -2; 7222 } 7223 7224 if (*outbytes == NULL) { 7225 /* Create string object */ 7226 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7227 if (*outbytes == NULL) { 7228 Py_DECREF(substring); 7229 return -1; 7230 } 7231 out = PyBytes_AS_STRING(*outbytes); 7232 } 7233 else { 7234 /* Extend string object */ 7235 const Py_ssize_t n = PyBytes_Size(*outbytes); 7236 if (outsize > PY_SSIZE_T_MAX - n) { 7237 PyErr_NoMemory(); 7238 Py_DECREF(substring); 7239 return -1; 7240 } 7241 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7242 Py_DECREF(substring); 7243 return -1; 7244 } 7245 out = PyBytes_AS_STRING(*outbytes) + n; 7246 } 7247 7248 /* Do the conversion */ 7249 outsize = WideCharToMultiByte(code_page, flags, 7250 p, (int)size, 7251 out, outsize, 7252 NULL, pusedDefaultChar); 7253 Py_CLEAR(substring); 7254 if (outsize <= 0) 7255 goto error; 7256 if (pusedDefaultChar && *pusedDefaultChar) 7257 return -2; 7258 return 0; 7259 7260error: 7261 Py_XDECREF(substring); 7262 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7263 return -2; 7264 PyErr_SetFromWindowsErr(0); 7265 return -1; 7266} 7267 7268/* 7269 * Encode a Unicode string to a Windows code page into a byte string using a 7270 * error handler. 7271 * 7272 * Returns consumed characters if succeed, or raise an OSError and returns 7273 * -1 on other error. 7274 */ 7275static int 7276encode_code_page_errors(UINT code_page, PyObject **outbytes, 7277 PyObject *unicode, Py_ssize_t unicode_offset, 7278 Py_ssize_t insize, const char* errors) 7279{ 7280 const DWORD flags = encode_code_page_flags(code_page, errors); 7281 Py_ssize_t pos = unicode_offset; 7282 Py_ssize_t endin = unicode_offset + insize; 7283 /* Ideally, we should get reason from FormatMessage. This is the Windows 7284 2000 English version of the message. 
*/ 7285 const char *reason = "invalid character"; 7286 /* 4=maximum length of a UTF-8 sequence */ 7287 char buffer[4]; 7288 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7289 Py_ssize_t outsize; 7290 char *out; 7291 PyObject *errorHandler = NULL; 7292 PyObject *exc = NULL; 7293 PyObject *encoding_obj = NULL; 7294 char *encoding; 7295 Py_ssize_t newpos, newoutsize; 7296 PyObject *rep; 7297 int ret = -1; 7298 7299 assert(insize > 0); 7300 7301 encoding = code_page_name(code_page, &encoding_obj); 7302 if (encoding == NULL) 7303 return -1; 7304 7305 if (errors == NULL || strcmp(errors, "strict") == 0) { 7306 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7307 then we raise a UnicodeEncodeError. */ 7308 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7309 if (exc != NULL) { 7310 PyCodec_StrictErrors(exc); 7311 Py_DECREF(exc); 7312 } 7313 Py_XDECREF(encoding_obj); 7314 return -1; 7315 } 7316 7317 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7318 pusedDefaultChar = &usedDefaultChar; 7319 else 7320 pusedDefaultChar = NULL; 7321 7322 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7323 PyErr_NoMemory(); 7324 goto error; 7325 } 7326 outsize = insize * Py_ARRAY_LENGTH(buffer); 7327 7328 if (*outbytes == NULL) { 7329 /* Create string object */ 7330 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7331 if (*outbytes == NULL) 7332 goto error; 7333 out = PyBytes_AS_STRING(*outbytes); 7334 } 7335 else { 7336 /* Extend string object */ 7337 Py_ssize_t n = PyBytes_Size(*outbytes); 7338 if (n > PY_SSIZE_T_MAX - outsize) { 7339 PyErr_NoMemory(); 7340 goto error; 7341 } 7342 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7343 goto error; 7344 out = PyBytes_AS_STRING(*outbytes) + n; 7345 } 7346 7347 /* Encode the string character per character */ 7348 while (pos < endin) 7349 { 7350 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7351 wchar_t chars[2]; 7352 int charsize; 7353 if (ch < 0x10000) { 7354 chars[0] = (wchar_t)ch; 7355 charsize = 1; 
7356 } 7357 else { 7358 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7359 chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7360 charsize = 2; 7361 } 7362 7363 outsize = WideCharToMultiByte(code_page, flags, 7364 chars, charsize, 7365 buffer, Py_ARRAY_LENGTH(buffer), 7366 NULL, pusedDefaultChar); 7367 if (outsize > 0) { 7368 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7369 { 7370 pos++; 7371 memcpy(out, buffer, outsize); 7372 out += outsize; 7373 continue; 7374 } 7375 } 7376 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7377 PyErr_SetFromWindowsErr(0); 7378 goto error; 7379 } 7380 7381 rep = unicode_encode_call_errorhandler( 7382 errors, &errorHandler, encoding, reason, 7383 unicode, &exc, 7384 pos, pos + 1, &newpos); 7385 if (rep == NULL) 7386 goto error; 7387 pos = newpos; 7388 7389 if (PyBytes_Check(rep)) { 7390 outsize = PyBytes_GET_SIZE(rep); 7391 if (outsize != 1) { 7392 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7393 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7394 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7395 Py_DECREF(rep); 7396 goto error; 7397 } 7398 out = PyBytes_AS_STRING(*outbytes) + offset; 7399 } 7400 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7401 out += outsize; 7402 } 7403 else { 7404 Py_ssize_t i; 7405 enum PyUnicode_Kind kind; 7406 void *data; 7407 7408 if (PyUnicode_READY(rep) == -1) { 7409 Py_DECREF(rep); 7410 goto error; 7411 } 7412 7413 outsize = PyUnicode_GET_LENGTH(rep); 7414 if (outsize != 1) { 7415 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7416 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7417 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7418 Py_DECREF(rep); 7419 goto error; 7420 } 7421 out = PyBytes_AS_STRING(*outbytes) + offset; 7422 } 7423 kind = PyUnicode_KIND(rep); 7424 data = PyUnicode_DATA(rep); 7425 for (i=0; i < outsize; i++) { 7426 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7427 if (ch > 127) { 7428 raise_encode_exception(&exc, 7429 encoding, 
unicode, 7430 pos, pos + 1, 7431 "unable to encode error handler result to ASCII"); 7432 Py_DECREF(rep); 7433 goto error; 7434 } 7435 *out = (unsigned char)ch; 7436 out++; 7437 } 7438 } 7439 Py_DECREF(rep); 7440 } 7441 /* write a NUL byte */ 7442 *out = 0; 7443 outsize = out - PyBytes_AS_STRING(*outbytes); 7444 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7445 if (_PyBytes_Resize(outbytes, outsize) < 0) 7446 goto error; 7447 ret = 0; 7448 7449error: 7450 Py_XDECREF(encoding_obj); 7451 Py_XDECREF(errorHandler); 7452 Py_XDECREF(exc); 7453 return ret; 7454} 7455 7456static PyObject * 7457encode_code_page(int code_page, 7458 PyObject *unicode, 7459 const char *errors) 7460{ 7461 Py_ssize_t len; 7462 PyObject *outbytes = NULL; 7463 Py_ssize_t offset; 7464 int chunk_len, ret, done; 7465 7466 if (PyUnicode_READY(unicode) == -1) 7467 return NULL; 7468 len = PyUnicode_GET_LENGTH(unicode); 7469 7470 if (code_page < 0) { 7471 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7472 return NULL; 7473 } 7474 7475 if (len == 0) 7476 return PyBytes_FromStringAndSize(NULL, 0); 7477 7478 offset = 0; 7479 do 7480 { 7481#ifdef NEED_RETRY 7482 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7483 chunks. 
*/ 7484 if (len > INT_MAX/2) { 7485 chunk_len = INT_MAX/2; 7486 done = 0; 7487 } 7488 else 7489#endif 7490 { 7491 chunk_len = (int)len; 7492 done = 1; 7493 } 7494 7495 ret = encode_code_page_strict(code_page, &outbytes, 7496 unicode, offset, chunk_len, 7497 errors); 7498 if (ret == -2) 7499 ret = encode_code_page_errors(code_page, &outbytes, 7500 unicode, offset, 7501 chunk_len, errors); 7502 if (ret < 0) { 7503 Py_XDECREF(outbytes); 7504 return NULL; 7505 } 7506 7507 offset += chunk_len; 7508 len -= chunk_len; 7509 } while (!done); 7510 7511 return outbytes; 7512} 7513 7514PyObject * 7515PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7516 Py_ssize_t size, 7517 const char *errors) 7518{ 7519 PyObject *unicode, *res; 7520 unicode = PyUnicode_FromUnicode(p, size); 7521 if (unicode == NULL) 7522 return NULL; 7523 res = encode_code_page(CP_ACP, unicode, errors); 7524 Py_DECREF(unicode); 7525 return res; 7526} 7527 7528PyObject * 7529PyUnicode_EncodeCodePage(int code_page, 7530 PyObject *unicode, 7531 const char *errors) 7532{ 7533 return encode_code_page(code_page, unicode, errors); 7534} 7535 7536PyObject * 7537PyUnicode_AsMBCSString(PyObject *unicode) 7538{ 7539 if (!PyUnicode_Check(unicode)) { 7540 PyErr_BadArgument(); 7541 return NULL; 7542 } 7543 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7544} 7545 7546#undef NEED_RETRY 7547 7548#endif /* HAVE_MBCS */ 7549 7550/* --- Character Mapping Codec -------------------------------------------- */ 7551 7552static int 7553charmap_decode_string(const char *s, 7554 Py_ssize_t size, 7555 PyObject *mapping, 7556 const char *errors, 7557 _PyUnicodeWriter *writer) 7558{ 7559 const char *starts = s; 7560 const char *e; 7561 Py_ssize_t startinpos, endinpos; 7562 PyObject *errorHandler = NULL, *exc = NULL; 7563 Py_ssize_t maplen; 7564 enum PyUnicode_Kind mapkind; 7565 void *mapdata; 7566 Py_UCS4 x; 7567 unsigned char ch; 7568 7569 if (PyUnicode_READY(mapping) == -1) 7570 return -1; 7571 7572 maplen = 
PyUnicode_GET_LENGTH(mapping); 7573 mapdata = PyUnicode_DATA(mapping); 7574 mapkind = PyUnicode_KIND(mapping); 7575 7576 e = s + size; 7577 7578 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { 7579 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 7580 * is disabled in encoding aliases, latin1 is preferred because 7581 * its implementation is faster. */ 7582 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata; 7583 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7584 Py_UCS4 maxchar = writer->maxchar; 7585 7586 assert (writer->kind == PyUnicode_1BYTE_KIND); 7587 while (s < e) { 7588 ch = *s; 7589 x = mapdata_ucs1[ch]; 7590 if (x > maxchar) { 7591 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) 7592 goto onError; 7593 maxchar = writer->maxchar; 7594 outdata = (Py_UCS1 *)writer->data; 7595 } 7596 outdata[writer->pos] = x; 7597 writer->pos++; 7598 ++s; 7599 } 7600 return 0; 7601 } 7602 7603 while (s < e) { 7604 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7605 enum PyUnicode_Kind outkind = writer->kind; 7606 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata; 7607 if (outkind == PyUnicode_1BYTE_KIND) { 7608 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7609 Py_UCS4 maxchar = writer->maxchar; 7610 while (s < e) { 7611 ch = *s; 7612 x = mapdata_ucs2[ch]; 7613 if (x > maxchar) 7614 goto Error; 7615 outdata[writer->pos] = x; 7616 writer->pos++; 7617 ++s; 7618 } 7619 break; 7620 } 7621 else if (outkind == PyUnicode_2BYTE_KIND) { 7622 Py_UCS2 *outdata = (Py_UCS2 *)writer->data; 7623 while (s < e) { 7624 ch = *s; 7625 x = mapdata_ucs2[ch]; 7626 if (x == 0xFFFE) 7627 goto Error; 7628 outdata[writer->pos] = x; 7629 writer->pos++; 7630 ++s; 7631 } 7632 break; 7633 } 7634 } 7635 ch = *s; 7636 7637 if (ch < maplen) 7638 x = PyUnicode_READ(mapkind, mapdata, ch); 7639 else 7640 x = 0xfffe; /* invalid value */ 7641Error: 7642 if (x == 0xfffe) 7643 { 7644 /* undefined mapping */ 7645 startinpos = s-starts; 7646 endinpos = startinpos+1; 7647 if 
(unicode_decode_call_errorhandler_writer( 7648 errors, &errorHandler, 7649 "charmap", "character maps to <undefined>", 7650 &starts, &e, &startinpos, &endinpos, &exc, &s, 7651 writer)) { 7652 goto onError; 7653 } 7654 continue; 7655 } 7656 7657 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0) 7658 goto onError; 7659 ++s; 7660 } 7661 Py_XDECREF(errorHandler); 7662 Py_XDECREF(exc); 7663 return 0; 7664 7665onError: 7666 Py_XDECREF(errorHandler); 7667 Py_XDECREF(exc); 7668 return -1; 7669} 7670 7671static int 7672charmap_decode_mapping(const char *s, 7673 Py_ssize_t size, 7674 PyObject *mapping, 7675 const char *errors, 7676 _PyUnicodeWriter *writer) 7677{ 7678 const char *starts = s; 7679 const char *e; 7680 Py_ssize_t startinpos, endinpos; 7681 PyObject *errorHandler = NULL, *exc = NULL; 7682 unsigned char ch; 7683 PyObject *key, *item = NULL; 7684 7685 e = s + size; 7686 7687 while (s < e) { 7688 ch = *s; 7689 7690 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7691 key = PyLong_FromLong((long)ch); 7692 if (key == NULL) 7693 goto onError; 7694 7695 item = PyObject_GetItem(mapping, key); 7696 Py_DECREF(key); 7697 if (item == NULL) { 7698 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7699 /* No mapping found means: mapping is undefined. 
*/ 7700 PyErr_Clear(); 7701 goto Undefined; 7702 } else 7703 goto onError; 7704 } 7705 7706 /* Apply mapping */ 7707 if (item == Py_None) 7708 goto Undefined; 7709 if (PyLong_Check(item)) { 7710 long value = PyLong_AS_LONG(item); 7711 if (value == 0xFFFE) 7712 goto Undefined; 7713 if (value < 0 || value > MAX_UNICODE) { 7714 PyErr_Format(PyExc_TypeError, 7715 "character mapping must be in range(0x%lx)", 7716 (unsigned long)MAX_UNICODE + 1); 7717 goto onError; 7718 } 7719 7720 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7721 goto onError; 7722 } 7723 else if (PyUnicode_Check(item)) { 7724 if (PyUnicode_READY(item) == -1) 7725 goto onError; 7726 if (PyUnicode_GET_LENGTH(item) == 1) { 7727 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 7728 if (value == 0xFFFE) 7729 goto Undefined; 7730 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7731 goto onError; 7732 } 7733 else { 7734 writer->overallocate = 1; 7735 if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 7736 goto onError; 7737 } 7738 } 7739 else { 7740 /* wrong return value */ 7741 PyErr_SetString(PyExc_TypeError, 7742 "character mapping must return integer, None or str"); 7743 goto onError; 7744 } 7745 Py_CLEAR(item); 7746 ++s; 7747 continue; 7748 7749Undefined: 7750 /* undefined mapping */ 7751 Py_CLEAR(item); 7752 startinpos = s-starts; 7753 endinpos = startinpos+1; 7754 if (unicode_decode_call_errorhandler_writer( 7755 errors, &errorHandler, 7756 "charmap", "character maps to <undefined>", 7757 &starts, &e, &startinpos, &endinpos, &exc, &s, 7758 writer)) { 7759 goto onError; 7760 } 7761 } 7762 Py_XDECREF(errorHandler); 7763 Py_XDECREF(exc); 7764 return 0; 7765 7766onError: 7767 Py_XDECREF(item); 7768 Py_XDECREF(errorHandler); 7769 Py_XDECREF(exc); 7770 return -1; 7771} 7772 7773PyObject * 7774PyUnicode_DecodeCharmap(const char *s, 7775 Py_ssize_t size, 7776 PyObject *mapping, 7777 const char *errors) 7778{ 7779 _PyUnicodeWriter writer; 7780 7781 /* Default to Latin-1 */ 7782 if (mapping == 
NULL) 7783 return PyUnicode_DecodeLatin1(s, size, errors); 7784 7785 if (size == 0) 7786 _Py_RETURN_UNICODE_EMPTY(); 7787 _PyUnicodeWriter_Init(&writer); 7788 writer.min_length = size; 7789 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 7790 goto onError; 7791 7792 if (PyUnicode_CheckExact(mapping)) { 7793 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 7794 goto onError; 7795 } 7796 else { 7797 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 7798 goto onError; 7799 } 7800 return _PyUnicodeWriter_Finish(&writer); 7801 7802 onError: 7803 _PyUnicodeWriter_Dealloc(&writer); 7804 return NULL; 7805} 7806 7807/* Charmap encoding: the lookup table */ 7808 7809struct encoding_map { 7810 PyObject_HEAD 7811 unsigned char level1[32]; 7812 int count2, count3; 7813 unsigned char level23[1]; 7814}; 7815 7816static PyObject* 7817encoding_map_size(PyObject *obj, PyObject* args) 7818{ 7819 struct encoding_map *map = (struct encoding_map*)obj; 7820 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7821 128*map->count3); 7822} 7823 7824static PyMethodDef encoding_map_methods[] = { 7825 {"size", encoding_map_size, METH_NOARGS, 7826 PyDoc_STR("Return the size (in bytes) of this object") }, 7827 { 0 } 7828}; 7829 7830static void 7831encoding_map_dealloc(PyObject* o) 7832{ 7833 PyObject_FREE(o); 7834} 7835 7836static PyTypeObject EncodingMapType = { 7837 PyVarObject_HEAD_INIT(NULL, 0) 7838 "EncodingMap", /*tp_name*/ 7839 sizeof(struct encoding_map), /*tp_basicsize*/ 7840 0, /*tp_itemsize*/ 7841 /* methods */ 7842 encoding_map_dealloc, /*tp_dealloc*/ 7843 0, /*tp_print*/ 7844 0, /*tp_getattr*/ 7845 0, /*tp_setattr*/ 7846 0, /*tp_reserved*/ 7847 0, /*tp_repr*/ 7848 0, /*tp_as_number*/ 7849 0, /*tp_as_sequence*/ 7850 0, /*tp_as_mapping*/ 7851 0, /*tp_hash*/ 7852 0, /*tp_call*/ 7853 0, /*tp_str*/ 7854 0, /*tp_getattro*/ 7855 0, /*tp_setattro*/ 7856 0, /*tp_as_buffer*/ 7857 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7858 0, /*tp_doc*/ 
7859 0, /*tp_traverse*/ 7860 0, /*tp_clear*/ 7861 0, /*tp_richcompare*/ 7862 0, /*tp_weaklistoffset*/ 7863 0, /*tp_iter*/ 7864 0, /*tp_iternext*/ 7865 encoding_map_methods, /*tp_methods*/ 7866 0, /*tp_members*/ 7867 0, /*tp_getset*/ 7868 0, /*tp_base*/ 7869 0, /*tp_dict*/ 7870 0, /*tp_descr_get*/ 7871 0, /*tp_descr_set*/ 7872 0, /*tp_dictoffset*/ 7873 0, /*tp_init*/ 7874 0, /*tp_alloc*/ 7875 0, /*tp_new*/ 7876 0, /*tp_free*/ 7877 0, /*tp_is_gc*/ 7878}; 7879 7880PyObject* 7881PyUnicode_BuildEncodingMap(PyObject* string) 7882{ 7883 PyObject *result; 7884 struct encoding_map *mresult; 7885 int i; 7886 int need_dict = 0; 7887 unsigned char level1[32]; 7888 unsigned char level2[512]; 7889 unsigned char *mlevel1, *mlevel2, *mlevel3; 7890 int count2 = 0, count3 = 0; 7891 int kind; 7892 void *data; 7893 Py_ssize_t length; 7894 Py_UCS4 ch; 7895 7896 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 7897 PyErr_BadArgument(); 7898 return NULL; 7899 } 7900 kind = PyUnicode_KIND(string); 7901 data = PyUnicode_DATA(string); 7902 length = PyUnicode_GET_LENGTH(string); 7903 length = Py_MIN(length, 256); 7904 memset(level1, 0xFF, sizeof level1); 7905 memset(level2, 0xFF, sizeof level2); 7906 7907 /* If there isn't a one-to-one mapping of NULL to \0, 7908 or if there are non-BMP characters, we need to use 7909 a mapping dictionary. 
*/ 7910 if (PyUnicode_READ(kind, data, 0) != 0) 7911 need_dict = 1; 7912 for (i = 1; i < length; i++) { 7913 int l1, l2; 7914 ch = PyUnicode_READ(kind, data, i); 7915 if (ch == 0 || ch > 0xFFFF) { 7916 need_dict = 1; 7917 break; 7918 } 7919 if (ch == 0xFFFE) 7920 /* unmapped character */ 7921 continue; 7922 l1 = ch >> 11; 7923 l2 = ch >> 7; 7924 if (level1[l1] == 0xFF) 7925 level1[l1] = count2++; 7926 if (level2[l2] == 0xFF) 7927 level2[l2] = count3++; 7928 } 7929 7930 if (count2 >= 0xFF || count3 >= 0xFF) 7931 need_dict = 1; 7932 7933 if (need_dict) { 7934 PyObject *result = PyDict_New(); 7935 PyObject *key, *value; 7936 if (!result) 7937 return NULL; 7938 for (i = 0; i < length; i++) { 7939 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7940 value = PyLong_FromLong(i); 7941 if (!key || !value) 7942 goto failed1; 7943 if (PyDict_SetItem(result, key, value) == -1) 7944 goto failed1; 7945 Py_DECREF(key); 7946 Py_DECREF(value); 7947 } 7948 return result; 7949 failed1: 7950 Py_XDECREF(key); 7951 Py_XDECREF(value); 7952 Py_DECREF(result); 7953 return NULL; 7954 } 7955 7956 /* Create a three-level trie */ 7957 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7958 16*count2 + 128*count3 - 1); 7959 if (!result) 7960 return PyErr_NoMemory(); 7961 PyObject_Init(result, &EncodingMapType); 7962 mresult = (struct encoding_map*)result; 7963 mresult->count2 = count2; 7964 mresult->count3 = count3; 7965 mlevel1 = mresult->level1; 7966 mlevel2 = mresult->level23; 7967 mlevel3 = mresult->level23 + 16*count2; 7968 memcpy(mlevel1, level1, 32); 7969 memset(mlevel2, 0xFF, 16*count2); 7970 memset(mlevel3, 0, 128*count3); 7971 count3 = 0; 7972 for (i = 1; i < length; i++) { 7973 int o1, o2, o3, i2, i3; 7974 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7975 if (ch == 0xFFFE) 7976 /* unmapped character */ 7977 continue; 7978 o1 = ch>>11; 7979 o2 = (ch>>7) & 0xF; 7980 i2 = 16*mlevel1[o1] + o2; 7981 if (mlevel2[i2] == 0xFF) 7982 mlevel2[i2] = count3++; 7983 o3 = ch & 0x7F; 7984 
i3 = 128*mlevel2[i2] + o3; 7985 mlevel3[i3] = i; 7986 } 7987 return result; 7988} 7989 7990static int 7991encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7992{ 7993 struct encoding_map *map = (struct encoding_map*)mapping; 7994 int l1 = c>>11; 7995 int l2 = (c>>7) & 0xF; 7996 int l3 = c & 0x7F; 7997 int i; 7998 7999 if (c > 0xFFFF) 8000 return -1; 8001 if (c == 0) 8002 return 0; 8003 /* level 1*/ 8004 i = map->level1[l1]; 8005 if (i == 0xFF) { 8006 return -1; 8007 } 8008 /* level 2*/ 8009 i = map->level23[16*i+l2]; 8010 if (i == 0xFF) { 8011 return -1; 8012 } 8013 /* level 3 */ 8014 i = map->level23[16*map->count2 + 128*i + l3]; 8015 if (i == 0) { 8016 return -1; 8017 } 8018 return i; 8019} 8020 8021/* Lookup the character ch in the mapping. If the character 8022 can't be found, Py_None is returned (or NULL, if another 8023 error occurred). */ 8024static PyObject * 8025charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 8026{ 8027 PyObject *w = PyLong_FromLong((long)c); 8028 PyObject *x; 8029 8030 if (w == NULL) 8031 return NULL; 8032 x = PyObject_GetItem(mapping, w); 8033 Py_DECREF(w); 8034 if (x == NULL) { 8035 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8036 /* No mapping found means: mapping is undefined. 
*/ 8037 PyErr_Clear(); 8038 x = Py_None; 8039 Py_INCREF(x); 8040 return x; 8041 } else 8042 return NULL; 8043 } 8044 else if (x == Py_None) 8045 return x; 8046 else if (PyLong_Check(x)) { 8047 long value = PyLong_AS_LONG(x); 8048 if (value < 0 || value > 255) { 8049 PyErr_SetString(PyExc_TypeError, 8050 "character mapping must be in range(256)"); 8051 Py_DECREF(x); 8052 return NULL; 8053 } 8054 return x; 8055 } 8056 else if (PyBytes_Check(x)) 8057 return x; 8058 else { 8059 /* wrong return value */ 8060 PyErr_Format(PyExc_TypeError, 8061 "character mapping must return integer, bytes or None, not %.400s", 8062 x->ob_type->tp_name); 8063 Py_DECREF(x); 8064 return NULL; 8065 } 8066} 8067 8068static int 8069charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8070{ 8071 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8072 /* exponentially overallocate to minimize reallocations */ 8073 if (requiredsize < 2*outsize) 8074 requiredsize = 2*outsize; 8075 if (_PyBytes_Resize(outobj, requiredsize)) 8076 return -1; 8077 return 0; 8078} 8079 8080typedef enum charmapencode_result { 8081 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8082} charmapencode_result; 8083/* lookup the character, put the result in the output string and adjust 8084 various state variables. Resize the output bytes object if not enough 8085 space is available. Return a new reference to the object that 8086 was put in the output buffer, or Py_None, if the mapping was undefined 8087 (in which case no character was written) or NULL, if a 8088 reallocation error occurred. 
The caller must decref the result */ 8089static charmapencode_result 8090charmapencode_output(Py_UCS4 c, PyObject *mapping, 8091 PyObject **outobj, Py_ssize_t *outpos) 8092{ 8093 PyObject *rep; 8094 char *outstart; 8095 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8096 8097 if (Py_TYPE(mapping) == &EncodingMapType) { 8098 int res = encoding_map_lookup(c, mapping); 8099 Py_ssize_t requiredsize = *outpos+1; 8100 if (res == -1) 8101 return enc_FAILED; 8102 if (outsize<requiredsize) 8103 if (charmapencode_resize(outobj, outpos, requiredsize)) 8104 return enc_EXCEPTION; 8105 outstart = PyBytes_AS_STRING(*outobj); 8106 outstart[(*outpos)++] = (char)res; 8107 return enc_SUCCESS; 8108 } 8109 8110 rep = charmapencode_lookup(c, mapping); 8111 if (rep==NULL) 8112 return enc_EXCEPTION; 8113 else if (rep==Py_None) { 8114 Py_DECREF(rep); 8115 return enc_FAILED; 8116 } else { 8117 if (PyLong_Check(rep)) { 8118 Py_ssize_t requiredsize = *outpos+1; 8119 if (outsize<requiredsize) 8120 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8121 Py_DECREF(rep); 8122 return enc_EXCEPTION; 8123 } 8124 outstart = PyBytes_AS_STRING(*outobj); 8125 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8126 } 8127 else { 8128 const char *repchars = PyBytes_AS_STRING(rep); 8129 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8130 Py_ssize_t requiredsize = *outpos+repsize; 8131 if (outsize<requiredsize) 8132 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8133 Py_DECREF(rep); 8134 return enc_EXCEPTION; 8135 } 8136 outstart = PyBytes_AS_STRING(*outobj); 8137 memcpy(outstart + *outpos, repchars, repsize); 8138 *outpos += repsize; 8139 } 8140 } 8141 Py_DECREF(rep); 8142 return enc_SUCCESS; 8143} 8144 8145/* handle an error in PyUnicode_EncodeCharmap 8146 Return 0 on success, -1 on error */ 8147static int 8148charmap_encoding_error( 8149 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8150 PyObject **exceptionObject, 8151 int *known_errorHandler, PyObject **errorHandler, const 
char *errors, 8152 PyObject **res, Py_ssize_t *respos) 8153{ 8154 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8155 Py_ssize_t size, repsize; 8156 Py_ssize_t newpos; 8157 enum PyUnicode_Kind kind; 8158 void *data; 8159 Py_ssize_t index; 8160 /* startpos for collecting unencodable chars */ 8161 Py_ssize_t collstartpos = *inpos; 8162 Py_ssize_t collendpos = *inpos+1; 8163 Py_ssize_t collpos; 8164 char *encoding = "charmap"; 8165 char *reason = "character maps to <undefined>"; 8166 charmapencode_result x; 8167 Py_UCS4 ch; 8168 int val; 8169 8170 if (PyUnicode_READY(unicode) == -1) 8171 return -1; 8172 size = PyUnicode_GET_LENGTH(unicode); 8173 /* find all unencodable characters */ 8174 while (collendpos < size) { 8175 PyObject *rep; 8176 if (Py_TYPE(mapping) == &EncodingMapType) { 8177 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8178 val = encoding_map_lookup(ch, mapping); 8179 if (val != -1) 8180 break; 8181 ++collendpos; 8182 continue; 8183 } 8184 8185 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8186 rep = charmapencode_lookup(ch, mapping); 8187 if (rep==NULL) 8188 return -1; 8189 else if (rep!=Py_None) { 8190 Py_DECREF(rep); 8191 break; 8192 } 8193 Py_DECREF(rep); 8194 ++collendpos; 8195 } 8196 /* cache callback name lookup 8197 * (if not done yet, i.e. 
it's the first error) */ 8198 if (*known_errorHandler==-1) { 8199 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8200 *known_errorHandler = 1; 8201 else if (!strcmp(errors, "replace")) 8202 *known_errorHandler = 2; 8203 else if (!strcmp(errors, "ignore")) 8204 *known_errorHandler = 3; 8205 else if (!strcmp(errors, "xmlcharrefreplace")) 8206 *known_errorHandler = 4; 8207 else 8208 *known_errorHandler = 0; 8209 } 8210 switch (*known_errorHandler) { 8211 case 1: /* strict */ 8212 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8213 return -1; 8214 case 2: /* replace */ 8215 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8216 x = charmapencode_output('?', mapping, res, respos); 8217 if (x==enc_EXCEPTION) { 8218 return -1; 8219 } 8220 else if (x==enc_FAILED) { 8221 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8222 return -1; 8223 } 8224 } 8225 /* fall through */ 8226 case 3: /* ignore */ 8227 *inpos = collendpos; 8228 break; 8229 case 4: /* xmlcharrefreplace */ 8230 /* generate replacement (temporarily (mis)uses p) */ 8231 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8232 char buffer[2+29+1+1]; 8233 char *cp; 8234 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8235 for (cp = buffer; *cp; ++cp) { 8236 x = charmapencode_output(*cp, mapping, res, respos); 8237 if (x==enc_EXCEPTION) 8238 return -1; 8239 else if (x==enc_FAILED) { 8240 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8241 return -1; 8242 } 8243 } 8244 } 8245 *inpos = collendpos; 8246 break; 8247 default: 8248 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 8249 encoding, reason, unicode, exceptionObject, 8250 collstartpos, collendpos, &newpos); 8251 if (repunicode == NULL) 8252 return -1; 8253 if (PyBytes_Check(repunicode)) { 8254 /* Directly copy bytes result to output. 
*/ 8255 Py_ssize_t outsize = PyBytes_Size(*res); 8256 Py_ssize_t requiredsize; 8257 repsize = PyBytes_Size(repunicode); 8258 requiredsize = *respos + repsize; 8259 if (requiredsize > outsize) 8260 /* Make room for all additional bytes. */ 8261 if (charmapencode_resize(res, respos, requiredsize)) { 8262 Py_DECREF(repunicode); 8263 return -1; 8264 } 8265 memcpy(PyBytes_AsString(*res) + *respos, 8266 PyBytes_AsString(repunicode), repsize); 8267 *respos += repsize; 8268 *inpos = newpos; 8269 Py_DECREF(repunicode); 8270 break; 8271 } 8272 /* generate replacement */ 8273 if (PyUnicode_READY(repunicode) == -1) { 8274 Py_DECREF(repunicode); 8275 return -1; 8276 } 8277 repsize = PyUnicode_GET_LENGTH(repunicode); 8278 data = PyUnicode_DATA(repunicode); 8279 kind = PyUnicode_KIND(repunicode); 8280 for (index = 0; index < repsize; index++) { 8281 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8282 x = charmapencode_output(repch, mapping, res, respos); 8283 if (x==enc_EXCEPTION) { 8284 Py_DECREF(repunicode); 8285 return -1; 8286 } 8287 else if (x==enc_FAILED) { 8288 Py_DECREF(repunicode); 8289 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8290 return -1; 8291 } 8292 } 8293 *inpos = newpos; 8294 Py_DECREF(repunicode); 8295 } 8296 return 0; 8297} 8298 8299PyObject * 8300_PyUnicode_EncodeCharmap(PyObject *unicode, 8301 PyObject *mapping, 8302 const char *errors) 8303{ 8304 /* output object */ 8305 PyObject *res = NULL; 8306 /* current input position */ 8307 Py_ssize_t inpos = 0; 8308 Py_ssize_t size; 8309 /* current output position */ 8310 Py_ssize_t respos = 0; 8311 PyObject *errorHandler = NULL; 8312 PyObject *exc = NULL; 8313 /* the following variable is used for caching string comparisons 8314 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8315 * 3=ignore, 4=xmlcharrefreplace */ 8316 int known_errorHandler = -1; 8317 void *data; 8318 int kind; 8319 8320 if (PyUnicode_READY(unicode) == -1) 8321 return NULL; 8322 size = 
PyUnicode_GET_LENGTH(unicode); 8323 data = PyUnicode_DATA(unicode); 8324 kind = PyUnicode_KIND(unicode); 8325 8326 /* Default to Latin-1 */ 8327 if (mapping == NULL) 8328 return unicode_encode_ucs1(unicode, errors, 256); 8329 8330 /* allocate enough for a simple encoding without 8331 replacements, if we need more, we'll resize */ 8332 res = PyBytes_FromStringAndSize(NULL, size); 8333 if (res == NULL) 8334 goto onError; 8335 if (size == 0) 8336 return res; 8337 8338 while (inpos<size) { 8339 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8340 /* try to encode it */ 8341 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8342 if (x==enc_EXCEPTION) /* error */ 8343 goto onError; 8344 if (x==enc_FAILED) { /* unencodable character */ 8345 if (charmap_encoding_error(unicode, &inpos, mapping, 8346 &exc, 8347 &known_errorHandler, &errorHandler, errors, 8348 &res, &respos)) { 8349 goto onError; 8350 } 8351 } 8352 else 8353 /* done with this character => adjust input position */ 8354 ++inpos; 8355 } 8356 8357 /* Resize if we allocated to much */ 8358 if (respos<PyBytes_GET_SIZE(res)) 8359 if (_PyBytes_Resize(&res, respos) < 0) 8360 goto onError; 8361 8362 Py_XDECREF(exc); 8363 Py_XDECREF(errorHandler); 8364 return res; 8365 8366 onError: 8367 Py_XDECREF(res); 8368 Py_XDECREF(exc); 8369 Py_XDECREF(errorHandler); 8370 return NULL; 8371} 8372 8373/* Deprecated */ 8374PyObject * 8375PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8376 Py_ssize_t size, 8377 PyObject *mapping, 8378 const char *errors) 8379{ 8380 PyObject *result; 8381 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8382 if (unicode == NULL) 8383 return NULL; 8384 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8385 Py_DECREF(unicode); 8386 return result; 8387} 8388 8389PyObject * 8390PyUnicode_AsCharmapString(PyObject *unicode, 8391 PyObject *mapping) 8392{ 8393 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8394 PyErr_BadArgument(); 8395 return NULL; 8396 } 8397 return 
_PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8398} 8399 8400/* create or adjust a UnicodeTranslateError */ 8401static void 8402make_translate_exception(PyObject **exceptionObject, 8403 PyObject *unicode, 8404 Py_ssize_t startpos, Py_ssize_t endpos, 8405 const char *reason) 8406{ 8407 if (*exceptionObject == NULL) { 8408 *exceptionObject = _PyUnicodeTranslateError_Create( 8409 unicode, startpos, endpos, reason); 8410 } 8411 else { 8412 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8413 goto onError; 8414 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8415 goto onError; 8416 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8417 goto onError; 8418 return; 8419 onError: 8420 Py_CLEAR(*exceptionObject); 8421 } 8422} 8423 8424/* error handling callback helper: 8425 build arguments, call the callback and check the arguments, 8426 put the result into newpos and return the replacement string, which 8427 has to be freed by the caller */ 8428static PyObject * 8429unicode_translate_call_errorhandler(const char *errors, 8430 PyObject **errorHandler, 8431 const char *reason, 8432 PyObject *unicode, PyObject **exceptionObject, 8433 Py_ssize_t startpos, Py_ssize_t endpos, 8434 Py_ssize_t *newpos) 8435{ 8436 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8437 8438 Py_ssize_t i_newpos; 8439 PyObject *restuple; 8440 PyObject *resunicode; 8441 8442 if (*errorHandler == NULL) { 8443 *errorHandler = PyCodec_LookupError(errors); 8444 if (*errorHandler == NULL) 8445 return NULL; 8446 } 8447 8448 make_translate_exception(exceptionObject, 8449 unicode, startpos, endpos, reason); 8450 if (*exceptionObject == NULL) 8451 return NULL; 8452 8453 restuple = PyObject_CallFunctionObjArgs( 8454 *errorHandler, *exceptionObject, NULL); 8455 if (restuple == NULL) 8456 return NULL; 8457 if (!PyTuple_Check(restuple)) { 8458 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8459 Py_DECREF(restuple); 8460 return 
NULL; 8461 } 8462 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8463 &resunicode, &i_newpos)) { 8464 Py_DECREF(restuple); 8465 return NULL; 8466 } 8467 if (i_newpos<0) 8468 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8469 else 8470 *newpos = i_newpos; 8471 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8472 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8473 Py_DECREF(restuple); 8474 return NULL; 8475 } 8476 Py_INCREF(resunicode); 8477 Py_DECREF(restuple); 8478 return resunicode; 8479} 8480 8481/* Lookup the character ch in the mapping and put the result in result, 8482 which must be decrefed by the caller. 8483 Return 0 on success, -1 on error */ 8484static int 8485charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8486{ 8487 PyObject *w = PyLong_FromLong((long)c); 8488 PyObject *x; 8489 8490 if (w == NULL) 8491 return -1; 8492 x = PyObject_GetItem(mapping, w); 8493 Py_DECREF(w); 8494 if (x == NULL) { 8495 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8496 /* No mapping found means: use 1:1 mapping. */ 8497 PyErr_Clear(); 8498 *result = NULL; 8499 return 0; 8500 } else 8501 return -1; 8502 } 8503 else if (x == Py_None) { 8504 *result = x; 8505 return 0; 8506 } 8507 else if (PyLong_Check(x)) { 8508 long value = PyLong_AS_LONG(x); 8509 if (value < 0 || value > MAX_UNICODE) { 8510 PyErr_Format(PyExc_ValueError, 8511 "character mapping must be in range(0x%x)", 8512 MAX_UNICODE+1); 8513 Py_DECREF(x); 8514 return -1; 8515 } 8516 *result = x; 8517 return 0; 8518 } 8519 else if (PyUnicode_Check(x)) { 8520 *result = x; 8521 return 0; 8522 } 8523 else { 8524 /* wrong return value */ 8525 PyErr_SetString(PyExc_TypeError, 8526 "character mapping must return integer, None or str"); 8527 Py_DECREF(x); 8528 return -1; 8529 } 8530} 8531 8532/* lookup the character, write the result into the writer. 
8533 Return 1 if the result was written into the writer, return 0 if the mapping 8534 was undefined, raise an exception return -1 on error. */ 8535static int 8536charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, 8537 _PyUnicodeWriter *writer) 8538{ 8539 PyObject *item; 8540 8541 if (charmaptranslate_lookup(ch, mapping, &item)) 8542 return -1; 8543 8544 if (item == NULL) { 8545 /* not found => default to 1:1 mapping */ 8546 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8547 return -1; 8548 } 8549 return 1; 8550 } 8551 8552 if (item == Py_None) { 8553 Py_DECREF(item); 8554 return 0; 8555 } 8556 8557 if (PyLong_Check(item)) { 8558 long ch = (Py_UCS4)PyLong_AS_LONG(item); 8559 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8560 used it */ 8561 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8562 Py_DECREF(item); 8563 return -1; 8564 } 8565 Py_DECREF(item); 8566 return 1; 8567 } 8568 8569 if (!PyUnicode_Check(item)) { 8570 Py_DECREF(item); 8571 return -1; 8572 } 8573 8574 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { 8575 Py_DECREF(item); 8576 return -1; 8577 } 8578 8579 Py_DECREF(item); 8580 return 1; 8581} 8582 8583static int 8584unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, 8585 Py_UCS1 *translate) 8586{ 8587 PyObject *item = NULL; 8588 int ret = 0; 8589 8590 if (charmaptranslate_lookup(ch, mapping, &item)) { 8591 return -1; 8592 } 8593 8594 if (item == Py_None) { 8595 /* deletion */ 8596 translate[ch] = 0xfe; 8597 } 8598 else if (item == NULL) { 8599 /* not found => default to 1:1 mapping */ 8600 translate[ch] = ch; 8601 return 1; 8602 } 8603 else if (PyLong_Check(item)) { 8604 long replace = PyLong_AS_LONG(item); 8605 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8606 used it */ 8607 if (127 < replace) { 8608 /* invalid character or character outside ASCII: 8609 skip the fast translate */ 8610 goto exit; 8611 } 8612 translate[ch] = (Py_UCS1)replace; 8613 } 8614 else if 
(PyUnicode_Check(item)) { 8615 Py_UCS4 replace; 8616 8617 if (PyUnicode_READY(item) == -1) { 8618 Py_DECREF(item); 8619 return -1; 8620 } 8621 if (PyUnicode_GET_LENGTH(item) != 1) 8622 goto exit; 8623 8624 replace = PyUnicode_READ_CHAR(item, 0); 8625 if (replace > 127) 8626 goto exit; 8627 translate[ch] = (Py_UCS1)replace; 8628 } 8629 else { 8630 /* not None, NULL, long or unicode */ 8631 goto exit; 8632 } 8633 ret = 1; 8634 8635 exit: 8636 Py_DECREF(item); 8637 return ret; 8638} 8639 8640/* Fast path for ascii => ascii translation. Return 1 if the whole string 8641 was translated into writer, return 0 if the input string was partially 8642 translated into writer, raise an exception and return -1 on error. */ 8643static int 8644unicode_fast_translate(PyObject *input, PyObject *mapping, 8645 _PyUnicodeWriter *writer, int ignore) 8646{ 8647 Py_UCS1 ascii_table[128], ch, ch2; 8648 Py_ssize_t len; 8649 Py_UCS1 *in, *end, *out; 8650 int res = 0; 8651 8652 if (PyUnicode_READY(input) == -1) 8653 return -1; 8654 if (!PyUnicode_IS_ASCII(input)) 8655 return 0; 8656 len = PyUnicode_GET_LENGTH(input); 8657 8658 memset(ascii_table, 0xff, 128); 8659 8660 in = PyUnicode_1BYTE_DATA(input); 8661 end = in + len; 8662 8663 assert(PyUnicode_IS_ASCII(writer->buffer)); 8664 assert(PyUnicode_GET_LENGTH(writer->buffer) == len); 8665 out = PyUnicode_1BYTE_DATA(writer->buffer); 8666 8667 for (; in < end; in++) { 8668 ch = *in; 8669 ch2 = ascii_table[ch]; 8670 if (ch2 == 0xff) { 8671 int translate = unicode_fast_translate_lookup(mapping, ch, 8672 ascii_table); 8673 if (translate < 0) 8674 return -1; 8675 if (translate == 0) 8676 goto exit; 8677 ch2 = ascii_table[ch]; 8678 } 8679 if (ch2 == 0xfe) { 8680 if (ignore) 8681 continue; 8682 goto exit; 8683 } 8684 assert(ch2 < 128); 8685 *out = ch2; 8686 out++; 8687 } 8688 res = 1; 8689 8690exit: 8691 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer); 8692 return res; 8693} 8694 8695PyObject * 8696_PyUnicode_TranslateCharmap(PyObject *input, 
8697 PyObject *mapping, 8698 const char *errors) 8699{ 8700 /* input object */ 8701 char *data; 8702 Py_ssize_t size, i; 8703 int kind; 8704 /* output buffer */ 8705 _PyUnicodeWriter writer; 8706 /* error handler */ 8707 char *reason = "character maps to <undefined>"; 8708 PyObject *errorHandler = NULL; 8709 PyObject *exc = NULL; 8710 int ignore; 8711 int res; 8712 8713 if (mapping == NULL) { 8714 PyErr_BadArgument(); 8715 return NULL; 8716 } 8717 8718 if (PyUnicode_READY(input) == -1) 8719 return NULL; 8720 data = (char*)PyUnicode_DATA(input); 8721 kind = PyUnicode_KIND(input); 8722 size = PyUnicode_GET_LENGTH(input); 8723 8724 if (size == 0) { 8725 Py_INCREF(input); 8726 return input; 8727 } 8728 8729 /* allocate enough for a simple 1:1 translation without 8730 replacements, if we need more, we'll resize */ 8731 _PyUnicodeWriter_Init(&writer); 8732 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 8733 goto onError; 8734 8735 ignore = (errors != NULL && strcmp(errors, "ignore") == 0); 8736 8737 res = unicode_fast_translate(input, mapping, &writer, ignore); 8738 if (res < 0) { 8739 _PyUnicodeWriter_Dealloc(&writer); 8740 return NULL; 8741 } 8742 if (res == 1) 8743 return _PyUnicodeWriter_Finish(&writer); 8744 8745 i = writer.pos; 8746 while (i<size) { 8747 /* try to encode it */ 8748 int translate; 8749 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8750 Py_ssize_t newpos; 8751 /* startpos for collecting untranslatable chars */ 8752 Py_ssize_t collstart; 8753 Py_ssize_t collend; 8754 Py_UCS4 ch; 8755 8756 ch = PyUnicode_READ(kind, data, i); 8757 translate = charmaptranslate_output(ch, mapping, &writer); 8758 if (translate < 0) 8759 goto onError; 8760 8761 if (translate != 0) { 8762 /* it worked => adjust input pointer */ 8763 ++i; 8764 continue; 8765 } 8766 8767 /* untranslatable character */ 8768 collstart = i; 8769 collend = i+1; 8770 8771 /* find all untranslatable characters */ 8772 while (collend < size) { 8773 PyObject *x; 8774 ch 
= PyUnicode_READ(kind, data, collend); 8775 if (charmaptranslate_lookup(ch, mapping, &x)) 8776 goto onError; 8777 Py_XDECREF(x); 8778 if (x != Py_None) 8779 break; 8780 ++collend; 8781 } 8782 8783 if (ignore) { 8784 i = collend; 8785 } 8786 else { 8787 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8788 reason, input, &exc, 8789 collstart, collend, &newpos); 8790 if (repunicode == NULL) 8791 goto onError; 8792 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { 8793 Py_DECREF(repunicode); 8794 goto onError; 8795 } 8796 Py_DECREF(repunicode); 8797 i = newpos; 8798 } 8799 } 8800 Py_XDECREF(exc); 8801 Py_XDECREF(errorHandler); 8802 return _PyUnicodeWriter_Finish(&writer); 8803 8804 onError: 8805 _PyUnicodeWriter_Dealloc(&writer); 8806 Py_XDECREF(exc); 8807 Py_XDECREF(errorHandler); 8808 return NULL; 8809} 8810 8811/* Deprecated. Use PyUnicode_Translate instead. */ 8812PyObject * 8813PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8814 Py_ssize_t size, 8815 PyObject *mapping, 8816 const char *errors) 8817{ 8818 PyObject *result; 8819 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8820 if (!unicode) 8821 return NULL; 8822 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8823 Py_DECREF(unicode); 8824 return result; 8825} 8826 8827PyObject * 8828PyUnicode_Translate(PyObject *str, 8829 PyObject *mapping, 8830 const char *errors) 8831{ 8832 PyObject *result; 8833 8834 str = PyUnicode_FromObject(str); 8835 if (str == NULL) 8836 return NULL; 8837 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8838 Py_DECREF(str); 8839 return result; 8840} 8841 8842static Py_UCS4 8843fix_decimal_and_space_to_ascii(PyObject *self) 8844{ 8845 /* No need to call PyUnicode_READY(self) because this function is only 8846 called as a callback from fixup() which does it already. 
*/ 8847 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8848 const int kind = PyUnicode_KIND(self); 8849 void *data = PyUnicode_DATA(self); 8850 Py_UCS4 maxchar = 127, ch, fixed; 8851 int modified = 0; 8852 Py_ssize_t i; 8853 8854 for (i = 0; i < len; ++i) { 8855 ch = PyUnicode_READ(kind, data, i); 8856 fixed = 0; 8857 if (ch > 127) { 8858 if (Py_UNICODE_ISSPACE(ch)) 8859 fixed = ' '; 8860 else { 8861 const int decimal = Py_UNICODE_TODECIMAL(ch); 8862 if (decimal >= 0) 8863 fixed = '0' + decimal; 8864 } 8865 if (fixed != 0) { 8866 modified = 1; 8867 maxchar = Py_MAX(maxchar, fixed); 8868 PyUnicode_WRITE(kind, data, i, fixed); 8869 } 8870 else 8871 maxchar = Py_MAX(maxchar, ch); 8872 } 8873 } 8874 8875 return (modified) ? maxchar : 0; 8876} 8877 8878PyObject * 8879_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8880{ 8881 if (!PyUnicode_Check(unicode)) { 8882 PyErr_BadInternalCall(); 8883 return NULL; 8884 } 8885 if (PyUnicode_READY(unicode) == -1) 8886 return NULL; 8887 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8888 /* If the string is already ASCII, just return the same string */ 8889 Py_INCREF(unicode); 8890 return unicode; 8891 } 8892 return fixup(unicode, fix_decimal_and_space_to_ascii); 8893} 8894 8895PyObject * 8896PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8897 Py_ssize_t length) 8898{ 8899 PyObject *decimal; 8900 Py_ssize_t i; 8901 Py_UCS4 maxchar; 8902 enum PyUnicode_Kind kind; 8903 void *data; 8904 8905 maxchar = 127; 8906 for (i = 0; i < length; i++) { 8907 Py_UCS4 ch = s[i]; 8908 if (ch > 127) { 8909 int decimal = Py_UNICODE_TODECIMAL(ch); 8910 if (decimal >= 0) 8911 ch = '0' + decimal; 8912 maxchar = Py_MAX(maxchar, ch); 8913 } 8914 } 8915 8916 /* Copy to a new string */ 8917 decimal = PyUnicode_New(length, maxchar); 8918 if (decimal == NULL) 8919 return decimal; 8920 kind = PyUnicode_KIND(decimal); 8921 data = PyUnicode_DATA(decimal); 8922 /* Iterate over code points */ 8923 for (i = 0; i < length; i++) { 8924 Py_UCS4 ch = 
s[i]; 8925 if (ch > 127) { 8926 int decimal = Py_UNICODE_TODECIMAL(ch); 8927 if (decimal >= 0) 8928 ch = '0' + decimal; 8929 } 8930 PyUnicode_WRITE(kind, data, i, ch); 8931 } 8932 return unicode_result(decimal); 8933} 8934/* --- Decimal Encoder ---------------------------------------------------- */ 8935 8936int 8937PyUnicode_EncodeDecimal(Py_UNICODE *s, 8938 Py_ssize_t length, 8939 char *output, 8940 const char *errors) 8941{ 8942 PyObject *unicode; 8943 Py_ssize_t i; 8944 enum PyUnicode_Kind kind; 8945 void *data; 8946 8947 if (output == NULL) { 8948 PyErr_BadArgument(); 8949 return -1; 8950 } 8951 8952 unicode = PyUnicode_FromUnicode(s, length); 8953 if (unicode == NULL) 8954 return -1; 8955 8956 if (PyUnicode_READY(unicode) == -1) { 8957 Py_DECREF(unicode); 8958 return -1; 8959 } 8960 kind = PyUnicode_KIND(unicode); 8961 data = PyUnicode_DATA(unicode); 8962 8963 for (i=0; i < length; ) { 8964 PyObject *exc; 8965 Py_UCS4 ch; 8966 int decimal; 8967 Py_ssize_t startpos; 8968 8969 ch = PyUnicode_READ(kind, data, i); 8970 8971 if (Py_UNICODE_ISSPACE(ch)) { 8972 *output++ = ' '; 8973 i++; 8974 continue; 8975 } 8976 decimal = Py_UNICODE_TODECIMAL(ch); 8977 if (decimal >= 0) { 8978 *output++ = '0' + decimal; 8979 i++; 8980 continue; 8981 } 8982 if (0 < ch && ch < 256) { 8983 *output++ = (char)ch; 8984 i++; 8985 continue; 8986 } 8987 8988 startpos = i; 8989 exc = NULL; 8990 raise_encode_exception(&exc, "decimal", unicode, 8991 startpos, startpos+1, 8992 "invalid decimal Unicode string"); 8993 Py_XDECREF(exc); 8994 Py_DECREF(unicode); 8995 return -1; 8996 } 8997 /* 0-terminate the output string */ 8998 *output++ = '\0'; 8999 Py_DECREF(unicode); 9000 return 0; 9001} 9002 9003/* --- Helpers ------------------------------------------------------------ */ 9004 9005static Py_ssize_t 9006any_find_slice(int direction, PyObject* s1, PyObject* s2, 9007 Py_ssize_t start, 9008 Py_ssize_t end) 9009{ 9010 int kind1, kind2, kind; 9011 void *buf1, *buf2; 9012 Py_ssize_t len1, len2, 
result; 9013 9014 kind1 = PyUnicode_KIND(s1); 9015 kind2 = PyUnicode_KIND(s2); 9016 kind = kind1 > kind2 ? kind1 : kind2; 9017 buf1 = PyUnicode_DATA(s1); 9018 buf2 = PyUnicode_DATA(s2); 9019 if (kind1 != kind) 9020 buf1 = _PyUnicode_AsKind(s1, kind); 9021 if (!buf1) 9022 return -2; 9023 if (kind2 != kind) 9024 buf2 = _PyUnicode_AsKind(s2, kind); 9025 if (!buf2) { 9026 if (kind1 != kind) PyMem_Free(buf1); 9027 return -2; 9028 } 9029 len1 = PyUnicode_GET_LENGTH(s1); 9030 len2 = PyUnicode_GET_LENGTH(s2); 9031 9032 if (direction > 0) { 9033 switch (kind) { 9034 case PyUnicode_1BYTE_KIND: 9035 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9036 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 9037 else 9038 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 9039 break; 9040 case PyUnicode_2BYTE_KIND: 9041 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 9042 break; 9043 case PyUnicode_4BYTE_KIND: 9044 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 9045 break; 9046 default: 9047 assert(0); result = -2; 9048 } 9049 } 9050 else { 9051 switch (kind) { 9052 case PyUnicode_1BYTE_KIND: 9053 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9054 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 9055 else 9056 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9057 break; 9058 case PyUnicode_2BYTE_KIND: 9059 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9060 break; 9061 case PyUnicode_4BYTE_KIND: 9062 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9063 break; 9064 default: 9065 assert(0); result = -2; 9066 } 9067 } 9068 9069 if (kind1 != kind) 9070 PyMem_Free(buf1); 9071 if (kind2 != kind) 9072 PyMem_Free(buf2); 9073 9074 return result; 9075} 9076 9077Py_ssize_t 9078_PyUnicode_InsertThousandsGrouping( 9079 PyObject *unicode, Py_ssize_t index, 9080 Py_ssize_t n_buffer, 9081 void *digits, Py_ssize_t n_digits, 9082 Py_ssize_t min_width, 
9083 const char *grouping, PyObject *thousands_sep, 9084 Py_UCS4 *maxchar) 9085{ 9086 unsigned int kind, thousands_sep_kind; 9087 char *data, *thousands_sep_data; 9088 Py_ssize_t thousands_sep_len; 9089 Py_ssize_t len; 9090 9091 if (unicode != NULL) { 9092 kind = PyUnicode_KIND(unicode); 9093 data = (char *) PyUnicode_DATA(unicode) + index * kind; 9094 } 9095 else { 9096 kind = PyUnicode_1BYTE_KIND; 9097 data = NULL; 9098 } 9099 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 9100 thousands_sep_data = PyUnicode_DATA(thousands_sep); 9101 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 9102 if (unicode != NULL && thousands_sep_kind != kind) { 9103 if (thousands_sep_kind < kind) { 9104 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 9105 if (!thousands_sep_data) 9106 return -1; 9107 } 9108 else { 9109 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 9110 if (!data) 9111 return -1; 9112 } 9113 } 9114 9115 switch (kind) { 9116 case PyUnicode_1BYTE_KIND: 9117 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 9118 len = asciilib_InsertThousandsGrouping( 9119 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 9120 min_width, grouping, 9121 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9122 else 9123 len = ucs1lib_InsertThousandsGrouping( 9124 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 9125 min_width, grouping, 9126 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9127 break; 9128 case PyUnicode_2BYTE_KIND: 9129 len = ucs2lib_InsertThousandsGrouping( 9130 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 9131 min_width, grouping, 9132 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 9133 break; 9134 case PyUnicode_4BYTE_KIND: 9135 len = ucs4lib_InsertThousandsGrouping( 9136 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 9137 min_width, grouping, 9138 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 9139 break; 9140 default: 9141 assert(0); 9142 return -1; 9143 } 9144 if (unicode != NULL && 
thousands_sep_kind != kind) { 9145 if (thousands_sep_kind < kind) 9146 PyMem_Free(thousands_sep_data); 9147 else 9148 PyMem_Free(data); 9149 } 9150 if (unicode == NULL) { 9151 *maxchar = 127; 9152 if (len != n_digits) { 9153 *maxchar = Py_MAX(*maxchar, 9154 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 9155 } 9156 } 9157 return len; 9158} 9159 9160 9161/* helper macro to fixup start/end slice values */ 9162#define ADJUST_INDICES(start, end, len) \ 9163 if (end > len) \ 9164 end = len; \ 9165 else if (end < 0) { \ 9166 end += len; \ 9167 if (end < 0) \ 9168 end = 0; \ 9169 } \ 9170 if (start < 0) { \ 9171 start += len; \ 9172 if (start < 0) \ 9173 start = 0; \ 9174 } 9175 9176Py_ssize_t 9177PyUnicode_Count(PyObject *str, 9178 PyObject *substr, 9179 Py_ssize_t start, 9180 Py_ssize_t end) 9181{ 9182 Py_ssize_t result; 9183 PyObject* str_obj; 9184 PyObject* sub_obj; 9185 int kind1, kind2, kind; 9186 void *buf1 = NULL, *buf2 = NULL; 9187 Py_ssize_t len1, len2; 9188 9189 str_obj = PyUnicode_FromObject(str); 9190 if (!str_obj) 9191 return -1; 9192 sub_obj = PyUnicode_FromObject(substr); 9193 if (!sub_obj) { 9194 Py_DECREF(str_obj); 9195 return -1; 9196 } 9197 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 9198 Py_DECREF(sub_obj); 9199 Py_DECREF(str_obj); 9200 return -1; 9201 } 9202 9203 kind1 = PyUnicode_KIND(str_obj); 9204 kind2 = PyUnicode_KIND(sub_obj); 9205 kind = kind1; 9206 buf1 = PyUnicode_DATA(str_obj); 9207 buf2 = PyUnicode_DATA(sub_obj); 9208 if (kind2 != kind) { 9209 if (kind2 > kind) { 9210 Py_DECREF(sub_obj); 9211 Py_DECREF(str_obj); 9212 return 0; 9213 } 9214 buf2 = _PyUnicode_AsKind(sub_obj, kind); 9215 } 9216 if (!buf2) 9217 goto onError; 9218 len1 = PyUnicode_GET_LENGTH(str_obj); 9219 len2 = PyUnicode_GET_LENGTH(sub_obj); 9220 9221 ADJUST_INDICES(start, end, len1); 9222 switch (kind) { 9223 case PyUnicode_1BYTE_KIND: 9224 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 9225 result = asciilib_count( 9226 
((Py_UCS1*)buf1) + start, end - start, 9227 buf2, len2, PY_SSIZE_T_MAX 9228 ); 9229 else 9230 result = ucs1lib_count( 9231 ((Py_UCS1*)buf1) + start, end - start, 9232 buf2, len2, PY_SSIZE_T_MAX 9233 ); 9234 break; 9235 case PyUnicode_2BYTE_KIND: 9236 result = ucs2lib_count( 9237 ((Py_UCS2*)buf1) + start, end - start, 9238 buf2, len2, PY_SSIZE_T_MAX 9239 ); 9240 break; 9241 case PyUnicode_4BYTE_KIND: 9242 result = ucs4lib_count( 9243 ((Py_UCS4*)buf1) + start, end - start, 9244 buf2, len2, PY_SSIZE_T_MAX 9245 ); 9246 break; 9247 default: 9248 assert(0); result = 0; 9249 } 9250 9251 Py_DECREF(sub_obj); 9252 Py_DECREF(str_obj); 9253 9254 if (kind2 != kind) 9255 PyMem_Free(buf2); 9256 9257 return result; 9258 onError: 9259 Py_DECREF(sub_obj); 9260 Py_DECREF(str_obj); 9261 if (kind2 != kind && buf2) 9262 PyMem_Free(buf2); 9263 return -1; 9264} 9265 9266Py_ssize_t 9267PyUnicode_Find(PyObject *str, 9268 PyObject *sub, 9269 Py_ssize_t start, 9270 Py_ssize_t end, 9271 int direction) 9272{ 9273 Py_ssize_t result; 9274 9275 str = PyUnicode_FromObject(str); 9276 if (!str) 9277 return -2; 9278 sub = PyUnicode_FromObject(sub); 9279 if (!sub) { 9280 Py_DECREF(str); 9281 return -2; 9282 } 9283 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 9284 Py_DECREF(sub); 9285 Py_DECREF(str); 9286 return -2; 9287 } 9288 9289 result = any_find_slice(direction, 9290 str, sub, start, end 9291 ); 9292 9293 Py_DECREF(str); 9294 Py_DECREF(sub); 9295 9296 return result; 9297} 9298 9299Py_ssize_t 9300PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9301 Py_ssize_t start, Py_ssize_t end, 9302 int direction) 9303{ 9304 int kind; 9305 Py_ssize_t result; 9306 if (PyUnicode_READY(str) == -1) 9307 return -2; 9308 if (start < 0 || end < 0) { 9309 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9310 return -2; 9311 } 9312 if (end > PyUnicode_GET_LENGTH(str)) 9313 end = PyUnicode_GET_LENGTH(str); 9314 kind = PyUnicode_KIND(str); 9315 result = findchar(PyUnicode_1BYTE_DATA(str) + 
kind*start, 9316 kind, end-start, ch, direction); 9317 if (result == -1) 9318 return -1; 9319 else 9320 return start + result; 9321} 9322 9323static int 9324tailmatch(PyObject *self, 9325 PyObject *substring, 9326 Py_ssize_t start, 9327 Py_ssize_t end, 9328 int direction) 9329{ 9330 int kind_self; 9331 int kind_sub; 9332 void *data_self; 9333 void *data_sub; 9334 Py_ssize_t offset; 9335 Py_ssize_t i; 9336 Py_ssize_t end_sub; 9337 9338 if (PyUnicode_READY(self) == -1 || 9339 PyUnicode_READY(substring) == -1) 9340 return -1; 9341 9342 if (PyUnicode_GET_LENGTH(substring) == 0) 9343 return 1; 9344 9345 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9346 end -= PyUnicode_GET_LENGTH(substring); 9347 if (end < start) 9348 return 0; 9349 9350 kind_self = PyUnicode_KIND(self); 9351 data_self = PyUnicode_DATA(self); 9352 kind_sub = PyUnicode_KIND(substring); 9353 data_sub = PyUnicode_DATA(substring); 9354 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9355 9356 if (direction > 0) 9357 offset = end; 9358 else 9359 offset = start; 9360 9361 if (PyUnicode_READ(kind_self, data_self, offset) == 9362 PyUnicode_READ(kind_sub, data_sub, 0) && 9363 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9364 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9365 /* If both are of the same kind, memcmp is sufficient */ 9366 if (kind_self == kind_sub) { 9367 return ! memcmp((char *)data_self + 9368 (offset * PyUnicode_KIND(substring)), 9369 data_sub, 9370 PyUnicode_GET_LENGTH(substring) * 9371 PyUnicode_KIND(substring)); 9372 } 9373 /* otherwise we have to compare each character by first accesing it */ 9374 else { 9375 /* We do not need to compare 0 and len(substring)-1 because 9376 the if statement above ensured already that they are equal 9377 when we end up here. 
*/ 9378 for (i = 1; i < end_sub; ++i) { 9379 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9380 PyUnicode_READ(kind_sub, data_sub, i)) 9381 return 0; 9382 } 9383 return 1; 9384 } 9385 } 9386 9387 return 0; 9388} 9389 9390Py_ssize_t 9391PyUnicode_Tailmatch(PyObject *str, 9392 PyObject *substr, 9393 Py_ssize_t start, 9394 Py_ssize_t end, 9395 int direction) 9396{ 9397 Py_ssize_t result; 9398 9399 str = PyUnicode_FromObject(str); 9400 if (str == NULL) 9401 return -1; 9402 substr = PyUnicode_FromObject(substr); 9403 if (substr == NULL) { 9404 Py_DECREF(str); 9405 return -1; 9406 } 9407 9408 result = tailmatch(str, substr, 9409 start, end, direction); 9410 Py_DECREF(str); 9411 Py_DECREF(substr); 9412 return result; 9413} 9414 9415/* Apply fixfct filter to the Unicode object self and return a 9416 reference to the modified object */ 9417 9418static PyObject * 9419fixup(PyObject *self, 9420 Py_UCS4 (*fixfct)(PyObject *s)) 9421{ 9422 PyObject *u; 9423 Py_UCS4 maxchar_old, maxchar_new = 0; 9424 PyObject *v; 9425 9426 u = _PyUnicode_Copy(self); 9427 if (u == NULL) 9428 return NULL; 9429 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9430 9431 /* fix functions return the new maximum character in a string, 9432 if the kind of the resulting unicode object does not change, 9433 everything is fine. Otherwise we need to change the string kind 9434 and re-run the fix function. */ 9435 maxchar_new = fixfct(u); 9436 9437 if (maxchar_new == 0) { 9438 /* no changes */; 9439 if (PyUnicode_CheckExact(self)) { 9440 Py_DECREF(u); 9441 Py_INCREF(self); 9442 return self; 9443 } 9444 else 9445 return u; 9446 } 9447 9448 maxchar_new = align_maxchar(maxchar_new); 9449 9450 if (maxchar_new == maxchar_old) 9451 return u; 9452 9453 /* In case the maximum character changed, we need to 9454 convert the string to the new category. 
*/ 9455 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9456 if (v == NULL) { 9457 Py_DECREF(u); 9458 return NULL; 9459 } 9460 if (maxchar_new > maxchar_old) { 9461 /* If the maxchar increased so that the kind changed, not all 9462 characters are representable anymore and we need to fix the 9463 string again. This only happens in very few cases. */ 9464 _PyUnicode_FastCopyCharacters(v, 0, 9465 self, 0, PyUnicode_GET_LENGTH(self)); 9466 maxchar_old = fixfct(v); 9467 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9468 } 9469 else { 9470 _PyUnicode_FastCopyCharacters(v, 0, 9471 u, 0, PyUnicode_GET_LENGTH(self)); 9472 } 9473 Py_DECREF(u); 9474 assert(_PyUnicode_CheckConsistency(v, 1)); 9475 return v; 9476} 9477 9478static PyObject * 9479ascii_upper_or_lower(PyObject *self, int lower) 9480{ 9481 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9482 char *resdata, *data = PyUnicode_DATA(self); 9483 PyObject *res; 9484 9485 res = PyUnicode_New(len, 127); 9486 if (res == NULL) 9487 return NULL; 9488 resdata = PyUnicode_DATA(res); 9489 if (lower) 9490 _Py_bytes_lower(resdata, data, len); 9491 else 9492 _Py_bytes_upper(resdata, data, len); 9493 return res; 9494} 9495 9496static Py_UCS4 9497handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9498{ 9499 Py_ssize_t j; 9500 int final_sigma; 9501 Py_UCS4 c; 9502 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9503 9504 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9505 9506 where ! is a negation and \p{xxx} is a character with property xxx. 
9507 */ 9508 for (j = i - 1; j >= 0; j--) { 9509 c = PyUnicode_READ(kind, data, j); 9510 if (!_PyUnicode_IsCaseIgnorable(c)) 9511 break; 9512 } 9513 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9514 if (final_sigma) { 9515 for (j = i + 1; j < length; j++) { 9516 c = PyUnicode_READ(kind, data, j); 9517 if (!_PyUnicode_IsCaseIgnorable(c)) 9518 break; 9519 } 9520 final_sigma = j == length || !_PyUnicode_IsCased(c); 9521 } 9522 return (final_sigma) ? 0x3C2 : 0x3C3; 9523} 9524 9525static int 9526lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9527 Py_UCS4 c, Py_UCS4 *mapped) 9528{ 9529 /* Obscure special case. */ 9530 if (c == 0x3A3) { 9531 mapped[0] = handle_capital_sigma(kind, data, length, i); 9532 return 1; 9533 } 9534 return _PyUnicode_ToLowerFull(c, mapped); 9535} 9536 9537static Py_ssize_t 9538do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9539{ 9540 Py_ssize_t i, k = 0; 9541 int n_res, j; 9542 Py_UCS4 c, mapped[3]; 9543 9544 c = PyUnicode_READ(kind, data, 0); 9545 n_res = _PyUnicode_ToUpperFull(c, mapped); 9546 for (j = 0; j < n_res; j++) { 9547 *maxchar = Py_MAX(*maxchar, mapped[j]); 9548 res[k++] = mapped[j]; 9549 } 9550 for (i = 1; i < length; i++) { 9551 c = PyUnicode_READ(kind, data, i); 9552 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9553 for (j = 0; j < n_res; j++) { 9554 *maxchar = Py_MAX(*maxchar, mapped[j]); 9555 res[k++] = mapped[j]; 9556 } 9557 } 9558 return k; 9559} 9560 9561static Py_ssize_t 9562do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9563 Py_ssize_t i, k = 0; 9564 9565 for (i = 0; i < length; i++) { 9566 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9567 int n_res, j; 9568 if (Py_UNICODE_ISUPPER(c)) { 9569 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9570 } 9571 else if (Py_UNICODE_ISLOWER(c)) { 9572 n_res = _PyUnicode_ToUpperFull(c, mapped); 9573 } 9574 else { 9575 n_res = 1; 9576 mapped[0] = c; 9577 } 9578 for (j 
= 0; j < n_res; j++) { 9579 *maxchar = Py_MAX(*maxchar, mapped[j]); 9580 res[k++] = mapped[j]; 9581 } 9582 } 9583 return k; 9584} 9585 9586static Py_ssize_t 9587do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9588 Py_UCS4 *maxchar, int lower) 9589{ 9590 Py_ssize_t i, k = 0; 9591 9592 for (i = 0; i < length; i++) { 9593 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9594 int n_res, j; 9595 if (lower) 9596 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9597 else 9598 n_res = _PyUnicode_ToUpperFull(c, mapped); 9599 for (j = 0; j < n_res; j++) { 9600 *maxchar = Py_MAX(*maxchar, mapped[j]); 9601 res[k++] = mapped[j]; 9602 } 9603 } 9604 return k; 9605} 9606 9607static Py_ssize_t 9608do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9609{ 9610 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9611} 9612 9613static Py_ssize_t 9614do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9615{ 9616 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9617} 9618 9619static Py_ssize_t 9620do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9621{ 9622 Py_ssize_t i, k = 0; 9623 9624 for (i = 0; i < length; i++) { 9625 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9626 Py_UCS4 mapped[3]; 9627 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9628 for (j = 0; j < n_res; j++) { 9629 *maxchar = Py_MAX(*maxchar, mapped[j]); 9630 res[k++] = mapped[j]; 9631 } 9632 } 9633 return k; 9634} 9635 9636static Py_ssize_t 9637do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9638{ 9639 Py_ssize_t i, k = 0; 9640 int previous_is_cased; 9641 9642 previous_is_cased = 0; 9643 for (i = 0; i < length; i++) { 9644 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9645 Py_UCS4 mapped[3]; 9646 int n_res, j; 9647 9648 if (previous_is_cased) 9649 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9650 else 9651 n_res = 
_PyUnicode_ToTitleFull(c, mapped); 9652 9653 for (j = 0; j < n_res; j++) { 9654 *maxchar = Py_MAX(*maxchar, mapped[j]); 9655 res[k++] = mapped[j]; 9656 } 9657 9658 previous_is_cased = _PyUnicode_IsCased(c); 9659 } 9660 return k; 9661} 9662 9663static PyObject * 9664case_operation(PyObject *self, 9665 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9666{ 9667 PyObject *res = NULL; 9668 Py_ssize_t length, newlength = 0; 9669 int kind, outkind; 9670 void *data, *outdata; 9671 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9672 9673 assert(PyUnicode_IS_READY(self)); 9674 9675 kind = PyUnicode_KIND(self); 9676 data = PyUnicode_DATA(self); 9677 length = PyUnicode_GET_LENGTH(self); 9678 if (length > PY_SSIZE_T_MAX / 3 || 9679 length > PY_SIZE_MAX / (3 * sizeof(Py_UCS4))) { 9680 PyErr_SetString(PyExc_OverflowError, "string is too long"); 9681 return NULL; 9682 } 9683 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * (size_t)length); 9684 if (tmp == NULL) 9685 return PyErr_NoMemory(); 9686 newlength = perform(kind, data, length, tmp, &maxchar); 9687 res = PyUnicode_New(newlength, maxchar); 9688 if (res == NULL) 9689 goto leave; 9690 tmpend = tmp + newlength; 9691 outdata = PyUnicode_DATA(res); 9692 outkind = PyUnicode_KIND(res); 9693 switch (outkind) { 9694 case PyUnicode_1BYTE_KIND: 9695 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9696 break; 9697 case PyUnicode_2BYTE_KIND: 9698 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9699 break; 9700 case PyUnicode_4BYTE_KIND: 9701 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9702 break; 9703 default: 9704 assert(0); 9705 break; 9706 } 9707 leave: 9708 PyMem_FREE(tmp); 9709 return res; 9710} 9711 9712PyObject * 9713PyUnicode_Join(PyObject *separator, PyObject *seq) 9714{ 9715 PyObject *sep = NULL; 9716 Py_ssize_t seplen; 9717 PyObject *res = NULL; /* the result */ 9718 PyObject *fseq; /* PySequence_Fast(seq) */ 9719 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 
9720 PyObject **items; 9721 PyObject *item; 9722 Py_ssize_t sz, i, res_offset; 9723 Py_UCS4 maxchar; 9724 Py_UCS4 item_maxchar; 9725 int use_memcpy; 9726 unsigned char *res_data = NULL, *sep_data = NULL; 9727 PyObject *last_obj; 9728 unsigned int kind = 0; 9729 9730 fseq = PySequence_Fast(seq, "can only join an iterable"); 9731 if (fseq == NULL) { 9732 return NULL; 9733 } 9734 9735 /* NOTE: the following code can't call back into Python code, 9736 * so we are sure that fseq won't be mutated. 9737 */ 9738 9739 seqlen = PySequence_Fast_GET_SIZE(fseq); 9740 /* If empty sequence, return u"". */ 9741 if (seqlen == 0) { 9742 Py_DECREF(fseq); 9743 _Py_RETURN_UNICODE_EMPTY(); 9744 } 9745 9746 /* If singleton sequence with an exact Unicode, return that. */ 9747 last_obj = NULL; 9748 items = PySequence_Fast_ITEMS(fseq); 9749 if (seqlen == 1) { 9750 if (PyUnicode_CheckExact(items[0])) { 9751 res = items[0]; 9752 Py_INCREF(res); 9753 Py_DECREF(fseq); 9754 return res; 9755 } 9756 seplen = 0; 9757 maxchar = 0; 9758 } 9759 else { 9760 /* Set up sep and seplen */ 9761 if (separator == NULL) { 9762 /* fall back to a blank space separator */ 9763 sep = PyUnicode_FromOrdinal(' '); 9764 if (!sep) 9765 goto onError; 9766 seplen = 1; 9767 maxchar = 32; 9768 } 9769 else { 9770 if (!PyUnicode_Check(separator)) { 9771 PyErr_Format(PyExc_TypeError, 9772 "separator: expected str instance," 9773 " %.80s found", 9774 Py_TYPE(separator)->tp_name); 9775 goto onError; 9776 } 9777 if (PyUnicode_READY(separator)) 9778 goto onError; 9779 sep = separator; 9780 seplen = PyUnicode_GET_LENGTH(separator); 9781 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9782 /* inc refcount to keep this code path symmetric with the 9783 above case of a blank separator */ 9784 Py_INCREF(sep); 9785 } 9786 last_obj = sep; 9787 } 9788 9789 /* There are at least two things to join, or else we have a subclass 9790 * of str in the sequence. 
9791 * Do a pre-pass to figure out the total amount of space we'll 9792 * need (sz), and see whether all argument are strings. 9793 */ 9794 sz = 0; 9795#ifdef Py_DEBUG 9796 use_memcpy = 0; 9797#else 9798 use_memcpy = 1; 9799#endif 9800 for (i = 0; i < seqlen; i++) { 9801 const Py_ssize_t old_sz = sz; 9802 item = items[i]; 9803 if (!PyUnicode_Check(item)) { 9804 PyErr_Format(PyExc_TypeError, 9805 "sequence item %zd: expected str instance," 9806 " %.80s found", 9807 i, Py_TYPE(item)->tp_name); 9808 goto onError; 9809 } 9810 if (PyUnicode_READY(item) == -1) 9811 goto onError; 9812 sz += PyUnicode_GET_LENGTH(item); 9813 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9814 maxchar = Py_MAX(maxchar, item_maxchar); 9815 if (i != 0) 9816 sz += seplen; 9817 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9818 PyErr_SetString(PyExc_OverflowError, 9819 "join() result is too long for a Python string"); 9820 goto onError; 9821 } 9822 if (use_memcpy && last_obj != NULL) { 9823 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9824 use_memcpy = 0; 9825 } 9826 last_obj = item; 9827 } 9828 9829 res = PyUnicode_New(sz, maxchar); 9830 if (res == NULL) 9831 goto onError; 9832 9833 /* Catenate everything. */ 9834#ifdef Py_DEBUG 9835 use_memcpy = 0; 9836#else 9837 if (use_memcpy) { 9838 res_data = PyUnicode_1BYTE_DATA(res); 9839 kind = PyUnicode_KIND(res); 9840 if (seplen != 0) 9841 sep_data = PyUnicode_1BYTE_DATA(sep); 9842 } 9843#endif 9844 if (use_memcpy) { 9845 for (i = 0; i < seqlen; ++i) { 9846 Py_ssize_t itemlen; 9847 item = items[i]; 9848 9849 /* Copy item, and maybe the separator. 
*/ 9850 if (i && seplen != 0) { 9851 Py_MEMCPY(res_data, 9852 sep_data, 9853 kind * seplen); 9854 res_data += kind * seplen; 9855 } 9856 9857 itemlen = PyUnicode_GET_LENGTH(item); 9858 if (itemlen != 0) { 9859 Py_MEMCPY(res_data, 9860 PyUnicode_DATA(item), 9861 kind * itemlen); 9862 res_data += kind * itemlen; 9863 } 9864 } 9865 assert(res_data == PyUnicode_1BYTE_DATA(res) 9866 + kind * PyUnicode_GET_LENGTH(res)); 9867 } 9868 else { 9869 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9870 Py_ssize_t itemlen; 9871 item = items[i]; 9872 9873 /* Copy item, and maybe the separator. */ 9874 if (i && seplen != 0) { 9875 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9876 res_offset += seplen; 9877 } 9878 9879 itemlen = PyUnicode_GET_LENGTH(item); 9880 if (itemlen != 0) { 9881 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9882 res_offset += itemlen; 9883 } 9884 } 9885 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9886 } 9887 9888 Py_DECREF(fseq); 9889 Py_XDECREF(sep); 9890 assert(_PyUnicode_CheckConsistency(res, 1)); 9891 return res; 9892 9893 onError: 9894 Py_DECREF(fseq); 9895 Py_XDECREF(sep); 9896 Py_XDECREF(res); 9897 return NULL; 9898} 9899 9900#define FILL(kind, data, value, start, length) \ 9901 do { \ 9902 Py_ssize_t i_ = 0; \ 9903 assert(kind != PyUnicode_WCHAR_KIND); \ 9904 switch ((kind)) { \ 9905 case PyUnicode_1BYTE_KIND: { \ 9906 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9907 memset(to_, (unsigned char)value, (length)); \ 9908 break; \ 9909 } \ 9910 case PyUnicode_2BYTE_KIND: { \ 9911 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9912 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9913 break; \ 9914 } \ 9915 case PyUnicode_4BYTE_KIND: { \ 9916 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9917 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9918 break; \ 9919 default: assert(0); \ 9920 } \ 9921 } \ 9922 } while (0) 9923 9924void 9925_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t 
start, Py_ssize_t length, 9926 Py_UCS4 fill_char) 9927{ 9928 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9929 const void *data = PyUnicode_DATA(unicode); 9930 assert(PyUnicode_IS_READY(unicode)); 9931 assert(unicode_modifiable(unicode)); 9932 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9933 assert(start >= 0); 9934 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9935 FILL(kind, data, fill_char, start, length); 9936} 9937 9938Py_ssize_t 9939PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9940 Py_UCS4 fill_char) 9941{ 9942 Py_ssize_t maxlen; 9943 9944 if (!PyUnicode_Check(unicode)) { 9945 PyErr_BadInternalCall(); 9946 return -1; 9947 } 9948 if (PyUnicode_READY(unicode) == -1) 9949 return -1; 9950 if (unicode_check_modifiable(unicode)) 9951 return -1; 9952 9953 if (start < 0) { 9954 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9955 return -1; 9956 } 9957 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9958 PyErr_SetString(PyExc_ValueError, 9959 "fill character is bigger than " 9960 "the string maximum character"); 9961 return -1; 9962 } 9963 9964 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9965 length = Py_MIN(maxlen, length); 9966 if (length <= 0) 9967 return 0; 9968 9969 _PyUnicode_FastFill(unicode, start, length, fill_char); 9970 return length; 9971} 9972 9973static PyObject * 9974pad(PyObject *self, 9975 Py_ssize_t left, 9976 Py_ssize_t right, 9977 Py_UCS4 fill) 9978{ 9979 PyObject *u; 9980 Py_UCS4 maxchar; 9981 int kind; 9982 void *data; 9983 9984 if (left < 0) 9985 left = 0; 9986 if (right < 0) 9987 right = 0; 9988 9989 if (left == 0 && right == 0) 9990 return unicode_result_unchanged(self); 9991 9992 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9993 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9994 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9995 return NULL; 9996 } 9997 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9998 maxchar = 
Py_MAX(maxchar, fill); 9999 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 10000 if (!u) 10001 return NULL; 10002 10003 kind = PyUnicode_KIND(u); 10004 data = PyUnicode_DATA(u); 10005 if (left) 10006 FILL(kind, data, fill, 0, left); 10007 if (right) 10008 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 10009 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 10010 assert(_PyUnicode_CheckConsistency(u, 1)); 10011 return u; 10012} 10013 10014PyObject * 10015PyUnicode_Splitlines(PyObject *string, int keepends) 10016{ 10017 PyObject *list; 10018 10019 string = PyUnicode_FromObject(string); 10020 if (string == NULL) 10021 return NULL; 10022 if (PyUnicode_READY(string) == -1) { 10023 Py_DECREF(string); 10024 return NULL; 10025 } 10026 10027 switch (PyUnicode_KIND(string)) { 10028 case PyUnicode_1BYTE_KIND: 10029 if (PyUnicode_IS_ASCII(string)) 10030 list = asciilib_splitlines( 10031 string, PyUnicode_1BYTE_DATA(string), 10032 PyUnicode_GET_LENGTH(string), keepends); 10033 else 10034 list = ucs1lib_splitlines( 10035 string, PyUnicode_1BYTE_DATA(string), 10036 PyUnicode_GET_LENGTH(string), keepends); 10037 break; 10038 case PyUnicode_2BYTE_KIND: 10039 list = ucs2lib_splitlines( 10040 string, PyUnicode_2BYTE_DATA(string), 10041 PyUnicode_GET_LENGTH(string), keepends); 10042 break; 10043 case PyUnicode_4BYTE_KIND: 10044 list = ucs4lib_splitlines( 10045 string, PyUnicode_4BYTE_DATA(string), 10046 PyUnicode_GET_LENGTH(string), keepends); 10047 break; 10048 default: 10049 assert(0); 10050 list = 0; 10051 } 10052 Py_DECREF(string); 10053 return list; 10054} 10055 10056static PyObject * 10057split(PyObject *self, 10058 PyObject *substring, 10059 Py_ssize_t maxcount) 10060{ 10061 int kind1, kind2, kind; 10062 void *buf1, *buf2; 10063 Py_ssize_t len1, len2; 10064 PyObject* out; 10065 10066 if (maxcount < 0) 10067 maxcount = PY_SSIZE_T_MAX; 10068 10069 if (PyUnicode_READY(self) == -1) 10070 return NULL; 10071 10072 if 
(substring == NULL) 10073 switch (PyUnicode_KIND(self)) { 10074 case PyUnicode_1BYTE_KIND: 10075 if (PyUnicode_IS_ASCII(self)) 10076 return asciilib_split_whitespace( 10077 self, PyUnicode_1BYTE_DATA(self), 10078 PyUnicode_GET_LENGTH(self), maxcount 10079 ); 10080 else 10081 return ucs1lib_split_whitespace( 10082 self, PyUnicode_1BYTE_DATA(self), 10083 PyUnicode_GET_LENGTH(self), maxcount 10084 ); 10085 case PyUnicode_2BYTE_KIND: 10086 return ucs2lib_split_whitespace( 10087 self, PyUnicode_2BYTE_DATA(self), 10088 PyUnicode_GET_LENGTH(self), maxcount 10089 ); 10090 case PyUnicode_4BYTE_KIND: 10091 return ucs4lib_split_whitespace( 10092 self, PyUnicode_4BYTE_DATA(self), 10093 PyUnicode_GET_LENGTH(self), maxcount 10094 ); 10095 default: 10096 assert(0); 10097 return NULL; 10098 } 10099 10100 if (PyUnicode_READY(substring) == -1) 10101 return NULL; 10102 10103 kind1 = PyUnicode_KIND(self); 10104 kind2 = PyUnicode_KIND(substring); 10105 kind = kind1 > kind2 ? kind1 : kind2; 10106 buf1 = PyUnicode_DATA(self); 10107 buf2 = PyUnicode_DATA(substring); 10108 if (kind1 != kind) 10109 buf1 = _PyUnicode_AsKind(self, kind); 10110 if (!buf1) 10111 return NULL; 10112 if (kind2 != kind) 10113 buf2 = _PyUnicode_AsKind(substring, kind); 10114 if (!buf2) { 10115 if (kind1 != kind) PyMem_Free(buf1); 10116 return NULL; 10117 } 10118 len1 = PyUnicode_GET_LENGTH(self); 10119 len2 = PyUnicode_GET_LENGTH(substring); 10120 10121 switch (kind) { 10122 case PyUnicode_1BYTE_KIND: 10123 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10124 out = asciilib_split( 10125 self, buf1, len1, buf2, len2, maxcount); 10126 else 10127 out = ucs1lib_split( 10128 self, buf1, len1, buf2, len2, maxcount); 10129 break; 10130 case PyUnicode_2BYTE_KIND: 10131 out = ucs2lib_split( 10132 self, buf1, len1, buf2, len2, maxcount); 10133 break; 10134 case PyUnicode_4BYTE_KIND: 10135 out = ucs4lib_split( 10136 self, buf1, len1, buf2, len2, maxcount); 10137 break; 10138 default: 10139 out = NULL; 10140 } 
10141 if (kind1 != kind) 10142 PyMem_Free(buf1); 10143 if (kind2 != kind) 10144 PyMem_Free(buf2); 10145 return out; 10146} 10147 10148static PyObject * 10149rsplit(PyObject *self, 10150 PyObject *substring, 10151 Py_ssize_t maxcount) 10152{ 10153 int kind1, kind2, kind; 10154 void *buf1, *buf2; 10155 Py_ssize_t len1, len2; 10156 PyObject* out; 10157 10158 if (maxcount < 0) 10159 maxcount = PY_SSIZE_T_MAX; 10160 10161 if (PyUnicode_READY(self) == -1) 10162 return NULL; 10163 10164 if (substring == NULL) 10165 switch (PyUnicode_KIND(self)) { 10166 case PyUnicode_1BYTE_KIND: 10167 if (PyUnicode_IS_ASCII(self)) 10168 return asciilib_rsplit_whitespace( 10169 self, PyUnicode_1BYTE_DATA(self), 10170 PyUnicode_GET_LENGTH(self), maxcount 10171 ); 10172 else 10173 return ucs1lib_rsplit_whitespace( 10174 self, PyUnicode_1BYTE_DATA(self), 10175 PyUnicode_GET_LENGTH(self), maxcount 10176 ); 10177 case PyUnicode_2BYTE_KIND: 10178 return ucs2lib_rsplit_whitespace( 10179 self, PyUnicode_2BYTE_DATA(self), 10180 PyUnicode_GET_LENGTH(self), maxcount 10181 ); 10182 case PyUnicode_4BYTE_KIND: 10183 return ucs4lib_rsplit_whitespace( 10184 self, PyUnicode_4BYTE_DATA(self), 10185 PyUnicode_GET_LENGTH(self), maxcount 10186 ); 10187 default: 10188 assert(0); 10189 return NULL; 10190 } 10191 10192 if (PyUnicode_READY(substring) == -1) 10193 return NULL; 10194 10195 kind1 = PyUnicode_KIND(self); 10196 kind2 = PyUnicode_KIND(substring); 10197 kind = kind1 > kind2 ? 
kind1 : kind2; 10198 buf1 = PyUnicode_DATA(self); 10199 buf2 = PyUnicode_DATA(substring); 10200 if (kind1 != kind) 10201 buf1 = _PyUnicode_AsKind(self, kind); 10202 if (!buf1) 10203 return NULL; 10204 if (kind2 != kind) 10205 buf2 = _PyUnicode_AsKind(substring, kind); 10206 if (!buf2) { 10207 if (kind1 != kind) PyMem_Free(buf1); 10208 return NULL; 10209 } 10210 len1 = PyUnicode_GET_LENGTH(self); 10211 len2 = PyUnicode_GET_LENGTH(substring); 10212 10213 switch (kind) { 10214 case PyUnicode_1BYTE_KIND: 10215 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10216 out = asciilib_rsplit( 10217 self, buf1, len1, buf2, len2, maxcount); 10218 else 10219 out = ucs1lib_rsplit( 10220 self, buf1, len1, buf2, len2, maxcount); 10221 break; 10222 case PyUnicode_2BYTE_KIND: 10223 out = ucs2lib_rsplit( 10224 self, buf1, len1, buf2, len2, maxcount); 10225 break; 10226 case PyUnicode_4BYTE_KIND: 10227 out = ucs4lib_rsplit( 10228 self, buf1, len1, buf2, len2, maxcount); 10229 break; 10230 default: 10231 out = NULL; 10232 } 10233 if (kind1 != kind) 10234 PyMem_Free(buf1); 10235 if (kind2 != kind) 10236 PyMem_Free(buf2); 10237 return out; 10238} 10239 10240static Py_ssize_t 10241anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10242 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10243{ 10244 switch (kind) { 10245 case PyUnicode_1BYTE_KIND: 10246 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10247 return asciilib_find(buf1, len1, buf2, len2, offset); 10248 else 10249 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10250 case PyUnicode_2BYTE_KIND: 10251 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10252 case PyUnicode_4BYTE_KIND: 10253 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10254 } 10255 assert(0); 10256 return -1; 10257} 10258 10259static Py_ssize_t 10260anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10261 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10262{ 
10263 switch (kind) { 10264 case PyUnicode_1BYTE_KIND: 10265 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10266 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10267 else 10268 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10269 case PyUnicode_2BYTE_KIND: 10270 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10271 case PyUnicode_4BYTE_KIND: 10272 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10273 } 10274 assert(0); 10275 return 0; 10276} 10277 10278static void 10279replace_1char_inplace(PyObject *u, Py_ssize_t pos, 10280 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) 10281{ 10282 int kind = PyUnicode_KIND(u); 10283 void *data = PyUnicode_DATA(u); 10284 Py_ssize_t len = PyUnicode_GET_LENGTH(u); 10285 if (kind == PyUnicode_1BYTE_KIND) { 10286 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos, 10287 (Py_UCS1 *)data + len, 10288 u1, u2, maxcount); 10289 } 10290 else if (kind == PyUnicode_2BYTE_KIND) { 10291 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos, 10292 (Py_UCS2 *)data + len, 10293 u1, u2, maxcount); 10294 } 10295 else { 10296 assert(kind == PyUnicode_4BYTE_KIND); 10297 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos, 10298 (Py_UCS4 *)data + len, 10299 u1, u2, maxcount); 10300 } 10301} 10302 10303static PyObject * 10304replace(PyObject *self, PyObject *str1, 10305 PyObject *str2, Py_ssize_t maxcount) 10306{ 10307 PyObject *u; 10308 char *sbuf = PyUnicode_DATA(self); 10309 char *buf1 = PyUnicode_DATA(str1); 10310 char *buf2 = PyUnicode_DATA(str2); 10311 int srelease = 0, release1 = 0, release2 = 0; 10312 int skind = PyUnicode_KIND(self); 10313 int kind1 = PyUnicode_KIND(str1); 10314 int kind2 = PyUnicode_KIND(str2); 10315 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10316 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10317 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10318 int mayshrink; 10319 Py_UCS4 maxchar, maxchar_str1, maxchar_str2; 10320 10321 if (maxcount < 0) 10322 maxcount = PY_SSIZE_T_MAX; 
10323 else if (maxcount == 0 || slen == 0) 10324 goto nothing; 10325 10326 if (str1 == str2) 10327 goto nothing; 10328 10329 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10330 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1); 10331 if (maxchar < maxchar_str1) 10332 /* substring too wide to be present */ 10333 goto nothing; 10334 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10335 /* Replacing str1 with str2 may cause a maxchar reduction in the 10336 result string. */ 10337 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1); 10338 maxchar = Py_MAX(maxchar, maxchar_str2); 10339 10340 if (len1 == len2) { 10341 /* same length */ 10342 if (len1 == 0) 10343 goto nothing; 10344 if (len1 == 1) { 10345 /* replace characters */ 10346 Py_UCS4 u1, u2; 10347 Py_ssize_t pos; 10348 10349 u1 = PyUnicode_READ(kind1, buf1, 0); 10350 pos = findchar(sbuf, skind, slen, u1, 1); 10351 if (pos < 0) 10352 goto nothing; 10353 u2 = PyUnicode_READ(kind2, buf2, 0); 10354 u = PyUnicode_New(slen, maxchar); 10355 if (!u) 10356 goto error; 10357 10358 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10359 replace_1char_inplace(u, pos, u1, u2, maxcount); 10360 } 10361 else { 10362 int rkind = skind; 10363 char *res; 10364 Py_ssize_t i; 10365 10366 if (kind1 < rkind) { 10367 /* widen substring */ 10368 buf1 = _PyUnicode_AsKind(str1, rkind); 10369 if (!buf1) goto error; 10370 release1 = 1; 10371 } 10372 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10373 if (i < 0) 10374 goto nothing; 10375 if (rkind > kind2) { 10376 /* widen replacement */ 10377 buf2 = _PyUnicode_AsKind(str2, rkind); 10378 if (!buf2) goto error; 10379 release2 = 1; 10380 } 10381 else if (rkind < kind2) { 10382 /* widen self and buf1 */ 10383 rkind = kind2; 10384 if (release1) PyMem_Free(buf1); 10385 release1 = 0; 10386 sbuf = _PyUnicode_AsKind(self, rkind); 10387 if (!sbuf) goto error; 10388 srelease = 1; 10389 buf1 = _PyUnicode_AsKind(str1, rkind); 10390 if (!buf1) goto error; 10391 release1 = 1; 
10392 } 10393 u = PyUnicode_New(slen, maxchar); 10394 if (!u) 10395 goto error; 10396 assert(PyUnicode_KIND(u) == rkind); 10397 res = PyUnicode_DATA(u); 10398 10399 memcpy(res, sbuf, rkind * slen); 10400 /* change everything in-place, starting with this one */ 10401 memcpy(res + rkind * i, 10402 buf2, 10403 rkind * len2); 10404 i += len1; 10405 10406 while ( --maxcount > 0) { 10407 i = anylib_find(rkind, self, 10408 sbuf+rkind*i, slen-i, 10409 str1, buf1, len1, i); 10410 if (i == -1) 10411 break; 10412 memcpy(res + rkind * i, 10413 buf2, 10414 rkind * len2); 10415 i += len1; 10416 } 10417 } 10418 } 10419 else { 10420 Py_ssize_t n, i, j, ires; 10421 Py_ssize_t new_size; 10422 int rkind = skind; 10423 char *res; 10424 10425 if (kind1 < rkind) { 10426 /* widen substring */ 10427 buf1 = _PyUnicode_AsKind(str1, rkind); 10428 if (!buf1) goto error; 10429 release1 = 1; 10430 } 10431 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10432 if (n == 0) 10433 goto nothing; 10434 if (kind2 < rkind) { 10435 /* widen replacement */ 10436 buf2 = _PyUnicode_AsKind(str2, rkind); 10437 if (!buf2) goto error; 10438 release2 = 1; 10439 } 10440 else if (kind2 > rkind) { 10441 /* widen self and buf1 */ 10442 rkind = kind2; 10443 sbuf = _PyUnicode_AsKind(self, rkind); 10444 if (!sbuf) goto error; 10445 srelease = 1; 10446 if (release1) PyMem_Free(buf1); 10447 release1 = 0; 10448 buf1 = _PyUnicode_AsKind(str1, rkind); 10449 if (!buf1) goto error; 10450 release1 = 1; 10451 } 10452 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10453 PyUnicode_GET_LENGTH(str1))); */ 10454 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10455 PyErr_SetString(PyExc_OverflowError, 10456 "replace string is too long"); 10457 goto error; 10458 } 10459 new_size = slen + n * (len2 - len1); 10460 if (new_size == 0) { 10461 _Py_INCREF_UNICODE_EMPTY(); 10462 if (!unicode_empty) 10463 goto error; 10464 u = unicode_empty; 10465 goto done; 10466 } 10467 
if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10468 PyErr_SetString(PyExc_OverflowError, 10469 "replace string is too long"); 10470 goto error; 10471 } 10472 u = PyUnicode_New(new_size, maxchar); 10473 if (!u) 10474 goto error; 10475 assert(PyUnicode_KIND(u) == rkind); 10476 res = PyUnicode_DATA(u); 10477 ires = i = 0; 10478 if (len1 > 0) { 10479 while (n-- > 0) { 10480 /* look for next match */ 10481 j = anylib_find(rkind, self, 10482 sbuf + rkind * i, slen-i, 10483 str1, buf1, len1, i); 10484 if (j == -1) 10485 break; 10486 else if (j > i) { 10487 /* copy unchanged part [i:j] */ 10488 memcpy(res + rkind * ires, 10489 sbuf + rkind * i, 10490 rkind * (j-i)); 10491 ires += j - i; 10492 } 10493 /* copy substitution string */ 10494 if (len2 > 0) { 10495 memcpy(res + rkind * ires, 10496 buf2, 10497 rkind * len2); 10498 ires += len2; 10499 } 10500 i = j + len1; 10501 } 10502 if (i < slen) 10503 /* copy tail [i:] */ 10504 memcpy(res + rkind * ires, 10505 sbuf + rkind * i, 10506 rkind * (slen-i)); 10507 } 10508 else { 10509 /* interleave */ 10510 while (n > 0) { 10511 memcpy(res + rkind * ires, 10512 buf2, 10513 rkind * len2); 10514 ires += len2; 10515 if (--n <= 0) 10516 break; 10517 memcpy(res + rkind * ires, 10518 sbuf + rkind * i, 10519 rkind); 10520 ires++; 10521 i++; 10522 } 10523 memcpy(res + rkind * ires, 10524 sbuf + rkind * i, 10525 rkind * (slen-i)); 10526 } 10527 } 10528 10529 if (mayshrink) { 10530 unicode_adjust_maxchar(&u); 10531 if (u == NULL) 10532 goto error; 10533 } 10534 10535 done: 10536 if (srelease) 10537 PyMem_FREE(sbuf); 10538 if (release1) 10539 PyMem_FREE(buf1); 10540 if (release2) 10541 PyMem_FREE(buf2); 10542 assert(_PyUnicode_CheckConsistency(u, 1)); 10543 return u; 10544 10545 nothing: 10546 /* nothing to replace; return original string (when possible) */ 10547 if (srelease) 10548 PyMem_FREE(sbuf); 10549 if (release1) 10550 PyMem_FREE(buf1); 10551 if (release2) 10552 PyMem_FREE(buf2); 10553 return unicode_result_unchanged(self); 10554 
10555 error: 10556 if (srelease && sbuf) 10557 PyMem_FREE(sbuf); 10558 if (release1 && buf1) 10559 PyMem_FREE(buf1); 10560 if (release2 && buf2) 10561 PyMem_FREE(buf2); 10562 return NULL; 10563} 10564 10565/* --- Unicode Object Methods --------------------------------------------- */ 10566 10567PyDoc_STRVAR(title__doc__, 10568 "S.title() -> str\n\ 10569\n\ 10570Return a titlecased version of S, i.e. words start with title case\n\ 10571characters, all remaining cased characters have lower case."); 10572 10573static PyObject* 10574unicode_title(PyObject *self) 10575{ 10576 if (PyUnicode_READY(self) == -1) 10577 return NULL; 10578 return case_operation(self, do_title); 10579} 10580 10581PyDoc_STRVAR(capitalize__doc__, 10582 "S.capitalize() -> str\n\ 10583\n\ 10584Return a capitalized version of S, i.e. make the first character\n\ 10585have upper case and the rest lower case."); 10586 10587static PyObject* 10588unicode_capitalize(PyObject *self) 10589{ 10590 if (PyUnicode_READY(self) == -1) 10591 return NULL; 10592 if (PyUnicode_GET_LENGTH(self) == 0) 10593 return unicode_result_unchanged(self); 10594 return case_operation(self, do_capitalize); 10595} 10596 10597PyDoc_STRVAR(casefold__doc__, 10598 "S.casefold() -> str\n\ 10599\n\ 10600Return a version of S suitable for caseless comparisons."); 10601 10602static PyObject * 10603unicode_casefold(PyObject *self) 10604{ 10605 if (PyUnicode_READY(self) == -1) 10606 return NULL; 10607 if (PyUnicode_IS_ASCII(self)) 10608 return ascii_upper_or_lower(self, 1); 10609 return case_operation(self, do_casefold); 10610} 10611 10612 10613/* Argument converter. 
Coerces to a single unicode character */ 10614 10615static int 10616convert_uc(PyObject *obj, void *addr) 10617{ 10618 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10619 PyObject *uniobj; 10620 10621 uniobj = PyUnicode_FromObject(obj); 10622 if (uniobj == NULL) { 10623 PyErr_SetString(PyExc_TypeError, 10624 "The fill character cannot be converted to Unicode"); 10625 return 0; 10626 } 10627 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10628 PyErr_SetString(PyExc_TypeError, 10629 "The fill character must be exactly one character long"); 10630 Py_DECREF(uniobj); 10631 return 0; 10632 } 10633 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10634 Py_DECREF(uniobj); 10635 return 1; 10636} 10637 10638PyDoc_STRVAR(center__doc__, 10639 "S.center(width[, fillchar]) -> str\n\ 10640\n\ 10641Return S centered in a string of length width. Padding is\n\ 10642done using the specified fill character (default is a space)"); 10643 10644static PyObject * 10645unicode_center(PyObject *self, PyObject *args) 10646{ 10647 Py_ssize_t marg, left; 10648 Py_ssize_t width; 10649 Py_UCS4 fillchar = ' '; 10650 10651 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10652 return NULL; 10653 10654 if (PyUnicode_READY(self) == -1) 10655 return NULL; 10656 10657 if (PyUnicode_GET_LENGTH(self) >= width) 10658 return unicode_result_unchanged(self); 10659 10660 marg = width - PyUnicode_GET_LENGTH(self); 10661 left = marg / 2 + (marg & width & 1); 10662 10663 return pad(self, left, marg - left, fillchar); 10664} 10665 10666/* This function assumes that str1 and str2 are readied by the caller. */ 10667 10668static int 10669unicode_compare(PyObject *str1, PyObject *str2) 10670{ 10671#define COMPARE(TYPE1, TYPE2) \ 10672 do { \ 10673 TYPE1* p1 = (TYPE1 *)data1; \ 10674 TYPE2* p2 = (TYPE2 *)data2; \ 10675 TYPE1* end = p1 + len; \ 10676 Py_UCS4 c1, c2; \ 10677 for (; p1 != end; p1++, p2++) { \ 10678 c1 = *p1; \ 10679 c2 = *p2; \ 10680 if (c1 != c2) \ 10681 return (c1 < c2) ? 
-1 : 1; \ 10682 } \ 10683 } \ 10684 while (0) 10685 10686 int kind1, kind2; 10687 void *data1, *data2; 10688 Py_ssize_t len1, len2, len; 10689 10690 kind1 = PyUnicode_KIND(str1); 10691 kind2 = PyUnicode_KIND(str2); 10692 data1 = PyUnicode_DATA(str1); 10693 data2 = PyUnicode_DATA(str2); 10694 len1 = PyUnicode_GET_LENGTH(str1); 10695 len2 = PyUnicode_GET_LENGTH(str2); 10696 len = Py_MIN(len1, len2); 10697 10698 switch(kind1) { 10699 case PyUnicode_1BYTE_KIND: 10700 { 10701 switch(kind2) { 10702 case PyUnicode_1BYTE_KIND: 10703 { 10704 int cmp = memcmp(data1, data2, len); 10705 /* normalize result of memcmp() into the range [-1; 1] */ 10706 if (cmp < 0) 10707 return -1; 10708 if (cmp > 0) 10709 return 1; 10710 break; 10711 } 10712 case PyUnicode_2BYTE_KIND: 10713 COMPARE(Py_UCS1, Py_UCS2); 10714 break; 10715 case PyUnicode_4BYTE_KIND: 10716 COMPARE(Py_UCS1, Py_UCS4); 10717 break; 10718 default: 10719 assert(0); 10720 } 10721 break; 10722 } 10723 case PyUnicode_2BYTE_KIND: 10724 { 10725 switch(kind2) { 10726 case PyUnicode_1BYTE_KIND: 10727 COMPARE(Py_UCS2, Py_UCS1); 10728 break; 10729 case PyUnicode_2BYTE_KIND: 10730 { 10731 COMPARE(Py_UCS2, Py_UCS2); 10732 break; 10733 } 10734 case PyUnicode_4BYTE_KIND: 10735 COMPARE(Py_UCS2, Py_UCS4); 10736 break; 10737 default: 10738 assert(0); 10739 } 10740 break; 10741 } 10742 case PyUnicode_4BYTE_KIND: 10743 { 10744 switch(kind2) { 10745 case PyUnicode_1BYTE_KIND: 10746 COMPARE(Py_UCS4, Py_UCS1); 10747 break; 10748 case PyUnicode_2BYTE_KIND: 10749 COMPARE(Py_UCS4, Py_UCS2); 10750 break; 10751 case PyUnicode_4BYTE_KIND: 10752 { 10753#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4 10754 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len); 10755 /* normalize result of wmemcmp() into the range [-1; 1] */ 10756 if (cmp < 0) 10757 return -1; 10758 if (cmp > 0) 10759 return 1; 10760#else 10761 COMPARE(Py_UCS4, Py_UCS4); 10762#endif 10763 break; 10764 } 10765 default: 10766 assert(0); 10767 } 10768 break; 10769 } 10770 default: 
10771 assert(0); 10772 } 10773 10774 if (len1 == len2) 10775 return 0; 10776 if (len1 < len2) 10777 return -1; 10778 else 10779 return 1; 10780 10781#undef COMPARE 10782} 10783 10784Py_LOCAL(int) 10785unicode_compare_eq(PyObject *str1, PyObject *str2) 10786{ 10787 int kind; 10788 void *data1, *data2; 10789 Py_ssize_t len; 10790 int cmp; 10791 10792 len = PyUnicode_GET_LENGTH(str1); 10793 if (PyUnicode_GET_LENGTH(str2) != len) 10794 return 0; 10795 kind = PyUnicode_KIND(str1); 10796 if (PyUnicode_KIND(str2) != kind) 10797 return 0; 10798 data1 = PyUnicode_DATA(str1); 10799 data2 = PyUnicode_DATA(str2); 10800 10801 cmp = memcmp(data1, data2, len * kind); 10802 return (cmp == 0); 10803} 10804 10805 10806int 10807PyUnicode_Compare(PyObject *left, PyObject *right) 10808{ 10809 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10810 if (PyUnicode_READY(left) == -1 || 10811 PyUnicode_READY(right) == -1) 10812 return -1; 10813 10814 /* a string is equal to itself */ 10815 if (left == right) 10816 return 0; 10817 10818 return unicode_compare(left, right); 10819 } 10820 PyErr_Format(PyExc_TypeError, 10821 "Can't compare %.100s and %.100s", 10822 left->ob_type->tp_name, 10823 right->ob_type->tp_name); 10824 return -1; 10825} 10826 10827int 10828_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right) 10829{ 10830 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */ 10831 if (right_str == NULL) 10832 return -1; 10833 return PyUnicode_Compare(left, right_str); 10834} 10835 10836int 10837PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10838{ 10839 Py_ssize_t i; 10840 int kind; 10841 Py_UCS4 chr; 10842 10843 assert(_PyUnicode_CHECK(uni)); 10844 if (PyUnicode_READY(uni) == -1) 10845 return -1; 10846 kind = PyUnicode_KIND(uni); 10847 if (kind == PyUnicode_1BYTE_KIND) { 10848 const void *data = PyUnicode_1BYTE_DATA(uni); 10849 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni); 10850 size_t len, len2 = strlen(str); 10851 int cmp; 10852 10853 len = 
Py_MIN(len1, len2); 10854 cmp = memcmp(data, str, len); 10855 if (cmp != 0) { 10856 if (cmp < 0) 10857 return -1; 10858 else 10859 return 1; 10860 } 10861 if (len1 > len2) 10862 return 1; /* uni is longer */ 10863 if (len2 > len1) 10864 return -1; /* str is longer */ 10865 return 0; 10866 } 10867 else { 10868 void *data = PyUnicode_DATA(uni); 10869 /* Compare Unicode string and source character set string */ 10870 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10871 if (chr != (unsigned char)str[i]) 10872 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10873 /* This check keeps Python strings that end in '\0' from comparing equal 10874 to C strings identical up to that point. */ 10875 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10876 return 1; /* uni is longer */ 10877 if (str[i]) 10878 return -1; /* str is longer */ 10879 return 0; 10880 } 10881} 10882 10883 10884#define TEST_COND(cond) \ 10885 ((cond) ? Py_True : Py_False) 10886 10887PyObject * 10888PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10889{ 10890 int result; 10891 PyObject *v; 10892 10893 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 10894 Py_RETURN_NOTIMPLEMENTED; 10895 10896 if (PyUnicode_READY(left) == -1 || 10897 PyUnicode_READY(right) == -1) 10898 return NULL; 10899 10900 if (left == right) { 10901 switch (op) { 10902 case Py_EQ: 10903 case Py_LE: 10904 case Py_GE: 10905 /* a string is equal to itself */ 10906 v = Py_True; 10907 break; 10908 case Py_NE: 10909 case Py_LT: 10910 case Py_GT: 10911 v = Py_False; 10912 break; 10913 default: 10914 PyErr_BadArgument(); 10915 return NULL; 10916 } 10917 } 10918 else if (op == Py_EQ || op == Py_NE) { 10919 result = unicode_compare_eq(left, right); 10920 result ^= (op == Py_NE); 10921 v = TEST_COND(result); 10922 } 10923 else { 10924 result = unicode_compare(left, right); 10925 10926 /* Convert the return value to a Boolean */ 10927 switch (op) { 10928 case Py_LE: 10929 v = TEST_COND(result <= 0); 10930 break; 
10931 case Py_GE: 10932 v = TEST_COND(result >= 0); 10933 break; 10934 case Py_LT: 10935 v = TEST_COND(result == -1); 10936 break; 10937 case Py_GT: 10938 v = TEST_COND(result == 1); 10939 break; 10940 default: 10941 PyErr_BadArgument(); 10942 return NULL; 10943 } 10944 } 10945 Py_INCREF(v); 10946 return v; 10947} 10948 10949int 10950PyUnicode_Contains(PyObject *container, PyObject *element) 10951{ 10952 PyObject *str, *sub; 10953 int kind1, kind2; 10954 void *buf1, *buf2; 10955 Py_ssize_t len1, len2; 10956 int result; 10957 10958 /* Coerce the two arguments */ 10959 sub = PyUnicode_FromObject(element); 10960 if (!sub) { 10961 PyErr_Format(PyExc_TypeError, 10962 "'in <string>' requires string as left operand, not %s", 10963 element->ob_type->tp_name); 10964 return -1; 10965 } 10966 10967 str = PyUnicode_FromObject(container); 10968 if (!str) { 10969 Py_DECREF(sub); 10970 return -1; 10971 } 10972 10973 kind1 = PyUnicode_KIND(str); 10974 kind2 = PyUnicode_KIND(sub); 10975 buf1 = PyUnicode_DATA(str); 10976 buf2 = PyUnicode_DATA(sub); 10977 if (kind2 != kind1) { 10978 if (kind2 > kind1) { 10979 Py_DECREF(sub); 10980 Py_DECREF(str); 10981 return 0; 10982 } 10983 buf2 = _PyUnicode_AsKind(sub, kind1); 10984 } 10985 if (!buf2) { 10986 Py_DECREF(sub); 10987 Py_DECREF(str); 10988 return -1; 10989 } 10990 len1 = PyUnicode_GET_LENGTH(str); 10991 len2 = PyUnicode_GET_LENGTH(sub); 10992 10993 switch (kind1) { 10994 case PyUnicode_1BYTE_KIND: 10995 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10996 break; 10997 case PyUnicode_2BYTE_KIND: 10998 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10999 break; 11000 case PyUnicode_4BYTE_KIND: 11001 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 11002 break; 11003 default: 11004 result = -1; 11005 assert(0); 11006 } 11007 11008 Py_DECREF(str); 11009 Py_DECREF(sub); 11010 11011 if (kind2 != kind1) 11012 PyMem_Free(buf2); 11013 11014 return result; 11015} 11016 11017/* Concat to string or Unicode object giving a 
new Unicode object. */ 11018 11019PyObject * 11020PyUnicode_Concat(PyObject *left, PyObject *right) 11021{ 11022 PyObject *u = NULL, *v = NULL, *w; 11023 Py_UCS4 maxchar, maxchar2; 11024 Py_ssize_t u_len, v_len, new_len; 11025 11026 /* Coerce the two arguments */ 11027 u = PyUnicode_FromObject(left); 11028 if (u == NULL) 11029 goto onError; 11030 v = PyUnicode_FromObject(right); 11031 if (v == NULL) 11032 goto onError; 11033 11034 /* Shortcuts */ 11035 if (v == unicode_empty) { 11036 Py_DECREF(v); 11037 return u; 11038 } 11039 if (u == unicode_empty) { 11040 Py_DECREF(u); 11041 return v; 11042 } 11043 11044 u_len = PyUnicode_GET_LENGTH(u); 11045 v_len = PyUnicode_GET_LENGTH(v); 11046 if (u_len > PY_SSIZE_T_MAX - v_len) { 11047 PyErr_SetString(PyExc_OverflowError, 11048 "strings are too large to concat"); 11049 goto onError; 11050 } 11051 new_len = u_len + v_len; 11052 11053 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 11054 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 11055 maxchar = Py_MAX(maxchar, maxchar2); 11056 11057 /* Concat the two Unicode strings */ 11058 w = PyUnicode_New(new_len, maxchar); 11059 if (w == NULL) 11060 goto onError; 11061 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 11062 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 11063 Py_DECREF(u); 11064 Py_DECREF(v); 11065 assert(_PyUnicode_CheckConsistency(w, 1)); 11066 return w; 11067 11068 onError: 11069 Py_XDECREF(u); 11070 Py_XDECREF(v); 11071 return NULL; 11072} 11073 11074void 11075PyUnicode_Append(PyObject **p_left, PyObject *right) 11076{ 11077 PyObject *left, *res; 11078 Py_UCS4 maxchar, maxchar2; 11079 Py_ssize_t left_len, right_len, new_len; 11080 11081 if (p_left == NULL) { 11082 if (!PyErr_Occurred()) 11083 PyErr_BadInternalCall(); 11084 return; 11085 } 11086 left = *p_left; 11087 if (right == NULL || left == NULL 11088 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 11089 if (!PyErr_Occurred()) 11090 PyErr_BadInternalCall(); 11091 goto error; 11092 } 11093 11094 if 
(PyUnicode_READY(left) == -1) 11095 goto error; 11096 if (PyUnicode_READY(right) == -1) 11097 goto error; 11098 11099 /* Shortcuts */ 11100 if (left == unicode_empty) { 11101 Py_DECREF(left); 11102 Py_INCREF(right); 11103 *p_left = right; 11104 return; 11105 } 11106 if (right == unicode_empty) 11107 return; 11108 11109 left_len = PyUnicode_GET_LENGTH(left); 11110 right_len = PyUnicode_GET_LENGTH(right); 11111 if (left_len > PY_SSIZE_T_MAX - right_len) { 11112 PyErr_SetString(PyExc_OverflowError, 11113 "strings are too large to concat"); 11114 goto error; 11115 } 11116 new_len = left_len + right_len; 11117 11118 if (unicode_modifiable(left) 11119 && PyUnicode_CheckExact(right) 11120 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 11121 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 11122 to change the structure size, but characters are stored just after 11123 the structure, and so it requires to move all characters which is 11124 not so different than duplicating the string. 
*/ 11125 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 11126 { 11127 /* append inplace */ 11128 if (unicode_resize(p_left, new_len) != 0) 11129 goto error; 11130 11131 /* copy 'right' into the newly allocated area of 'left' */ 11132 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 11133 } 11134 else { 11135 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11136 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11137 maxchar = Py_MAX(maxchar, maxchar2); 11138 11139 /* Concat the two Unicode strings */ 11140 res = PyUnicode_New(new_len, maxchar); 11141 if (res == NULL) 11142 goto error; 11143 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 11144 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 11145 Py_DECREF(left); 11146 *p_left = res; 11147 } 11148 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 11149 return; 11150 11151error: 11152 Py_CLEAR(*p_left); 11153} 11154 11155void 11156PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 11157{ 11158 PyUnicode_Append(pleft, right); 11159 Py_XDECREF(right); 11160} 11161 11162PyDoc_STRVAR(count__doc__, 11163 "S.count(sub[, start[, end]]) -> int\n\ 11164\n\ 11165Return the number of non-overlapping occurrences of substring sub in\n\ 11166string S[start:end]. 
Optional arguments start and end are\n\ 11167interpreted as in slice notation."); 11168 11169static PyObject * 11170unicode_count(PyObject *self, PyObject *args) 11171{ 11172 PyObject *substring; 11173 Py_ssize_t start = 0; 11174 Py_ssize_t end = PY_SSIZE_T_MAX; 11175 PyObject *result; 11176 int kind1, kind2, kind; 11177 void *buf1, *buf2; 11178 Py_ssize_t len1, len2, iresult; 11179 11180 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 11181 &start, &end)) 11182 return NULL; 11183 11184 kind1 = PyUnicode_KIND(self); 11185 kind2 = PyUnicode_KIND(substring); 11186 if (kind2 > kind1) { 11187 Py_DECREF(substring); 11188 return PyLong_FromLong(0); 11189 } 11190 kind = kind1; 11191 buf1 = PyUnicode_DATA(self); 11192 buf2 = PyUnicode_DATA(substring); 11193 if (kind2 != kind) 11194 buf2 = _PyUnicode_AsKind(substring, kind); 11195 if (!buf2) { 11196 Py_DECREF(substring); 11197 return NULL; 11198 } 11199 len1 = PyUnicode_GET_LENGTH(self); 11200 len2 = PyUnicode_GET_LENGTH(substring); 11201 11202 ADJUST_INDICES(start, end, len1); 11203 switch (kind) { 11204 case PyUnicode_1BYTE_KIND: 11205 iresult = ucs1lib_count( 11206 ((Py_UCS1*)buf1) + start, end - start, 11207 buf2, len2, PY_SSIZE_T_MAX 11208 ); 11209 break; 11210 case PyUnicode_2BYTE_KIND: 11211 iresult = ucs2lib_count( 11212 ((Py_UCS2*)buf1) + start, end - start, 11213 buf2, len2, PY_SSIZE_T_MAX 11214 ); 11215 break; 11216 case PyUnicode_4BYTE_KIND: 11217 iresult = ucs4lib_count( 11218 ((Py_UCS4*)buf1) + start, end - start, 11219 buf2, len2, PY_SSIZE_T_MAX 11220 ); 11221 break; 11222 default: 11223 assert(0); iresult = 0; 11224 } 11225 11226 result = PyLong_FromSsize_t(iresult); 11227 11228 if (kind2 != kind) 11229 PyMem_Free(buf2); 11230 11231 Py_DECREF(substring); 11232 11233 return result; 11234} 11235 11236PyDoc_STRVAR(encode__doc__, 11237 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 11238\n\ 11239Encode S using the codec registered for encoding. Default encoding\n\ 11240is 'utf-8'. 
errors may be given to set a different error\n\ 11241handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 11242a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 11243'xmlcharrefreplace' as well as any other name registered with\n\ 11244codecs.register_error that can handle UnicodeEncodeErrors."); 11245 11246static PyObject * 11247unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 11248{ 11249 static char *kwlist[] = {"encoding", "errors", 0}; 11250 char *encoding = NULL; 11251 char *errors = NULL; 11252 11253 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 11254 kwlist, &encoding, &errors)) 11255 return NULL; 11256 return PyUnicode_AsEncodedString(self, encoding, errors); 11257} 11258 11259PyDoc_STRVAR(expandtabs__doc__, 11260 "S.expandtabs(tabsize=8) -> str\n\ 11261\n\ 11262Return a copy of S where all tab characters are expanded using spaces.\n\ 11263If tabsize is not given, a tab size of 8 characters is assumed."); 11264 11265static PyObject* 11266unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds) 11267{ 11268 Py_ssize_t i, j, line_pos, src_len, incr; 11269 Py_UCS4 ch; 11270 PyObject *u; 11271 void *src_data, *dest_data; 11272 static char *kwlist[] = {"tabsize", 0}; 11273 int tabsize = 8; 11274 int kind; 11275 int found; 11276 11277 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs", 11278 kwlist, &tabsize)) 11279 return NULL; 11280 11281 if (PyUnicode_READY(self) == -1) 11282 return NULL; 11283 11284 /* First pass: determine size of output string */ 11285 src_len = PyUnicode_GET_LENGTH(self); 11286 i = j = line_pos = 0; 11287 kind = PyUnicode_KIND(self); 11288 src_data = PyUnicode_DATA(self); 11289 found = 0; 11290 for (; i < src_len; i++) { 11291 ch = PyUnicode_READ(kind, src_data, i); 11292 if (ch == '\t') { 11293 found = 1; 11294 if (tabsize > 0) { 11295 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11296 if (j > PY_SSIZE_T_MAX - incr) 11297 
goto overflow; 11298 line_pos += incr; 11299 j += incr; 11300 } 11301 } 11302 else { 11303 if (j > PY_SSIZE_T_MAX - 1) 11304 goto overflow; 11305 line_pos++; 11306 j++; 11307 if (ch == '\n' || ch == '\r') 11308 line_pos = 0; 11309 } 11310 } 11311 if (!found) 11312 return unicode_result_unchanged(self); 11313 11314 /* Second pass: create output string and fill it */ 11315 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11316 if (!u) 11317 return NULL; 11318 dest_data = PyUnicode_DATA(u); 11319 11320 i = j = line_pos = 0; 11321 11322 for (; i < src_len; i++) { 11323 ch = PyUnicode_READ(kind, src_data, i); 11324 if (ch == '\t') { 11325 if (tabsize > 0) { 11326 incr = tabsize - (line_pos % tabsize); 11327 line_pos += incr; 11328 FILL(kind, dest_data, ' ', j, incr); 11329 j += incr; 11330 } 11331 } 11332 else { 11333 line_pos++; 11334 PyUnicode_WRITE(kind, dest_data, j, ch); 11335 j++; 11336 if (ch == '\n' || ch == '\r') 11337 line_pos = 0; 11338 } 11339 } 11340 assert (j == PyUnicode_GET_LENGTH(u)); 11341 return unicode_result(u); 11342 11343 overflow: 11344 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11345 return NULL; 11346} 11347 11348PyDoc_STRVAR(find__doc__, 11349 "S.find(sub[, start[, end]]) -> int\n\ 11350\n\ 11351Return the lowest index in S where substring sub is found,\n\ 11352such that sub is contained within S[start:end]. 
Optional\n\ 11353arguments start and end are interpreted as in slice notation.\n\ 11354\n\ 11355Return -1 on failure."); 11356 11357static PyObject * 11358unicode_find(PyObject *self, PyObject *args) 11359{ 11360 PyObject *substring; 11361 Py_ssize_t start; 11362 Py_ssize_t end; 11363 Py_ssize_t result; 11364 11365 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 11366 &start, &end)) 11367 return NULL; 11368 11369 if (PyUnicode_READY(self) == -1) { 11370 Py_DECREF(substring); 11371 return NULL; 11372 } 11373 if (PyUnicode_READY(substring) == -1) { 11374 Py_DECREF(substring); 11375 return NULL; 11376 } 11377 11378 result = any_find_slice(1, self, substring, start, end); 11379 11380 Py_DECREF(substring); 11381 11382 if (result == -2) 11383 return NULL; 11384 11385 return PyLong_FromSsize_t(result); 11386} 11387 11388static PyObject * 11389unicode_getitem(PyObject *self, Py_ssize_t index) 11390{ 11391 void *data; 11392 enum PyUnicode_Kind kind; 11393 Py_UCS4 ch; 11394 11395 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 11396 PyErr_BadArgument(); 11397 return NULL; 11398 } 11399 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11400 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11401 return NULL; 11402 } 11403 kind = PyUnicode_KIND(self); 11404 data = PyUnicode_DATA(self); 11405 ch = PyUnicode_READ(kind, data, index); 11406 return unicode_char(ch); 11407} 11408 11409/* Believe it or not, this produces the same value for ASCII strings 11410 as bytes_hash(). */ 11411static Py_hash_t 11412unicode_hash(PyObject *self) 11413{ 11414 Py_ssize_t len; 11415 Py_uhash_t x; /* Unsigned for defined overflow behavior. 
*/ 11416 11417#ifdef Py_DEBUG 11418 assert(_Py_HashSecret_Initialized); 11419#endif 11420 if (_PyUnicode_HASH(self) != -1) 11421 return _PyUnicode_HASH(self); 11422 if (PyUnicode_READY(self) == -1) 11423 return -1; 11424 len = PyUnicode_GET_LENGTH(self); 11425 /* 11426 We make the hash of the empty string be 0, rather than using 11427 (prefix ^ suffix), since this slightly obfuscates the hash secret 11428 */ 11429 if (len == 0) { 11430 _PyUnicode_HASH(self) = 0; 11431 return 0; 11432 } 11433 x = _Py_HashBytes(PyUnicode_DATA(self), 11434 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); 11435 _PyUnicode_HASH(self) = x; 11436 return x; 11437} 11438 11439PyDoc_STRVAR(index__doc__, 11440 "S.index(sub[, start[, end]]) -> int\n\ 11441\n\ 11442Like S.find() but raise ValueError when the substring is not found."); 11443 11444static PyObject * 11445unicode_index(PyObject *self, PyObject *args) 11446{ 11447 Py_ssize_t result; 11448 PyObject *substring; 11449 Py_ssize_t start; 11450 Py_ssize_t end; 11451 11452 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11453 &start, &end)) 11454 return NULL; 11455 11456 if (PyUnicode_READY(self) == -1) { 11457 Py_DECREF(substring); 11458 return NULL; 11459 } 11460 if (PyUnicode_READY(substring) == -1) { 11461 Py_DECREF(substring); 11462 return NULL; 11463 } 11464 11465 result = any_find_slice(1, self, substring, start, end); 11466 11467 Py_DECREF(substring); 11468 11469 if (result == -2) 11470 return NULL; 11471 11472 if (result < 0) { 11473 PyErr_SetString(PyExc_ValueError, "substring not found"); 11474 return NULL; 11475 } 11476 11477 return PyLong_FromSsize_t(result); 11478} 11479 11480PyDoc_STRVAR(islower__doc__, 11481 "S.islower() -> bool\n\ 11482\n\ 11483Return True if all cased characters in S are lowercase and there is\n\ 11484at least one cased character in S, False otherwise."); 11485 11486static PyObject* 11487unicode_islower(PyObject *self) 11488{ 11489 Py_ssize_t i, length; 11490 int kind; 11491 void 
*data; 11492 int cased; 11493 11494 if (PyUnicode_READY(self) == -1) 11495 return NULL; 11496 length = PyUnicode_GET_LENGTH(self); 11497 kind = PyUnicode_KIND(self); 11498 data = PyUnicode_DATA(self); 11499 11500 /* Shortcut for single character strings */ 11501 if (length == 1) 11502 return PyBool_FromLong( 11503 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11504 11505 /* Special case for empty strings */ 11506 if (length == 0) 11507 return PyBool_FromLong(0); 11508 11509 cased = 0; 11510 for (i = 0; i < length; i++) { 11511 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11512 11513 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11514 return PyBool_FromLong(0); 11515 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11516 cased = 1; 11517 } 11518 return PyBool_FromLong(cased); 11519} 11520 11521PyDoc_STRVAR(isupper__doc__, 11522 "S.isupper() -> bool\n\ 11523\n\ 11524Return True if all cased characters in S are uppercase and there is\n\ 11525at least one cased character in S, False otherwise."); 11526 11527static PyObject* 11528unicode_isupper(PyObject *self) 11529{ 11530 Py_ssize_t i, length; 11531 int kind; 11532 void *data; 11533 int cased; 11534 11535 if (PyUnicode_READY(self) == -1) 11536 return NULL; 11537 length = PyUnicode_GET_LENGTH(self); 11538 kind = PyUnicode_KIND(self); 11539 data = PyUnicode_DATA(self); 11540 11541 /* Shortcut for single character strings */ 11542 if (length == 1) 11543 return PyBool_FromLong( 11544 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11545 11546 /* Special case for empty strings */ 11547 if (length == 0) 11548 return PyBool_FromLong(0); 11549 11550 cased = 0; 11551 for (i = 0; i < length; i++) { 11552 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11553 11554 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11555 return PyBool_FromLong(0); 11556 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11557 cased = 1; 11558 } 11559 return PyBool_FromLong(cased); 11560} 11561 11562PyDoc_STRVAR(istitle__doc__, 
11563 "S.istitle() -> bool\n\ 11564\n\ 11565Return True if S is a titlecased string and there is at least one\n\ 11566character in S, i.e. upper- and titlecase characters may only\n\ 11567follow uncased characters and lowercase characters only cased ones.\n\ 11568Return False otherwise."); 11569 11570static PyObject* 11571unicode_istitle(PyObject *self) 11572{ 11573 Py_ssize_t i, length; 11574 int kind; 11575 void *data; 11576 int cased, previous_is_cased; 11577 11578 if (PyUnicode_READY(self) == -1) 11579 return NULL; 11580 length = PyUnicode_GET_LENGTH(self); 11581 kind = PyUnicode_KIND(self); 11582 data = PyUnicode_DATA(self); 11583 11584 /* Shortcut for single character strings */ 11585 if (length == 1) { 11586 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11587 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11588 (Py_UNICODE_ISUPPER(ch) != 0)); 11589 } 11590 11591 /* Special case for empty strings */ 11592 if (length == 0) 11593 return PyBool_FromLong(0); 11594 11595 cased = 0; 11596 previous_is_cased = 0; 11597 for (i = 0; i < length; i++) { 11598 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11599 11600 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11601 if (previous_is_cased) 11602 return PyBool_FromLong(0); 11603 previous_is_cased = 1; 11604 cased = 1; 11605 } 11606 else if (Py_UNICODE_ISLOWER(ch)) { 11607 if (!previous_is_cased) 11608 return PyBool_FromLong(0); 11609 previous_is_cased = 1; 11610 cased = 1; 11611 } 11612 else 11613 previous_is_cased = 0; 11614 } 11615 return PyBool_FromLong(cased); 11616} 11617 11618PyDoc_STRVAR(isspace__doc__, 11619 "S.isspace() -> bool\n\ 11620\n\ 11621Return True if all characters in S are whitespace\n\ 11622and there is at least one character in S, False otherwise."); 11623 11624static PyObject* 11625unicode_isspace(PyObject *self) 11626{ 11627 Py_ssize_t i, length; 11628 int kind; 11629 void *data; 11630 11631 if (PyUnicode_READY(self) == -1) 11632 return NULL; 11633 length = 
PyUnicode_GET_LENGTH(self); 11634 kind = PyUnicode_KIND(self); 11635 data = PyUnicode_DATA(self); 11636 11637 /* Shortcut for single character strings */ 11638 if (length == 1) 11639 return PyBool_FromLong( 11640 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11641 11642 /* Special case for empty strings */ 11643 if (length == 0) 11644 return PyBool_FromLong(0); 11645 11646 for (i = 0; i < length; i++) { 11647 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11648 if (!Py_UNICODE_ISSPACE(ch)) 11649 return PyBool_FromLong(0); 11650 } 11651 return PyBool_FromLong(1); 11652} 11653 11654PyDoc_STRVAR(isalpha__doc__, 11655 "S.isalpha() -> bool\n\ 11656\n\ 11657Return True if all characters in S are alphabetic\n\ 11658and there is at least one character in S, False otherwise."); 11659 11660static PyObject* 11661unicode_isalpha(PyObject *self) 11662{ 11663 Py_ssize_t i, length; 11664 int kind; 11665 void *data; 11666 11667 if (PyUnicode_READY(self) == -1) 11668 return NULL; 11669 length = PyUnicode_GET_LENGTH(self); 11670 kind = PyUnicode_KIND(self); 11671 data = PyUnicode_DATA(self); 11672 11673 /* Shortcut for single character strings */ 11674 if (length == 1) 11675 return PyBool_FromLong( 11676 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11677 11678 /* Special case for empty strings */ 11679 if (length == 0) 11680 return PyBool_FromLong(0); 11681 11682 for (i = 0; i < length; i++) { 11683 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11684 return PyBool_FromLong(0); 11685 } 11686 return PyBool_FromLong(1); 11687} 11688 11689PyDoc_STRVAR(isalnum__doc__, 11690 "S.isalnum() -> bool\n\ 11691\n\ 11692Return True if all characters in S are alphanumeric\n\ 11693and there is at least one character in S, False otherwise."); 11694 11695static PyObject* 11696unicode_isalnum(PyObject *self) 11697{ 11698 int kind; 11699 void *data; 11700 Py_ssize_t len, i; 11701 11702 if (PyUnicode_READY(self) == -1) 11703 return NULL; 11704 11705 kind = PyUnicode_KIND(self); 
11706 data = PyUnicode_DATA(self); 11707 len = PyUnicode_GET_LENGTH(self); 11708 11709 /* Shortcut for single character strings */ 11710 if (len == 1) { 11711 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11712 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11713 } 11714 11715 /* Special case for empty strings */ 11716 if (len == 0) 11717 return PyBool_FromLong(0); 11718 11719 for (i = 0; i < len; i++) { 11720 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11721 if (!Py_UNICODE_ISALNUM(ch)) 11722 return PyBool_FromLong(0); 11723 } 11724 return PyBool_FromLong(1); 11725} 11726 11727PyDoc_STRVAR(isdecimal__doc__, 11728 "S.isdecimal() -> bool\n\ 11729\n\ 11730Return True if there are only decimal characters in S,\n\ 11731False otherwise."); 11732 11733static PyObject* 11734unicode_isdecimal(PyObject *self) 11735{ 11736 Py_ssize_t i, length; 11737 int kind; 11738 void *data; 11739 11740 if (PyUnicode_READY(self) == -1) 11741 return NULL; 11742 length = PyUnicode_GET_LENGTH(self); 11743 kind = PyUnicode_KIND(self); 11744 data = PyUnicode_DATA(self); 11745 11746 /* Shortcut for single character strings */ 11747 if (length == 1) 11748 return PyBool_FromLong( 11749 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11750 11751 /* Special case for empty strings */ 11752 if (length == 0) 11753 return PyBool_FromLong(0); 11754 11755 for (i = 0; i < length; i++) { 11756 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11757 return PyBool_FromLong(0); 11758 } 11759 return PyBool_FromLong(1); 11760} 11761 11762PyDoc_STRVAR(isdigit__doc__, 11763 "S.isdigit() -> bool\n\ 11764\n\ 11765Return True if all characters in S are digits\n\ 11766and there is at least one character in S, False otherwise."); 11767 11768static PyObject* 11769unicode_isdigit(PyObject *self) 11770{ 11771 Py_ssize_t i, length; 11772 int kind; 11773 void *data; 11774 11775 if (PyUnicode_READY(self) == -1) 11776 return NULL; 11777 length = PyUnicode_GET_LENGTH(self); 11778 kind = 
PyUnicode_KIND(self); 11779 data = PyUnicode_DATA(self); 11780 11781 /* Shortcut for single character strings */ 11782 if (length == 1) { 11783 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11784 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11785 } 11786 11787 /* Special case for empty strings */ 11788 if (length == 0) 11789 return PyBool_FromLong(0); 11790 11791 for (i = 0; i < length; i++) { 11792 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11793 return PyBool_FromLong(0); 11794 } 11795 return PyBool_FromLong(1); 11796} 11797 11798PyDoc_STRVAR(isnumeric__doc__, 11799 "S.isnumeric() -> bool\n\ 11800\n\ 11801Return True if there are only numeric characters in S,\n\ 11802False otherwise."); 11803 11804static PyObject* 11805unicode_isnumeric(PyObject *self) 11806{ 11807 Py_ssize_t i, length; 11808 int kind; 11809 void *data; 11810 11811 if (PyUnicode_READY(self) == -1) 11812 return NULL; 11813 length = PyUnicode_GET_LENGTH(self); 11814 kind = PyUnicode_KIND(self); 11815 data = PyUnicode_DATA(self); 11816 11817 /* Shortcut for single character strings */ 11818 if (length == 1) 11819 return PyBool_FromLong( 11820 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11821 11822 /* Special case for empty strings */ 11823 if (length == 0) 11824 return PyBool_FromLong(0); 11825 11826 for (i = 0; i < length; i++) { 11827 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11828 return PyBool_FromLong(0); 11829 } 11830 return PyBool_FromLong(1); 11831} 11832 11833int 11834PyUnicode_IsIdentifier(PyObject *self) 11835{ 11836 int kind; 11837 void *data; 11838 Py_ssize_t i; 11839 Py_UCS4 first; 11840 11841 if (PyUnicode_READY(self) == -1) { 11842 Py_FatalError("identifier not ready"); 11843 return 0; 11844 } 11845 11846 /* Special case for empty strings */ 11847 if (PyUnicode_GET_LENGTH(self) == 0) 11848 return 0; 11849 kind = PyUnicode_KIND(self); 11850 data = PyUnicode_DATA(self); 11851 11852 /* PEP 3131 says that the first character must be in 11853 
XID_Start and subsequent characters in XID_Continue, 11854 and for the ASCII range, the 2.x rules apply (i.e 11855 start with letters and underscore, continue with 11856 letters, digits, underscore). However, given the current 11857 definition of XID_Start and XID_Continue, it is sufficient 11858 to check just for these, except that _ must be allowed 11859 as starting an identifier. */ 11860 first = PyUnicode_READ(kind, data, 0); 11861 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11862 return 0; 11863 11864 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11865 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11866 return 0; 11867 return 1; 11868} 11869 11870PyDoc_STRVAR(isidentifier__doc__, 11871 "S.isidentifier() -> bool\n\ 11872\n\ 11873Return True if S is a valid identifier according\n\ 11874to the language definition.\n\ 11875\n\ 11876Use keyword.iskeyword() to test for reserved identifiers\n\ 11877such as \"def\" and \"class\".\n"); 11878 11879static PyObject* 11880unicode_isidentifier(PyObject *self) 11881{ 11882 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11883} 11884 11885PyDoc_STRVAR(isprintable__doc__, 11886 "S.isprintable() -> bool\n\ 11887\n\ 11888Return True if all characters in S are considered\n\ 11889printable in repr() or S is empty, False otherwise."); 11890 11891static PyObject* 11892unicode_isprintable(PyObject *self) 11893{ 11894 Py_ssize_t i, length; 11895 int kind; 11896 void *data; 11897 11898 if (PyUnicode_READY(self) == -1) 11899 return NULL; 11900 length = PyUnicode_GET_LENGTH(self); 11901 kind = PyUnicode_KIND(self); 11902 data = PyUnicode_DATA(self); 11903 11904 /* Shortcut for single character strings */ 11905 if (length == 1) 11906 return PyBool_FromLong( 11907 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11908 11909 for (i = 0; i < length; i++) { 11910 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11911 Py_RETURN_FALSE; 11912 } 11913 } 11914 Py_RETURN_TRUE; 
11915} 11916 11917PyDoc_STRVAR(join__doc__, 11918 "S.join(iterable) -> str\n\ 11919\n\ 11920Return a string which is the concatenation of the strings in the\n\ 11921iterable. The separator between elements is S."); 11922 11923static PyObject* 11924unicode_join(PyObject *self, PyObject *data) 11925{ 11926 return PyUnicode_Join(self, data); 11927} 11928 11929static Py_ssize_t 11930unicode_length(PyObject *self) 11931{ 11932 if (PyUnicode_READY(self) == -1) 11933 return -1; 11934 return PyUnicode_GET_LENGTH(self); 11935} 11936 11937PyDoc_STRVAR(ljust__doc__, 11938 "S.ljust(width[, fillchar]) -> str\n\ 11939\n\ 11940Return S left-justified in a Unicode string of length width. Padding is\n\ 11941done using the specified fill character (default is a space)."); 11942 11943static PyObject * 11944unicode_ljust(PyObject *self, PyObject *args) 11945{ 11946 Py_ssize_t width; 11947 Py_UCS4 fillchar = ' '; 11948 11949 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11950 return NULL; 11951 11952 if (PyUnicode_READY(self) == -1) 11953 return NULL; 11954 11955 if (PyUnicode_GET_LENGTH(self) >= width) 11956 return unicode_result_unchanged(self); 11957 11958 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11959} 11960 11961PyDoc_STRVAR(lower__doc__, 11962 "S.lower() -> str\n\ 11963\n\ 11964Return a copy of the string S converted to lowercase."); 11965 11966static PyObject* 11967unicode_lower(PyObject *self) 11968{ 11969 if (PyUnicode_READY(self) == -1) 11970 return NULL; 11971 if (PyUnicode_IS_ASCII(self)) 11972 return ascii_upper_or_lower(self, 1); 11973 return case_operation(self, do_lower); 11974} 11975 11976#define LEFTSTRIP 0 11977#define RIGHTSTRIP 1 11978#define BOTHSTRIP 2 11979 11980/* Arrays indexed by above */ 11981static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11982 11983#define STRIPNAME(i) (stripformat[i]+3) 11984 11985/* externally visible for str.strip(unicode) */ 11986PyObject * 
11987_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11988{ 11989 void *data; 11990 int kind; 11991 Py_ssize_t i, j, len; 11992 BLOOM_MASK sepmask; 11993 Py_ssize_t seplen; 11994 11995 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11996 return NULL; 11997 11998 kind = PyUnicode_KIND(self); 11999 data = PyUnicode_DATA(self); 12000 len = PyUnicode_GET_LENGTH(self); 12001 seplen = PyUnicode_GET_LENGTH(sepobj); 12002 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 12003 PyUnicode_DATA(sepobj), 12004 seplen); 12005 12006 i = 0; 12007 if (striptype != RIGHTSTRIP) { 12008 while (i < len) { 12009 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12010 if (!BLOOM(sepmask, ch)) 12011 break; 12012 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12013 break; 12014 i++; 12015 } 12016 } 12017 12018 j = len; 12019 if (striptype != LEFTSTRIP) { 12020 j--; 12021 while (j >= i) { 12022 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12023 if (!BLOOM(sepmask, ch)) 12024 break; 12025 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12026 break; 12027 j--; 12028 } 12029 12030 j++; 12031 } 12032 12033 return PyUnicode_Substring(self, i, j); 12034} 12035 12036PyObject* 12037PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 12038{ 12039 unsigned char *data; 12040 int kind; 12041 Py_ssize_t length; 12042 12043 if (PyUnicode_READY(self) == -1) 12044 return NULL; 12045 12046 length = PyUnicode_GET_LENGTH(self); 12047 end = Py_MIN(end, length); 12048 12049 if (start == 0 && end == length) 12050 return unicode_result_unchanged(self); 12051 12052 if (start < 0 || end < 0) { 12053 PyErr_SetString(PyExc_IndexError, "string index out of range"); 12054 return NULL; 12055 } 12056 if (start >= length || end < start) 12057 _Py_RETURN_UNICODE_EMPTY(); 12058 12059 length = end - start; 12060 if (PyUnicode_IS_ASCII(self)) { 12061 data = PyUnicode_1BYTE_DATA(self); 12062 return _PyUnicode_FromASCII((char*)(data + start), length); 12063 } 12064 
else { 12065 kind = PyUnicode_KIND(self); 12066 data = PyUnicode_1BYTE_DATA(self); 12067 return PyUnicode_FromKindAndData(kind, 12068 data + kind * start, 12069 length); 12070 } 12071} 12072 12073static PyObject * 12074do_strip(PyObject *self, int striptype) 12075{ 12076 Py_ssize_t len, i, j; 12077 12078 if (PyUnicode_READY(self) == -1) 12079 return NULL; 12080 12081 len = PyUnicode_GET_LENGTH(self); 12082 12083 if (PyUnicode_IS_ASCII(self)) { 12084 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 12085 12086 i = 0; 12087 if (striptype != RIGHTSTRIP) { 12088 while (i < len) { 12089 Py_UCS1 ch = data[i]; 12090 if (!_Py_ascii_whitespace[ch]) 12091 break; 12092 i++; 12093 } 12094 } 12095 12096 j = len; 12097 if (striptype != LEFTSTRIP) { 12098 j--; 12099 while (j >= i) { 12100 Py_UCS1 ch = data[j]; 12101 if (!_Py_ascii_whitespace[ch]) 12102 break; 12103 j--; 12104 } 12105 j++; 12106 } 12107 } 12108 else { 12109 int kind = PyUnicode_KIND(self); 12110 void *data = PyUnicode_DATA(self); 12111 12112 i = 0; 12113 if (striptype != RIGHTSTRIP) { 12114 while (i < len) { 12115 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12116 if (!Py_UNICODE_ISSPACE(ch)) 12117 break; 12118 i++; 12119 } 12120 } 12121 12122 j = len; 12123 if (striptype != LEFTSTRIP) { 12124 j--; 12125 while (j >= i) { 12126 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12127 if (!Py_UNICODE_ISSPACE(ch)) 12128 break; 12129 j--; 12130 } 12131 j++; 12132 } 12133 } 12134 12135 return PyUnicode_Substring(self, i, j); 12136} 12137 12138 12139static PyObject * 12140do_argstrip(PyObject *self, int striptype, PyObject *args) 12141{ 12142 PyObject *sep = NULL; 12143 12144 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep)) 12145 return NULL; 12146 12147 if (sep != NULL && sep != Py_None) { 12148 if (PyUnicode_Check(sep)) 12149 return _PyUnicode_XStrip(self, striptype, sep); 12150 else { 12151 PyErr_Format(PyExc_TypeError, 12152 "%s arg must be None or str", 12153 STRIPNAME(striptype)); 12154 return NULL; 12155 } 12156 } 
12157 12158 return do_strip(self, striptype); 12159} 12160 12161 12162PyDoc_STRVAR(strip__doc__, 12163 "S.strip([chars]) -> str\n\ 12164\n\ 12165Return a copy of the string S with leading and trailing\n\ 12166whitespace removed.\n\ 12167If chars is given and not None, remove characters in chars instead."); 12168 12169static PyObject * 12170unicode_strip(PyObject *self, PyObject *args) 12171{ 12172 if (PyTuple_GET_SIZE(args) == 0) 12173 return do_strip(self, BOTHSTRIP); /* Common case */ 12174 else 12175 return do_argstrip(self, BOTHSTRIP, args); 12176} 12177 12178 12179PyDoc_STRVAR(lstrip__doc__, 12180 "S.lstrip([chars]) -> str\n\ 12181\n\ 12182Return a copy of the string S with leading whitespace removed.\n\ 12183If chars is given and not None, remove characters in chars instead."); 12184 12185static PyObject * 12186unicode_lstrip(PyObject *self, PyObject *args) 12187{ 12188 if (PyTuple_GET_SIZE(args) == 0) 12189 return do_strip(self, LEFTSTRIP); /* Common case */ 12190 else 12191 return do_argstrip(self, LEFTSTRIP, args); 12192} 12193 12194 12195PyDoc_STRVAR(rstrip__doc__, 12196 "S.rstrip([chars]) -> str\n\ 12197\n\ 12198Return a copy of the string S with trailing whitespace removed.\n\ 12199If chars is given and not None, remove characters in chars instead."); 12200 12201static PyObject * 12202unicode_rstrip(PyObject *self, PyObject *args) 12203{ 12204 if (PyTuple_GET_SIZE(args) == 0) 12205 return do_strip(self, RIGHTSTRIP); /* Common case */ 12206 else 12207 return do_argstrip(self, RIGHTSTRIP, args); 12208} 12209 12210 12211static PyObject* 12212unicode_repeat(PyObject *str, Py_ssize_t len) 12213{ 12214 PyObject *u; 12215 Py_ssize_t nchars, n; 12216 12217 if (len < 1) 12218 _Py_RETURN_UNICODE_EMPTY(); 12219 12220 /* no repeat, return original string */ 12221 if (len == 1) 12222 return unicode_result_unchanged(str); 12223 12224 if (PyUnicode_READY(str) == -1) 12225 return NULL; 12226 12227 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 12228 
PyErr_SetString(PyExc_OverflowError, 12229 "repeated string is too long"); 12230 return NULL; 12231 } 12232 nchars = len * PyUnicode_GET_LENGTH(str); 12233 12234 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 12235 if (!u) 12236 return NULL; 12237 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 12238 12239 if (PyUnicode_GET_LENGTH(str) == 1) { 12240 const int kind = PyUnicode_KIND(str); 12241 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 12242 if (kind == PyUnicode_1BYTE_KIND) { 12243 void *to = PyUnicode_DATA(u); 12244 memset(to, (unsigned char)fill_char, len); 12245 } 12246 else if (kind == PyUnicode_2BYTE_KIND) { 12247 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 12248 for (n = 0; n < len; ++n) 12249 ucs2[n] = fill_char; 12250 } else { 12251 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 12252 assert(kind == PyUnicode_4BYTE_KIND); 12253 for (n = 0; n < len; ++n) 12254 ucs4[n] = fill_char; 12255 } 12256 } 12257 else { 12258 /* number of characters copied this far */ 12259 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 12260 const Py_ssize_t char_size = PyUnicode_KIND(str); 12261 char *to = (char *) PyUnicode_DATA(u); 12262 Py_MEMCPY(to, PyUnicode_DATA(str), 12263 PyUnicode_GET_LENGTH(str) * char_size); 12264 while (done < nchars) { 12265 n = (done <= nchars-done) ? 
done : nchars-done; 12266 Py_MEMCPY(to + (done * char_size), to, n * char_size); 12267 done += n; 12268 } 12269 } 12270 12271 assert(_PyUnicode_CheckConsistency(u, 1)); 12272 return u; 12273} 12274 12275PyObject * 12276PyUnicode_Replace(PyObject *obj, 12277 PyObject *subobj, 12278 PyObject *replobj, 12279 Py_ssize_t maxcount) 12280{ 12281 PyObject *self; 12282 PyObject *str1; 12283 PyObject *str2; 12284 PyObject *result; 12285 12286 self = PyUnicode_FromObject(obj); 12287 if (self == NULL) 12288 return NULL; 12289 str1 = PyUnicode_FromObject(subobj); 12290 if (str1 == NULL) { 12291 Py_DECREF(self); 12292 return NULL; 12293 } 12294 str2 = PyUnicode_FromObject(replobj); 12295 if (str2 == NULL) { 12296 Py_DECREF(self); 12297 Py_DECREF(str1); 12298 return NULL; 12299 } 12300 if (PyUnicode_READY(self) == -1 || 12301 PyUnicode_READY(str1) == -1 || 12302 PyUnicode_READY(str2) == -1) 12303 result = NULL; 12304 else 12305 result = replace(self, str1, str2, maxcount); 12306 Py_DECREF(self); 12307 Py_DECREF(str1); 12308 Py_DECREF(str2); 12309 return result; 12310} 12311 12312PyDoc_STRVAR(replace__doc__, 12313 "S.replace(old, new[, count]) -> str\n\ 12314\n\ 12315Return a copy of S with all occurrences of substring\n\ 12316old replaced by new. 
If the optional argument count is\n\ 12317given, only the first count occurrences are replaced."); 12318 12319static PyObject* 12320unicode_replace(PyObject *self, PyObject *args) 12321{ 12322 PyObject *str1; 12323 PyObject *str2; 12324 Py_ssize_t maxcount = -1; 12325 PyObject *result; 12326 12327 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 12328 return NULL; 12329 if (PyUnicode_READY(self) == -1) 12330 return NULL; 12331 str1 = PyUnicode_FromObject(str1); 12332 if (str1 == NULL) 12333 return NULL; 12334 str2 = PyUnicode_FromObject(str2); 12335 if (str2 == NULL) { 12336 Py_DECREF(str1); 12337 return NULL; 12338 } 12339 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) 12340 result = NULL; 12341 else 12342 result = replace(self, str1, str2, maxcount); 12343 12344 Py_DECREF(str1); 12345 Py_DECREF(str2); 12346 return result; 12347} 12348 12349static PyObject * 12350unicode_repr(PyObject *unicode) 12351{ 12352 PyObject *repr; 12353 Py_ssize_t isize; 12354 Py_ssize_t osize, squote, dquote, i, o; 12355 Py_UCS4 max, quote; 12356 int ikind, okind, unchanged; 12357 void *idata, *odata; 12358 12359 if (PyUnicode_READY(unicode) == -1) 12360 return NULL; 12361 12362 isize = PyUnicode_GET_LENGTH(unicode); 12363 idata = PyUnicode_DATA(unicode); 12364 12365 /* Compute length of output, quote characters, and 12366 maximum character */ 12367 osize = 0; 12368 max = 127; 12369 squote = dquote = 0; 12370 ikind = PyUnicode_KIND(unicode); 12371 for (i = 0; i < isize; i++) { 12372 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12373 Py_ssize_t incr = 1; 12374 switch (ch) { 12375 case '\'': squote++; break; 12376 case '"': dquote++; break; 12377 case '\\': case '\t': case '\r': case '\n': 12378 incr = 2; 12379 break; 12380 default: 12381 /* Fast-path ASCII */ 12382 if (ch < ' ' || ch == 0x7f) 12383 incr = 4; /* \xHH */ 12384 else if (ch < 0x7f) 12385 ; 12386 else if (Py_UNICODE_ISPRINTABLE(ch)) 12387 max = ch > max ? 
ch : max; 12388 else if (ch < 0x100) 12389 incr = 4; /* \xHH */ 12390 else if (ch < 0x10000) 12391 incr = 6; /* \uHHHH */ 12392 else 12393 incr = 10; /* \uHHHHHHHH */ 12394 } 12395 if (osize > PY_SSIZE_T_MAX - incr) { 12396 PyErr_SetString(PyExc_OverflowError, 12397 "string is too long to generate repr"); 12398 return NULL; 12399 } 12400 osize += incr; 12401 } 12402 12403 quote = '\''; 12404 unchanged = (osize == isize); 12405 if (squote) { 12406 unchanged = 0; 12407 if (dquote) 12408 /* Both squote and dquote present. Use squote, 12409 and escape them */ 12410 osize += squote; 12411 else 12412 quote = '"'; 12413 } 12414 osize += 2; /* quotes */ 12415 12416 repr = PyUnicode_New(osize, max); 12417 if (repr == NULL) 12418 return NULL; 12419 okind = PyUnicode_KIND(repr); 12420 odata = PyUnicode_DATA(repr); 12421 12422 PyUnicode_WRITE(okind, odata, 0, quote); 12423 PyUnicode_WRITE(okind, odata, osize-1, quote); 12424 if (unchanged) { 12425 _PyUnicode_FastCopyCharacters(repr, 1, 12426 unicode, 0, 12427 isize); 12428 } 12429 else { 12430 for (i = 0, o = 1; i < isize; i++) { 12431 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12432 12433 /* Escape quotes and backslashes */ 12434 if ((ch == quote) || (ch == '\\')) { 12435 PyUnicode_WRITE(okind, odata, o++, '\\'); 12436 PyUnicode_WRITE(okind, odata, o++, ch); 12437 continue; 12438 } 12439 12440 /* Map special whitespace to '\t', \n', '\r' */ 12441 if (ch == '\t') { 12442 PyUnicode_WRITE(okind, odata, o++, '\\'); 12443 PyUnicode_WRITE(okind, odata, o++, 't'); 12444 } 12445 else if (ch == '\n') { 12446 PyUnicode_WRITE(okind, odata, o++, '\\'); 12447 PyUnicode_WRITE(okind, odata, o++, 'n'); 12448 } 12449 else if (ch == '\r') { 12450 PyUnicode_WRITE(okind, odata, o++, '\\'); 12451 PyUnicode_WRITE(okind, odata, o++, 'r'); 12452 } 12453 12454 /* Map non-printable US ASCII to '\xhh' */ 12455 else if (ch < ' ' || ch == 0x7F) { 12456 PyUnicode_WRITE(okind, odata, o++, '\\'); 12457 PyUnicode_WRITE(okind, odata, o++, 'x'); 12458 
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12459 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12460 } 12461 12462 /* Copy ASCII characters as-is */ 12463 else if (ch < 0x7F) { 12464 PyUnicode_WRITE(okind, odata, o++, ch); 12465 } 12466 12467 /* Non-ASCII characters */ 12468 else { 12469 /* Map Unicode whitespace and control characters 12470 (categories Z* and C* except ASCII space) 12471 */ 12472 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12473 PyUnicode_WRITE(okind, odata, o++, '\\'); 12474 /* Map 8-bit characters to '\xhh' */ 12475 if (ch <= 0xff) { 12476 PyUnicode_WRITE(okind, odata, o++, 'x'); 12477 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12478 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12479 } 12480 /* Map 16-bit characters to '\uxxxx' */ 12481 else if (ch <= 0xffff) { 12482 PyUnicode_WRITE(okind, odata, o++, 'u'); 12483 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12484 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12485 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12486 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12487 } 12488 /* Map 21-bit characters to '\U00xxxxxx' */ 12489 else { 12490 PyUnicode_WRITE(okind, odata, o++, 'U'); 12491 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12492 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12493 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12494 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12495 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12496 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12497 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12498 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12499 } 12500 } 12501 /* Copy characters as-is */ 12502 else { 12503 PyUnicode_WRITE(okind, 
odata, o++, ch); 12504 } 12505 } 12506 } 12507 } 12508 /* Closing quote already added at the beginning */ 12509 assert(_PyUnicode_CheckConsistency(repr, 1)); 12510 return repr; 12511} 12512 12513PyDoc_STRVAR(rfind__doc__, 12514 "S.rfind(sub[, start[, end]]) -> int\n\ 12515\n\ 12516Return the highest index in S where substring sub is found,\n\ 12517such that sub is contained within S[start:end]. Optional\n\ 12518arguments start and end are interpreted as in slice notation.\n\ 12519\n\ 12520Return -1 on failure."); 12521 12522static PyObject * 12523unicode_rfind(PyObject *self, PyObject *args) 12524{ 12525 PyObject *substring; 12526 Py_ssize_t start; 12527 Py_ssize_t end; 12528 Py_ssize_t result; 12529 12530 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12531 &start, &end)) 12532 return NULL; 12533 12534 if (PyUnicode_READY(self) == -1) { 12535 Py_DECREF(substring); 12536 return NULL; 12537 } 12538 if (PyUnicode_READY(substring) == -1) { 12539 Py_DECREF(substring); 12540 return NULL; 12541 } 12542 12543 result = any_find_slice(-1, self, substring, start, end); 12544 12545 Py_DECREF(substring); 12546 12547 if (result == -2) 12548 return NULL; 12549 12550 return PyLong_FromSsize_t(result); 12551} 12552 12553PyDoc_STRVAR(rindex__doc__, 12554 "S.rindex(sub[, start[, end]]) -> int\n\ 12555\n\ 12556Like S.rfind() but raise ValueError when the substring is not found."); 12557 12558static PyObject * 12559unicode_rindex(PyObject *self, PyObject *args) 12560{ 12561 PyObject *substring; 12562 Py_ssize_t start; 12563 Py_ssize_t end; 12564 Py_ssize_t result; 12565 12566 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12567 &start, &end)) 12568 return NULL; 12569 12570 if (PyUnicode_READY(self) == -1) { 12571 Py_DECREF(substring); 12572 return NULL; 12573 } 12574 if (PyUnicode_READY(substring) == -1) { 12575 Py_DECREF(substring); 12576 return NULL; 12577 } 12578 12579 result = any_find_slice(-1, self, substring, start, end); 12580 12581 
Py_DECREF(substring); 12582 12583 if (result == -2) 12584 return NULL; 12585 12586 if (result < 0) { 12587 PyErr_SetString(PyExc_ValueError, "substring not found"); 12588 return NULL; 12589 } 12590 12591 return PyLong_FromSsize_t(result); 12592} 12593 12594PyDoc_STRVAR(rjust__doc__, 12595 "S.rjust(width[, fillchar]) -> str\n\ 12596\n\ 12597Return S right-justified in a string of length width. Padding is\n\ 12598done using the specified fill character (default is a space)."); 12599 12600static PyObject * 12601unicode_rjust(PyObject *self, PyObject *args) 12602{ 12603 Py_ssize_t width; 12604 Py_UCS4 fillchar = ' '; 12605 12606 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12607 return NULL; 12608 12609 if (PyUnicode_READY(self) == -1) 12610 return NULL; 12611 12612 if (PyUnicode_GET_LENGTH(self) >= width) 12613 return unicode_result_unchanged(self); 12614 12615 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12616} 12617 12618PyObject * 12619PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12620{ 12621 PyObject *result; 12622 12623 s = PyUnicode_FromObject(s); 12624 if (s == NULL) 12625 return NULL; 12626 if (sep != NULL) { 12627 sep = PyUnicode_FromObject(sep); 12628 if (sep == NULL) { 12629 Py_DECREF(s); 12630 return NULL; 12631 } 12632 } 12633 12634 result = split(s, sep, maxsplit); 12635 12636 Py_DECREF(s); 12637 Py_XDECREF(sep); 12638 return result; 12639} 12640 12641PyDoc_STRVAR(split__doc__, 12642 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12643\n\ 12644Return a list of the words in S, using sep as the\n\ 12645delimiter string. If maxsplit is given, at most maxsplit\n\ 12646splits are done. 
If sep is not specified or is None, any\n\ 12647whitespace string is a separator and empty strings are\n\ 12648removed from the result."); 12649 12650static PyObject* 12651unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12652{ 12653 static char *kwlist[] = {"sep", "maxsplit", 0}; 12654 PyObject *substring = Py_None; 12655 Py_ssize_t maxcount = -1; 12656 12657 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12658 kwlist, &substring, &maxcount)) 12659 return NULL; 12660 12661 if (substring == Py_None) 12662 return split(self, NULL, maxcount); 12663 else if (PyUnicode_Check(substring)) 12664 return split(self, substring, maxcount); 12665 else 12666 return PyUnicode_Split(self, substring, maxcount); 12667} 12668 12669PyObject * 12670PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12671{ 12672 PyObject* str_obj; 12673 PyObject* sep_obj; 12674 PyObject* out; 12675 int kind1, kind2, kind; 12676 void *buf1 = NULL, *buf2 = NULL; 12677 Py_ssize_t len1, len2; 12678 12679 str_obj = PyUnicode_FromObject(str_in); 12680 if (!str_obj) 12681 return NULL; 12682 sep_obj = PyUnicode_FromObject(sep_in); 12683 if (!sep_obj) { 12684 Py_DECREF(str_obj); 12685 return NULL; 12686 } 12687 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12688 Py_DECREF(sep_obj); 12689 Py_DECREF(str_obj); 12690 return NULL; 12691 } 12692 12693 kind1 = PyUnicode_KIND(str_obj); 12694 kind2 = PyUnicode_KIND(sep_obj); 12695 kind = Py_MAX(kind1, kind2); 12696 buf1 = PyUnicode_DATA(str_obj); 12697 if (kind1 != kind) 12698 buf1 = _PyUnicode_AsKind(str_obj, kind); 12699 if (!buf1) 12700 goto onError; 12701 buf2 = PyUnicode_DATA(sep_obj); 12702 if (kind2 != kind) 12703 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12704 if (!buf2) 12705 goto onError; 12706 len1 = PyUnicode_GET_LENGTH(str_obj); 12707 len2 = PyUnicode_GET_LENGTH(sep_obj); 12708 12709 switch (PyUnicode_KIND(str_obj)) { 12710 case PyUnicode_1BYTE_KIND: 12711 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12712 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12713 else 12714 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12715 break; 12716 case PyUnicode_2BYTE_KIND: 12717 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12718 break; 12719 case PyUnicode_4BYTE_KIND: 12720 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12721 break; 12722 default: 12723 assert(0); 12724 out = 0; 12725 } 12726 12727 Py_DECREF(sep_obj); 12728 Py_DECREF(str_obj); 12729 if (kind1 != kind) 12730 PyMem_Free(buf1); 12731 if (kind2 != kind) 12732 PyMem_Free(buf2); 12733 12734 return out; 12735 onError: 12736 Py_DECREF(sep_obj); 12737 Py_DECREF(str_obj); 12738 if (kind1 != kind && buf1) 12739 PyMem_Free(buf1); 12740 if (kind2 != kind && buf2) 12741 PyMem_Free(buf2); 12742 return NULL; 12743} 12744 12745 12746PyObject * 12747PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12748{ 12749 PyObject* str_obj; 12750 PyObject* sep_obj; 12751 PyObject* out; 12752 int kind1, kind2, kind; 12753 void *buf1 = NULL, *buf2 = NULL; 12754 Py_ssize_t len1, len2; 12755 12756 str_obj = PyUnicode_FromObject(str_in); 12757 if (!str_obj) 12758 return NULL; 12759 sep_obj = PyUnicode_FromObject(sep_in); 12760 if (!sep_obj) { 12761 Py_DECREF(str_obj); 12762 return NULL; 12763 } 12764 12765 kind1 = PyUnicode_KIND(str_in); 12766 kind2 = PyUnicode_KIND(sep_obj); 12767 kind = Py_MAX(kind1, kind2); 12768 buf1 = PyUnicode_DATA(str_in); 12769 if (kind1 != kind) 12770 buf1 = _PyUnicode_AsKind(str_in, kind); 12771 if (!buf1) 12772 goto onError; 12773 buf2 = PyUnicode_DATA(sep_obj); 12774 if (kind2 != kind) 12775 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12776 if (!buf2) 12777 goto onError; 12778 len1 = PyUnicode_GET_LENGTH(str_obj); 12779 len2 = PyUnicode_GET_LENGTH(sep_obj); 12780 12781 switch (PyUnicode_KIND(str_in)) { 12782 case PyUnicode_1BYTE_KIND: 12783 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12784 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12785 else 12786 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12787 break; 12788 case PyUnicode_2BYTE_KIND: 12789 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12790 break; 12791 case PyUnicode_4BYTE_KIND: 12792 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12793 break; 12794 default: 12795 assert(0); 12796 out = 0; 12797 } 12798 12799 Py_DECREF(sep_obj); 12800 Py_DECREF(str_obj); 12801 if (kind1 != kind) 12802 PyMem_Free(buf1); 12803 if (kind2 != kind) 12804 PyMem_Free(buf2); 12805 12806 return out; 12807 onError: 12808 Py_DECREF(sep_obj); 12809 Py_DECREF(str_obj); 12810 if (kind1 != kind && buf1) 12811 PyMem_Free(buf1); 12812 if (kind2 != kind && buf2) 12813 PyMem_Free(buf2); 12814 return NULL; 12815} 12816 12817PyDoc_STRVAR(partition__doc__, 12818 "S.partition(sep) -> (head, sep, tail)\n\ 12819\n\ 12820Search for the separator sep in S, and return the part before it,\n\ 12821the separator itself, and the part after it. If the separator is not\n\ 12822found, return S and two empty strings."); 12823 12824static PyObject* 12825unicode_partition(PyObject *self, PyObject *separator) 12826{ 12827 return PyUnicode_Partition(self, separator); 12828} 12829 12830PyDoc_STRVAR(rpartition__doc__, 12831 "S.rpartition(sep) -> (head, sep, tail)\n\ 12832\n\ 12833Search for the separator sep in S, starting at the end of S, and return\n\ 12834the part before it, the separator itself, and the part after it. 
If the\n\ 12835separator is not found, return two empty strings and S."); 12836 12837static PyObject* 12838unicode_rpartition(PyObject *self, PyObject *separator) 12839{ 12840 return PyUnicode_RPartition(self, separator); 12841} 12842 12843PyObject * 12844PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12845{ 12846 PyObject *result; 12847 12848 s = PyUnicode_FromObject(s); 12849 if (s == NULL) 12850 return NULL; 12851 if (sep != NULL) { 12852 sep = PyUnicode_FromObject(sep); 12853 if (sep == NULL) { 12854 Py_DECREF(s); 12855 return NULL; 12856 } 12857 } 12858 12859 result = rsplit(s, sep, maxsplit); 12860 12861 Py_DECREF(s); 12862 Py_XDECREF(sep); 12863 return result; 12864} 12865 12866PyDoc_STRVAR(rsplit__doc__, 12867 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12868\n\ 12869Return a list of the words in S, using sep as the\n\ 12870delimiter string, starting at the end of the string and\n\ 12871working to the front. If maxsplit is given, at most maxsplit\n\ 12872splits are done. 
If sep is not specified, any whitespace string\n\ 12873is a separator."); 12874 12875static PyObject* 12876unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12877{ 12878 static char *kwlist[] = {"sep", "maxsplit", 0}; 12879 PyObject *substring = Py_None; 12880 Py_ssize_t maxcount = -1; 12881 12882 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12883 kwlist, &substring, &maxcount)) 12884 return NULL; 12885 12886 if (substring == Py_None) 12887 return rsplit(self, NULL, maxcount); 12888 else if (PyUnicode_Check(substring)) 12889 return rsplit(self, substring, maxcount); 12890 else 12891 return PyUnicode_RSplit(self, substring, maxcount); 12892} 12893 12894PyDoc_STRVAR(splitlines__doc__, 12895 "S.splitlines([keepends]) -> list of strings\n\ 12896\n\ 12897Return a list of the lines in S, breaking at line boundaries.\n\ 12898Line breaks are not included in the resulting list unless keepends\n\ 12899is given and true."); 12900 12901static PyObject* 12902unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12903{ 12904 static char *kwlist[] = {"keepends", 0}; 12905 int keepends = 0; 12906 12907 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12908 kwlist, &keepends)) 12909 return NULL; 12910 12911 return PyUnicode_Splitlines(self, keepends); 12912} 12913 12914static 12915PyObject *unicode_str(PyObject *self) 12916{ 12917 return unicode_result_unchanged(self); 12918} 12919 12920PyDoc_STRVAR(swapcase__doc__, 12921 "S.swapcase() -> str\n\ 12922\n\ 12923Return a copy of S with uppercase characters converted to lowercase\n\ 12924and vice versa."); 12925 12926static PyObject* 12927unicode_swapcase(PyObject *self) 12928{ 12929 if (PyUnicode_READY(self) == -1) 12930 return NULL; 12931 return case_operation(self, do_swapcase); 12932} 12933 12934/*[clinic input] 12935 12936@staticmethod 12937str.maketrans as unicode_maketrans 12938 12939 x: object 12940 12941 y: unicode=NULL 12942 12943 z: unicode=NULL 12944 12945 / 12946 
12947Return a translation table usable for str.translate(). 12948 12949If there is only one argument, it must be a dictionary mapping Unicode 12950ordinals (integers) or characters to Unicode ordinals, strings or None. 12951Character keys will be then converted to ordinals. 12952If there are two arguments, they must be strings of equal length, and 12953in the resulting dictionary, each character in x will be mapped to the 12954character at the same position in y. If there is a third argument, it 12955must be a string, whose characters will be mapped to None in the result. 12956[clinic start generated code]*/ 12957 12958PyDoc_STRVAR(unicode_maketrans__doc__, 12959"maketrans(x, y=None, z=None, /)\n" 12960"--\n" 12961"\n" 12962"Return a translation table usable for str.translate().\n" 12963"\n" 12964"If there is only one argument, it must be a dictionary mapping Unicode\n" 12965"ordinals (integers) or characters to Unicode ordinals, strings or None.\n" 12966"Character keys will be then converted to ordinals.\n" 12967"If there are two arguments, they must be strings of equal length, and\n" 12968"in the resulting dictionary, each character in x will be mapped to the\n" 12969"character at the same position in y. 
If there is a third argument, it\n" 12970"must be a string, whose characters will be mapped to None in the result."); 12971 12972#define UNICODE_MAKETRANS_METHODDEF \ 12973 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__}, 12974 12975static PyObject * 12976unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z); 12977 12978static PyObject * 12979unicode_maketrans(void *null, PyObject *args) 12980{ 12981 PyObject *return_value = NULL; 12982 PyObject *x; 12983 PyObject *y = NULL; 12984 PyObject *z = NULL; 12985 12986 if (!PyArg_ParseTuple(args, 12987 "O|UU:maketrans", 12988 &x, &y, &z)) 12989 goto exit; 12990 return_value = unicode_maketrans_impl(x, y, z); 12991 12992exit: 12993 return return_value; 12994} 12995 12996static PyObject * 12997unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z) 12998/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/ 12999{ 13000 PyObject *new = NULL, *key, *value; 13001 Py_ssize_t i = 0; 13002 int res; 13003 13004 new = PyDict_New(); 13005 if (!new) 13006 return NULL; 13007 if (y != NULL) { 13008 int x_kind, y_kind, z_kind; 13009 void *x_data, *y_data, *z_data; 13010 13011 /* x must be a string too, of equal length */ 13012 if (!PyUnicode_Check(x)) { 13013 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 13014 "be a string if there is a second argument"); 13015 goto err; 13016 } 13017 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 13018 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 13019 "arguments must have equal length"); 13020 goto err; 13021 } 13022 /* create entries for translating chars in x to those in y */ 13023 x_kind = PyUnicode_KIND(x); 13024 y_kind = PyUnicode_KIND(y); 13025 x_data = PyUnicode_DATA(x); 13026 y_data = PyUnicode_DATA(y); 13027 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 13028 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 13029 if (!key) 13030 goto err; 13031 
value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 13032 if (!value) { 13033 Py_DECREF(key); 13034 goto err; 13035 } 13036 res = PyDict_SetItem(new, key, value); 13037 Py_DECREF(key); 13038 Py_DECREF(value); 13039 if (res < 0) 13040 goto err; 13041 } 13042 /* create entries for deleting chars in z */ 13043 if (z != NULL) { 13044 z_kind = PyUnicode_KIND(z); 13045 z_data = PyUnicode_DATA(z); 13046 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 13047 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 13048 if (!key) 13049 goto err; 13050 res = PyDict_SetItem(new, key, Py_None); 13051 Py_DECREF(key); 13052 if (res < 0) 13053 goto err; 13054 } 13055 } 13056 } else { 13057 int kind; 13058 void *data; 13059 13060 /* x must be a dict */ 13061 if (!PyDict_CheckExact(x)) { 13062 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 13063 "to maketrans it must be a dict"); 13064 goto err; 13065 } 13066 /* copy entries into the new dict, converting string keys to int keys */ 13067 while (PyDict_Next(x, &i, &key, &value)) { 13068 if (PyUnicode_Check(key)) { 13069 /* convert string keys to integer keys */ 13070 PyObject *newkey; 13071 if (PyUnicode_GET_LENGTH(key) != 1) { 13072 PyErr_SetString(PyExc_ValueError, "string keys in translate " 13073 "table must be of length 1"); 13074 goto err; 13075 } 13076 kind = PyUnicode_KIND(key); 13077 data = PyUnicode_DATA(key); 13078 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 13079 if (!newkey) 13080 goto err; 13081 res = PyDict_SetItem(new, newkey, value); 13082 Py_DECREF(newkey); 13083 if (res < 0) 13084 goto err; 13085 } else if (PyLong_Check(key)) { 13086 /* just keep integer keys */ 13087 if (PyDict_SetItem(new, key, value) < 0) 13088 goto err; 13089 } else { 13090 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 13091 "be strings or integers"); 13092 goto err; 13093 } 13094 } 13095 } 13096 return new; 13097 err: 13098 Py_DECREF(new); 13099 return NULL; 13100} 13101 
13102PyDoc_STRVAR(translate__doc__, 13103 "S.translate(table) -> str\n\ 13104\n\ 13105Return a copy of the string S, where all characters have been mapped\n\ 13106through the given translation table, which must be a mapping of\n\ 13107Unicode ordinals to Unicode ordinals, strings, or None.\n\ 13108Unmapped characters are left untouched. Characters mapped to None\n\ 13109are deleted."); 13110 13111static PyObject* 13112unicode_translate(PyObject *self, PyObject *table) 13113{ 13114 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 13115} 13116 13117PyDoc_STRVAR(upper__doc__, 13118 "S.upper() -> str\n\ 13119\n\ 13120Return a copy of S converted to uppercase."); 13121 13122static PyObject* 13123unicode_upper(PyObject *self) 13124{ 13125 if (PyUnicode_READY(self) == -1) 13126 return NULL; 13127 if (PyUnicode_IS_ASCII(self)) 13128 return ascii_upper_or_lower(self, 0); 13129 return case_operation(self, do_upper); 13130} 13131 13132PyDoc_STRVAR(zfill__doc__, 13133 "S.zfill(width) -> str\n\ 13134\n\ 13135Pad a numeric string S with zeros on the left, to fill a field\n\ 13136of the specified width. 
The string S is never truncated."); 13137 13138static PyObject * 13139unicode_zfill(PyObject *self, PyObject *args) 13140{ 13141 Py_ssize_t fill; 13142 PyObject *u; 13143 Py_ssize_t width; 13144 int kind; 13145 void *data; 13146 Py_UCS4 chr; 13147 13148 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 13149 return NULL; 13150 13151 if (PyUnicode_READY(self) == -1) 13152 return NULL; 13153 13154 if (PyUnicode_GET_LENGTH(self) >= width) 13155 return unicode_result_unchanged(self); 13156 13157 fill = width - PyUnicode_GET_LENGTH(self); 13158 13159 u = pad(self, fill, 0, '0'); 13160 13161 if (u == NULL) 13162 return NULL; 13163 13164 kind = PyUnicode_KIND(u); 13165 data = PyUnicode_DATA(u); 13166 chr = PyUnicode_READ(kind, data, fill); 13167 13168 if (chr == '+' || chr == '-') { 13169 /* move sign to beginning of string */ 13170 PyUnicode_WRITE(kind, data, 0, chr); 13171 PyUnicode_WRITE(kind, data, fill, '0'); 13172 } 13173 13174 assert(_PyUnicode_CheckConsistency(u, 1)); 13175 return u; 13176} 13177 13178#if 0 13179static PyObject * 13180unicode__decimal2ascii(PyObject *self) 13181{ 13182 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 13183} 13184#endif 13185 13186PyDoc_STRVAR(startswith__doc__, 13187 "S.startswith(prefix[, start[, end]]) -> bool\n\ 13188\n\ 13189Return True if S starts with the specified prefix, False otherwise.\n\ 13190With optional start, test S beginning at that position.\n\ 13191With optional end, stop comparing S at that position.\n\ 13192prefix can also be a tuple of strings to try."); 13193 13194static PyObject * 13195unicode_startswith(PyObject *self, 13196 PyObject *args) 13197{ 13198 PyObject *subobj; 13199 PyObject *substring; 13200 Py_ssize_t start = 0; 13201 Py_ssize_t end = PY_SSIZE_T_MAX; 13202 int result; 13203 13204 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 13205 return NULL; 13206 if (PyTuple_Check(subobj)) { 13207 Py_ssize_t i; 13208 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13209 
substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 13210 if (substring == NULL) 13211 return NULL; 13212 result = tailmatch(self, substring, start, end, -1); 13213 Py_DECREF(substring); 13214 if (result == -1) 13215 return NULL; 13216 if (result) { 13217 Py_RETURN_TRUE; 13218 } 13219 } 13220 /* nothing matched */ 13221 Py_RETURN_FALSE; 13222 } 13223 substring = PyUnicode_FromObject(subobj); 13224 if (substring == NULL) { 13225 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13226 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 13227 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13228 return NULL; 13229 } 13230 result = tailmatch(self, substring, start, end, -1); 13231 Py_DECREF(substring); 13232 if (result == -1) 13233 return NULL; 13234 return PyBool_FromLong(result); 13235} 13236 13237 13238PyDoc_STRVAR(endswith__doc__, 13239 "S.endswith(suffix[, start[, end]]) -> bool\n\ 13240\n\ 13241Return True if S ends with the specified suffix, False otherwise.\n\ 13242With optional start, test S beginning at that position.\n\ 13243With optional end, stop comparing S at that position.\n\ 13244suffix can also be a tuple of strings to try."); 13245 13246static PyObject * 13247unicode_endswith(PyObject *self, 13248 PyObject *args) 13249{ 13250 PyObject *subobj; 13251 PyObject *substring; 13252 Py_ssize_t start = 0; 13253 Py_ssize_t end = PY_SSIZE_T_MAX; 13254 int result; 13255 13256 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 13257 return NULL; 13258 if (PyTuple_Check(subobj)) { 13259 Py_ssize_t i; 13260 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13261 substring = PyUnicode_FromObject( 13262 PyTuple_GET_ITEM(subobj, i)); 13263 if (substring == NULL) 13264 return NULL; 13265 result = tailmatch(self, substring, start, end, +1); 13266 Py_DECREF(substring); 13267 if (result == -1) 13268 return NULL; 13269 if (result) { 13270 Py_RETURN_TRUE; 13271 } 13272 } 13273 Py_RETURN_FALSE; 13274 } 13275 substring = 
PyUnicode_FromObject(subobj);
    if (substring == NULL) {
        if (PyErr_ExceptionMatches(PyExc_TypeError))
            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
        return NULL;
    }
    result = tailmatch(self, substring, start, end, +1);
    Py_DECREF(substring);
    if (result == -1)
        return NULL;
    return PyBool_FromLong(result);
}

/* Refresh the fields the writer caches about its buffer (size, maxchar,
   data, kind) from writer->buffer.  Must be called each time
   writer->buffer is replaced. */
Py_LOCAL_INLINE(void)
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
{
    if (!writer->readonly)
        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
    else {
        /* Copy-on-write mode: set buffer size to 0 so
         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
         * next write. */
        writer->size = 0;
    }
    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
    writer->data = PyUnicode_DATA(writer->buffer);
    writer->kind = PyUnicode_KIND(writer->buffer);
}

/* Put a writer into its initial state: every field zeroed (so no buffer),
   except min_char which is set to 127.  In debug builds 'kind' is set to a
   value that is not a valid kind so that use before the first prepare is
   noticeable. */
void
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
{
    memset(writer, 0, sizeof(*writer));
#ifdef Py_DEBUG
    writer->kind = 5;    /* invalid kind */
#endif
    writer->min_char = 127;
}

/* Make sure the writer's buffer can receive 'length' more characters at
   writer->pos, each no greater than 'maxchar'.

   Three cases are handled:
   - no buffer yet: allocate one;
   - buffer too small: allocate a new, wider one and copy (when the maximum
     character grows or the buffer is read-only), otherwise resize in place;
   - buffer large enough but too narrow: allocate a wider buffer of the same
     size and copy.

   'length' must be > 0 (asserted).
   Return 0 on success; raise an exception and return -1 on error. */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(length > 0);

    /* writer->pos + length must not overflow Py_ssize_t */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* never allocate narrower than the writer's configured minimum */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            /* the writer now owns a private copy */
            writer->readonly = 0;
        }
        else {
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* enough room, but the buffer cannot store 'maxchar': widen only */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_DECREF(writer->buffer);
        writer->buffer = newbuffer;
    }
    _PyUnicodeWriter_Update(writer);
    return 0;

#undef OVERALLOCATE_FACTOR
}

/* Append the single character 'ch' at writer->pos and advance pos.
   Return 0 on success, -1 if preparing the buffer failed. */
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
        return -1;
    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
    writer->pos++;
    return 0;
}

/* Non-inline entry point for _PyUnicodeWriter_WriteCharInline(). */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
}

/* Append the whole string 'str' to the writer.

   An empty 'str' is a no-op.  If the writer has no buffer yet and
   overallocation is disabled, 'str' itself becomes the buffer (a new
   reference is taken and the writer is flagged read-only) instead of being
   copied.

   Return 0 on success, -1 on error. */
int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
{
    Py_UCS4 maxchar;
    Py_ssize_t len;

    if (PyUnicode_READY(str) == -1)
        return -1;
    len = PyUnicode_GET_LENGTH(str);
    if (len == 0)
        return 0;
    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
        if (writer->buffer == NULL && !writer->overallocate) {
            /* share 'str' rather than copying it */
            writer->readonly = 1;
            Py_INCREF(str);
            writer->buffer = str;
            _PyUnicodeWriter_Update(writer);
            writer->pos += len;
            return 0;
        }
        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
            return -1;
    }
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                  str, 0, len);
    writer->pos += len;
    return 0;
}

/* Append the characters str[start:end] to the writer.

   The bounds are only checked by assertions: 0 <= start <= end <= len(str).
   When the range covers the whole string the call is forwarded to
   _PyUnicodeWriter_WriteStr().

   Return 0 on success, -1 on error. */
int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
                                Py_ssize_t start, Py_ssize_t end)
{
    Py_UCS4 maxchar;
    Py_ssize_t len;

    if (PyUnicode_READY(str) == -1)
        return -1;

    assert(0 <= start);
    assert(end <= PyUnicode_GET_LENGTH(str));
    assert(start <= end);

    if (end == 0)
        return 0;

    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
        return _PyUnicodeWriter_WriteStr(writer, str);

    /* Only scan the substring for its real maximum character when the
       string as a whole could force the buffer to be widened. */
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
        maxchar = _PyUnicode_FindMaxChar(str, start, end);
    else
        maxchar = writer->maxchar;
    len = end - start;

    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
        return -1;

    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                  str, start, len);
    writer->pos += len;
    return 0;
}

/* Append 'len' bytes of the ASCII-only C string 'ascii' to the writer.
   Pass len == -1 to have the length computed with strlen().

   That every byte is < 128 is only checked by an assertion.  If the writer
   has no buffer yet and overallocation is disabled, a new string object is
   created from the bytes and installed as a read-only buffer.

   Return 0 on success, -1 on error. */
int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
                                  const char *ascii, Py_ssize_t len)
{
    if (len == -1)
        len = strlen(ascii);

    assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);

    if (writer->buffer == NULL && !writer->overallocate) {
        PyObject *str;

        str = _PyUnicode_FromASCII(ascii, len);
        if (str == NULL)
            return -1;

        writer->readonly = 1;
        writer->buffer = str;
        _PyUnicodeWriter_Update(writer);
        writer->pos += len;
        return 0;
    }

    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
        return -1;

    /* copy the bytes, widening them to the buffer's character size */
    switch (writer->kind)
    {
    case PyUnicode_1BYTE_KIND:
    {
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
        Py_UCS1 *data = writer->data;

        Py_MEMCPY(data + writer->pos, str, len);
        break;
    }
    case PyUnicode_2BYTE_KIND:
    {
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS2,
            ascii, ascii + len,
            (Py_UCS2 *)writer->data + writer->pos);
        break;
    }
    case PyUnicode_4BYTE_KIND:
    {
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS4,
            ascii, ascii + len,
            (Py_UCS4 *)writer->data + writer->pos);
        break;
    }
    default:
        assert(0);
    }

    writer->pos += len;
    return 0;
}

/* Append 'len' bytes of 'str' to the writer, one character per byte.
   Unlike _PyUnicodeWriter_WriteASCIIString(), len == -1 gets no special
   treatment here.

   Return 0 on success, -1 on error. */
int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
                                   const char *str, Py_ssize_t len)
{
    Py_UCS4 maxchar;

    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
        return -1;
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
    writer->pos += len;
    return 0;
}

/* Hand the written string over to the caller and detach it from the
   writer (writer->buffer is NULL afterwards on every path).

   - nothing written: the buffer is released and the empty string is
     returned;
   - read-only buffer: the buffer is returned as is;
   - otherwise the buffer is shrunk to writer->pos characters if needed.

   Return NULL if shrinking the buffer failed. */
PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
{
    PyObject *str;
    if (writer->pos == 0) {
        Py_CLEAR(writer->buffer);
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (writer->readonly) {
        str = writer->buffer;
        writer->buffer = NULL;
        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
        return str;
    }
    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
        PyObject *newbuffer;
        newbuffer = resize_compact(writer->buffer, writer->pos);
        if (newbuffer == NULL) {
            Py_CLEAR(writer->buffer);
            return NULL;
        }
        writer->buffer = newbuffer;
    }
    str = writer->buffer;
    writer->buffer = NULL;
    assert(_PyUnicode_CheckConsistency(str, 1));
    return unicode_result_ready(str);
}

/* Release the writer's buffer, if any.  Used on error paths instead of
   _PyUnicodeWriter_Finish(). */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}

#include "stringlib/unicode_format.h"

PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");

/* str.__format__(format_spec): format 'self' according to the str
   argument 'format_spec' through a _PyUnicodeWriter.
   Return a new str, or NULL with an exception set. */
static PyObject *
unicode__format__(PyObject* self, PyObject* args)
{
    PyObject *format_spec;
    _PyUnicodeWriter writer;
    int ret;

    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
        return NULL;

    if (PyUnicode_READY(self) == -1)
        return NULL;
    _PyUnicodeWriter_Init(&writer);
    ret = _PyUnicode_FormatAdvancedWriter(&writer,
                                          self, format_spec, 0,
                                          PyUnicode_GET_LENGTH(format_spec));
    if (ret == -1) {
        /* drop whatever was written so far */
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
}

PyDoc_STRVAR(p_format__doc__,
             "S.__format__(format_spec) -> str\n\
\n\
Return a formatted version of S as described by format_spec.");

/* str.__sizeof__(): return, as a Python int, the number of bytes accounted
   to the string object: the base structure, the character data, and the
   separately allocated wstr / UTF-8 representations when present. */
static PyObject *
unicode__sizeof__(PyObject *v)
{
    Py_ssize_t size;

    /* If it's a compact object, account for base structure +
       character data. */
    if (PyUnicode_IS_COMPACT_ASCII(v))
        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
    else if (PyUnicode_IS_COMPACT(v))
        size = sizeof(PyCompactUnicodeObject) +
            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
    else {
        /* If it is a two-block object, account for base object, and
           for character block if present. */
        size = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(v))
            size += (PyUnicode_GET_LENGTH(v) + 1) *
                PyUnicode_KIND(v);
    }
    /* If the wstr pointer is present, account for it unless it is shared
       with the data pointer. Check if the data is not shared. */
    if (_PyUnicode_HAS_WSTR_MEMORY(v))
        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
    if (_PyUnicode_HAS_UTF8_MEMORY(v))
        size += PyUnicode_UTF8_LENGTH(v) + 1;

    return PyLong_FromSsize_t(size);
}

PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");

/* str.__getnewargs__(): return a 1-tuple holding a copy of the string made
   with _PyUnicode_Copy().  The "N" format unit hands the reference to the
   copy over to the tuple. */
static PyObject *
unicode_getnewargs(PyObject *v)
{
    PyObject *copy = _PyUnicode_Copy(v);
    if (!copy)
        return NULL;
    return Py_BuildValue("(N)", copy);
}

/* Method table of the str type.  The array is terminated by the
   {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs,
     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines,
     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    UNICODE_MAKETRANS_METHODDEF
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};

/* nb_remainder slot: 'v % w'.  Return NotImplemented when the left operand
   is not a str, otherwise the result of PyUnicode_Format(v, w). */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (!PyUnicode_Check(v))
        Py_RETURN_NOTIMPLEMENTED;
    return PyUnicode_Format(v, w);
}

/* Number protocol of str: only the remainder slot (%-formatting) is set. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};

/* Sequence protocol of str. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};

/* mp_subscript slot: self[item].

   - 'item' supporting the index protocol: a negative index is offset by
     the string length, then unicode_getitem() does the lookup;
   - 'item' a slice: empty, whole-string and step == 1 slices take
     shortcuts; any other step copies the selected characters one by one
     into a new string;
   - anything else: TypeError.

   Return a new reference, or NULL with an exception set. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (PyUnicode_READY(self) == -1)
        return NULL;

    if (PyIndex_Check(item)) {
        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (i == -1 && PyErr_Occurred())
            return NULL;
        if (i < 0)
            i += PyUnicode_GET_LENGTH(self);
        return unicode_getitem(self, i);
    } else if (PySlice_Check(item)) {
        Py_ssize_t start, stop, step, slicelength, cur, i;
        PyObject *result;
        void *src_data, *dest_data;
        int src_kind, dest_kind;
        Py_UCS4 ch, max_char, kind_limit;

        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
                                 &start, &stop, &step, &slicelength) < 0) {
            return NULL;
        }

        if (slicelength <= 0) {
            _Py_RETURN_UNICODE_EMPTY();
        } else if (start == 0 && step == 1 &&
                   slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        } else if (step == 1) {
            return PyUnicode_Substring(self,
                                       start, start + slicelength);
        }
        /* General case */
        src_kind = PyUnicode_KIND(self);
        src_data = PyUnicode_DATA(self);
        if (!PyUnicode_IS_ASCII(self)) {
            /* First pass: find the largest selected character so that the
               result can be allocated with a fitting maxchar.  The scan
               stops early once kind_maxchar_limit(src_kind) is reached. */
            kind_limit = kind_maxchar_limit(src_kind);
            max_char = 0;
            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
                ch = PyUnicode_READ(src_kind, src_data, cur);
                if (ch > max_char) {
                    max_char = ch;
                    if (max_char >= kind_limit)
                        break;
                }
            }
        }
        else
            max_char = 127;
        result = PyUnicode_New(slicelength, max_char);
        if (result == NULL)
            return NULL;
        dest_kind = PyUnicode_KIND(result);
        dest_data = PyUnicode_DATA(result);

        /* Second pass: copy the selected characters. */
        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
        }
        assert(_PyUnicode_CheckConsistency(result, 1));
        return result;
    } else {
        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
        return NULL;
    }
}

/* Mapping protocol of str: length and subscript, no item assignment. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};


/* Helpers for PyUnicode_Format() */

/* State shared by the helpers while one format string is processed. */
struct unicode_formatter_t {
    PyObject *args;             /* argument tuple, or the single argument
                                   when arglen < 0 */
    int args_owned;             /* non-zero if 'args' must be released */
    Py_ssize_t arglen, argidx;  /* argument count and next index */
    PyObject *dict;             /* mapping for "%(name)" lookups, or NULL */

    enum PyUnicode_Kind fmtkind;    /* kind of the format string data */
    Py_ssize_t fmtcnt, fmtpos;      /* characters left / read position */
    void *fmtdata;                  /* character data of the format string */
    PyObject *fmtstr;               /* the format string object */

    _PyUnicodeWriter writer;        /* receives the output */
};

/* One parsed conversion specification. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* character being parsed; finally the conversion */
    int flags;          /* bitmask of F_* flags */
    Py_ssize_t width;   /* minimum field width, -1 if not given */
    int prec;           /* precision, -1 if not given */
    int sign;           /* set for numeric conversions */
};

/* Return the next argument to format, or set TypeError and return NULL
   when the arguments are exhausted.

   When ctx->arglen is negative, ctx->args is itself the single argument
   and is returned as is; otherwise the item comes from PyTuple_GetItem(). */
static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t *ctx)
{
    Py_ssize_t argidx = ctx->argidx;

    if (argidx < ctx->arglen) {
        ctx->argidx++;
        if (ctx->arglen < 0)
            return ctx->args;
        else
            return PyTuple_GetItem(ctx->args, argidx);
    }
    PyErr_SetString(PyExc_TypeError,
                    "not enough arguments for format string");
    return NULL;
}

/* Format a float into the writer if the writer is not NULL, or into *p_output
   otherwise.

   The precision defaults to 6 when arg->prec is negative.  When formatting
   into *p_output, the result of _PyUnicode_FromASCII() is stored unchecked:
   *p_output may be NULL even though 0 is returned.

   Return 0 on success, raise an exception and return -1 on error. */
static int
formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
            PyObject **p_output,
            _PyUnicodeWriter *writer)
{
    char *p;
    double x;
    Py_ssize_t len;
    int prec;
    int dtoa_flags;

    x = PyFloat_AsDouble(v);
    if (x == -1.0 && PyErr_Occurred())
        return -1;

    prec = arg->prec;
    if (prec < 0)
        prec = 6;

    if (arg->flags & F_ALT)
        dtoa_flags = Py_DTSF_ALT;
    else
        dtoa_flags = 0;
    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
    if (p == NULL)
        return -1;
    len = strlen(p);
    if (writer) {
        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
            PyMem_Free(p);
            return -1;
        }
    }
    else
        *p_output = _PyUnicode_FromASCII(p, len);
    PyMem_Free(p);
    return 0;
}

/* formatlong() emulates the format codes d, u, o, x and X, and
 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
 * Python's regular ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0x" | "0X")? digit+
 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
 *         set in flags.  The case of hex digits will be correct,
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
13925 * val object to be converted 13926 * flags bitmask of format flags; only F_ALT is looked at 13927 * prec minimum number of digits; 0-fill on left if needed 13928 * type a character in [duoxX]; u acts the same as d 13929 * 13930 * CAUTION: o, x and X conversions on regular ints can never 13931 * produce a '-' sign, but can for Python's unbounded ints. 13932 */ 13933static PyObject* 13934formatlong(PyObject *val, struct unicode_format_arg_t *arg) 13935{ 13936 PyObject *result = NULL; 13937 char *buf; 13938 Py_ssize_t i; 13939 int sign; /* 1 if '-', else 0 */ 13940 int len; /* number of characters */ 13941 Py_ssize_t llen; 13942 int numdigits; /* len == numnondigits + numdigits */ 13943 int numnondigits = 0; 13944 int prec = arg->prec; 13945 int type = arg->ch; 13946 13947 /* Avoid exceeding SSIZE_T_MAX */ 13948 if (prec > INT_MAX-3) { 13949 PyErr_SetString(PyExc_OverflowError, 13950 "precision too large"); 13951 return NULL; 13952 } 13953 13954 assert(PyLong_Check(val)); 13955 13956 switch (type) { 13957 default: 13958 assert(!"'type' not in [diuoxX]"); 13959 case 'd': 13960 case 'i': 13961 case 'u': 13962 /* int and int subclasses should print numerically when a numeric */ 13963 /* format code is used (see issue18780) */ 13964 result = PyNumber_ToBase(val, 10); 13965 break; 13966 case 'o': 13967 numnondigits = 2; 13968 result = PyNumber_ToBase(val, 8); 13969 break; 13970 case 'x': 13971 case 'X': 13972 numnondigits = 2; 13973 result = PyNumber_ToBase(val, 16); 13974 break; 13975 } 13976 if (!result) 13977 return NULL; 13978 13979 assert(unicode_modifiable(result)); 13980 assert(PyUnicode_IS_READY(result)); 13981 assert(PyUnicode_IS_ASCII(result)); 13982 13983 /* To modify the string in-place, there can only be one reference. 
*/ 13984 if (Py_REFCNT(result) != 1) { 13985 Py_DECREF(result); 13986 PyErr_BadInternalCall(); 13987 return NULL; 13988 } 13989 buf = PyUnicode_DATA(result); 13990 llen = PyUnicode_GET_LENGTH(result); 13991 if (llen > INT_MAX) { 13992 Py_DECREF(result); 13993 PyErr_SetString(PyExc_ValueError, 13994 "string too large in _PyBytes_FormatLong"); 13995 return NULL; 13996 } 13997 len = (int)llen; 13998 sign = buf[0] == '-'; 13999 numnondigits += sign; 14000 numdigits = len - numnondigits; 14001 assert(numdigits > 0); 14002 14003 /* Get rid of base marker unless F_ALT */ 14004 if (((arg->flags & F_ALT) == 0 && 14005 (type == 'o' || type == 'x' || type == 'X'))) { 14006 assert(buf[sign] == '0'); 14007 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 14008 buf[sign+1] == 'o'); 14009 numnondigits -= 2; 14010 buf += 2; 14011 len -= 2; 14012 if (sign) 14013 buf[0] = '-'; 14014 assert(len == numnondigits + numdigits); 14015 assert(numdigits > 0); 14016 } 14017 14018 /* Fill with leading zeroes to meet minimum width. */ 14019 if (prec > numdigits) { 14020 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 14021 numnondigits + prec); 14022 char *b1; 14023 if (!r1) { 14024 Py_DECREF(result); 14025 return NULL; 14026 } 14027 b1 = PyBytes_AS_STRING(r1); 14028 for (i = 0; i < numnondigits; ++i) 14029 *b1++ = *buf++; 14030 for (i = 0; i < prec - numdigits; i++) 14031 *b1++ = '0'; 14032 for (i = 0; i < numdigits; i++) 14033 *b1++ = *buf++; 14034 *b1 = '\0'; 14035 Py_DECREF(result); 14036 result = r1; 14037 buf = PyBytes_AS_STRING(result); 14038 len = numnondigits + prec; 14039 } 14040 14041 /* Fix up case for hex conversions. */ 14042 if (type == 'X') { 14043 /* Need to convert all lower case letters to upper case. 14044 and need to convert 0x to 0X (and -0x to -0X). 
*/ 14045 for (i = 0; i < len; i++) 14046 if (buf[i] >= 'a' && buf[i] <= 'x') 14047 buf[i] -= 'a'-'A'; 14048 } 14049 if (!PyUnicode_Check(result) 14050 || buf != PyUnicode_DATA(result)) { 14051 PyObject *unicode; 14052 unicode = _PyUnicode_FromASCII(buf, len); 14053 Py_DECREF(result); 14054 result = unicode; 14055 } 14056 else if (len != PyUnicode_GET_LENGTH(result)) { 14057 if (PyUnicode_Resize(&result, len) < 0) 14058 Py_CLEAR(result); 14059 } 14060 return result; 14061} 14062 14063/* Format an integer or a float as an integer. 14064 * Return 1 if the number has been formatted into the writer, 14065 * 0 if the number has been formatted into *p_output 14066 * -1 and raise an exception on error */ 14067static int 14068mainformatlong(PyObject *v, 14069 struct unicode_format_arg_t *arg, 14070 PyObject **p_output, 14071 _PyUnicodeWriter *writer) 14072{ 14073 PyObject *iobj, *res; 14074 char type = (char)arg->ch; 14075 14076 if (!PyNumber_Check(v)) 14077 goto wrongtype; 14078 14079 /* make sure number is a type of integer for o, x, and X */ 14080 if (!PyLong_Check(v)) { 14081 if (type == 'o' || type == 'x' || type == 'X') { 14082 iobj = PyNumber_Index(v); 14083 if (iobj == NULL) { 14084 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14085 goto wrongtype; 14086 return -1; 14087 } 14088 } 14089 else { 14090 iobj = PyNumber_Long(v); 14091 if (iobj == NULL ) { 14092 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14093 goto wrongtype; 14094 return -1; 14095 } 14096 } 14097 assert(PyLong_Check(iobj)); 14098 } 14099 else { 14100 iobj = v; 14101 Py_INCREF(iobj); 14102 } 14103 14104 if (PyLong_CheckExact(v) 14105 && arg->width == -1 && arg->prec == -1 14106 && !(arg->flags & (F_SIGN | F_BLANK)) 14107 && type != 'X') 14108 { 14109 /* Fast path */ 14110 int alternate = arg->flags & F_ALT; 14111 int base; 14112 14113 switch(type) 14114 { 14115 default: 14116 assert(0 && "'type' not in [diuoxX]"); 14117 case 'd': 14118 case 'i': 14119 case 'u': 14120 base = 10; 14121 break; 14122 case 
'o': 14123 base = 8; 14124 break; 14125 case 'x': 14126 case 'X': 14127 base = 16; 14128 break; 14129 } 14130 14131 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 14132 Py_DECREF(iobj); 14133 return -1; 14134 } 14135 Py_DECREF(iobj); 14136 return 1; 14137 } 14138 14139 res = formatlong(iobj, arg); 14140 Py_DECREF(iobj); 14141 if (res == NULL) 14142 return -1; 14143 *p_output = res; 14144 return 0; 14145 14146wrongtype: 14147 switch(type) 14148 { 14149 case 'o': 14150 case 'x': 14151 case 'X': 14152 PyErr_Format(PyExc_TypeError, 14153 "%%%c format: an integer is required, " 14154 "not %.200s", 14155 type, Py_TYPE(v)->tp_name); 14156 break; 14157 default: 14158 PyErr_Format(PyExc_TypeError, 14159 "%%%c format: a number is required, " 14160 "not %.200s", 14161 type, Py_TYPE(v)->tp_name); 14162 break; 14163 } 14164 return -1; 14165} 14166 14167static Py_UCS4 14168formatchar(PyObject *v) 14169{ 14170 /* presume that the buffer is at least 3 characters long */ 14171 if (PyUnicode_Check(v)) { 14172 if (PyUnicode_GET_LENGTH(v) == 1) { 14173 return PyUnicode_READ_CHAR(v, 0); 14174 } 14175 goto onError; 14176 } 14177 else { 14178 PyObject *iobj; 14179 long x; 14180 /* make sure number is a type of integer */ 14181 if (!PyLong_Check(v)) { 14182 iobj = PyNumber_Index(v); 14183 if (iobj == NULL) { 14184 goto onError; 14185 } 14186 v = iobj; 14187 Py_DECREF(iobj); 14188 } 14189 /* Integer input truncated to a character */ 14190 x = PyLong_AsLong(v); 14191 if (x == -1 && PyErr_Occurred()) 14192 goto onError; 14193 14194 if (x < 0 || x > MAX_UNICODE) { 14195 PyErr_SetString(PyExc_OverflowError, 14196 "%c arg not in range(0x110000)"); 14197 return (Py_UCS4) -1; 14198 } 14199 14200 return (Py_UCS4) x; 14201 } 14202 14203 onError: 14204 PyErr_SetString(PyExc_TypeError, 14205 "%c requires int or char"); 14206 return (Py_UCS4) -1; 14207} 14208 14209/* Parse options of an argument: flags, width, precision. 14210 Handle also "%(name)" syntax. 
14211 14212 Return 0 if the argument has been formatted into arg->str. 14213 Return 1 if the argument has been written into ctx->writer, 14214 Raise an exception and return -1 on error. */ 14215static int 14216unicode_format_arg_parse(struct unicode_formatter_t *ctx, 14217 struct unicode_format_arg_t *arg) 14218{ 14219#define FORMAT_READ(ctx) \ 14220 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 14221 14222 PyObject *v; 14223 14224 if (arg->ch == '(') { 14225 /* Get argument value from a dictionary. Example: "%(name)s". */ 14226 Py_ssize_t keystart; 14227 Py_ssize_t keylen; 14228 PyObject *key; 14229 int pcount = 1; 14230 14231 if (ctx->dict == NULL) { 14232 PyErr_SetString(PyExc_TypeError, 14233 "format requires a mapping"); 14234 return -1; 14235 } 14236 ++ctx->fmtpos; 14237 --ctx->fmtcnt; 14238 keystart = ctx->fmtpos; 14239 /* Skip over balanced parentheses */ 14240 while (pcount > 0 && --ctx->fmtcnt >= 0) { 14241 arg->ch = FORMAT_READ(ctx); 14242 if (arg->ch == ')') 14243 --pcount; 14244 else if (arg->ch == '(') 14245 ++pcount; 14246 ctx->fmtpos++; 14247 } 14248 keylen = ctx->fmtpos - keystart - 1; 14249 if (ctx->fmtcnt < 0 || pcount > 0) { 14250 PyErr_SetString(PyExc_ValueError, 14251 "incomplete format key"); 14252 return -1; 14253 } 14254 key = PyUnicode_Substring(ctx->fmtstr, 14255 keystart, keystart + keylen); 14256 if (key == NULL) 14257 return -1; 14258 if (ctx->args_owned) { 14259 Py_DECREF(ctx->args); 14260 ctx->args_owned = 0; 14261 } 14262 ctx->args = PyObject_GetItem(ctx->dict, key); 14263 Py_DECREF(key); 14264 if (ctx->args == NULL) 14265 return -1; 14266 ctx->args_owned = 1; 14267 ctx->arglen = -1; 14268 ctx->argidx = -2; 14269 } 14270 14271 /* Parse flags. Example: "%+i" => flags=F_SIGN. 
*/ 14272 while (--ctx->fmtcnt >= 0) { 14273 arg->ch = FORMAT_READ(ctx); 14274 ctx->fmtpos++; 14275 switch (arg->ch) { 14276 case '-': arg->flags |= F_LJUST; continue; 14277 case '+': arg->flags |= F_SIGN; continue; 14278 case ' ': arg->flags |= F_BLANK; continue; 14279 case '#': arg->flags |= F_ALT; continue; 14280 case '0': arg->flags |= F_ZERO; continue; 14281 } 14282 break; 14283 } 14284 14285 /* Parse width. Example: "%10s" => width=10 */ 14286 if (arg->ch == '*') { 14287 v = unicode_format_getnextarg(ctx); 14288 if (v == NULL) 14289 return -1; 14290 if (!PyLong_Check(v)) { 14291 PyErr_SetString(PyExc_TypeError, 14292 "* wants int"); 14293 return -1; 14294 } 14295 arg->width = PyLong_AsSsize_t(v); 14296 if (arg->width == -1 && PyErr_Occurred()) 14297 return -1; 14298 if (arg->width < 0) { 14299 arg->flags |= F_LJUST; 14300 arg->width = -arg->width; 14301 } 14302 if (--ctx->fmtcnt >= 0) { 14303 arg->ch = FORMAT_READ(ctx); 14304 ctx->fmtpos++; 14305 } 14306 } 14307 else if (arg->ch >= '0' && arg->ch <= '9') { 14308 arg->width = arg->ch - '0'; 14309 while (--ctx->fmtcnt >= 0) { 14310 arg->ch = FORMAT_READ(ctx); 14311 ctx->fmtpos++; 14312 if (arg->ch < '0' || arg->ch > '9') 14313 break; 14314 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 14315 mixing signed and unsigned comparison. Since arg->ch is between 14316 '0' and '9', casting to int is safe. */ 14317 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 14318 PyErr_SetString(PyExc_ValueError, 14319 "width too big"); 14320 return -1; 14321 } 14322 arg->width = arg->width*10 + (arg->ch - '0'); 14323 } 14324 } 14325 14326 /* Parse precision. 
Example: "%.3f" => prec=3 */ 14327 if (arg->ch == '.') { 14328 arg->prec = 0; 14329 if (--ctx->fmtcnt >= 0) { 14330 arg->ch = FORMAT_READ(ctx); 14331 ctx->fmtpos++; 14332 } 14333 if (arg->ch == '*') { 14334 v = unicode_format_getnextarg(ctx); 14335 if (v == NULL) 14336 return -1; 14337 if (!PyLong_Check(v)) { 14338 PyErr_SetString(PyExc_TypeError, 14339 "* wants int"); 14340 return -1; 14341 } 14342 arg->prec = _PyLong_AsInt(v); 14343 if (arg->prec == -1 && PyErr_Occurred()) 14344 return -1; 14345 if (arg->prec < 0) 14346 arg->prec = 0; 14347 if (--ctx->fmtcnt >= 0) { 14348 arg->ch = FORMAT_READ(ctx); 14349 ctx->fmtpos++; 14350 } 14351 } 14352 else if (arg->ch >= '0' && arg->ch <= '9') { 14353 arg->prec = arg->ch - '0'; 14354 while (--ctx->fmtcnt >= 0) { 14355 arg->ch = FORMAT_READ(ctx); 14356 ctx->fmtpos++; 14357 if (arg->ch < '0' || arg->ch > '9') 14358 break; 14359 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 14360 PyErr_SetString(PyExc_ValueError, 14361 "precision too big"); 14362 return -1; 14363 } 14364 arg->prec = arg->prec*10 + (arg->ch - '0'); 14365 } 14366 } 14367 } 14368 14369 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 14370 if (ctx->fmtcnt >= 0) { 14371 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 14372 if (--ctx->fmtcnt >= 0) { 14373 arg->ch = FORMAT_READ(ctx); 14374 ctx->fmtpos++; 14375 } 14376 } 14377 } 14378 if (ctx->fmtcnt < 0) { 14379 PyErr_SetString(PyExc_ValueError, 14380 "incomplete format"); 14381 return -1; 14382 } 14383 return 0; 14384 14385#undef FORMAT_READ 14386} 14387 14388/* Format one argument. Supported conversion specifiers: 14389 14390 - "s", "r", "a": any type 14391 - "i", "d", "u": int or float 14392 - "o", "x", "X": int 14393 - "e", "E", "f", "F", "g", "G": float 14394 - "c": int or str (1 character) 14395 14396 When possible, the output is written directly into the Unicode writer 14397 (ctx->writer). A string is created when padding is required. 
14398 14399 Return 0 if the argument has been formatted into *p_str, 14400 1 if the argument has been written into ctx->writer, 14401 -1 on error. */ 14402static int 14403unicode_format_arg_format(struct unicode_formatter_t *ctx, 14404 struct unicode_format_arg_t *arg, 14405 PyObject **p_str) 14406{ 14407 PyObject *v; 14408 _PyUnicodeWriter *writer = &ctx->writer; 14409 14410 if (ctx->fmtcnt == 0) 14411 ctx->writer.overallocate = 0; 14412 14413 if (arg->ch == '%') { 14414 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 14415 return -1; 14416 return 1; 14417 } 14418 14419 v = unicode_format_getnextarg(ctx); 14420 if (v == NULL) 14421 return -1; 14422 14423 14424 switch (arg->ch) { 14425 case 's': 14426 case 'r': 14427 case 'a': 14428 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { 14429 /* Fast path */ 14430 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) 14431 return -1; 14432 return 1; 14433 } 14434 14435 if (PyUnicode_CheckExact(v) && arg->ch == 's') { 14436 *p_str = v; 14437 Py_INCREF(*p_str); 14438 } 14439 else { 14440 if (arg->ch == 's') 14441 *p_str = PyObject_Str(v); 14442 else if (arg->ch == 'r') 14443 *p_str = PyObject_Repr(v); 14444 else 14445 *p_str = PyObject_ASCII(v); 14446 } 14447 break; 14448 14449 case 'i': 14450 case 'd': 14451 case 'u': 14452 case 'o': 14453 case 'x': 14454 case 'X': 14455 { 14456 int ret = mainformatlong(v, arg, p_str, writer); 14457 if (ret != 0) 14458 return ret; 14459 arg->sign = 1; 14460 break; 14461 } 14462 14463 case 'e': 14464 case 'E': 14465 case 'f': 14466 case 'F': 14467 case 'g': 14468 case 'G': 14469 if (arg->width == -1 && arg->prec == -1 14470 && !(arg->flags & (F_SIGN | F_BLANK))) 14471 { 14472 /* Fast path */ 14473 if (formatfloat(v, arg, NULL, writer) == -1) 14474 return -1; 14475 return 1; 14476 } 14477 14478 arg->sign = 1; 14479 if (formatfloat(v, arg, p_str, NULL) == -1) 14480 return -1; 14481 break; 14482 14483 case 'c': 14484 { 14485 Py_UCS4 ch = formatchar(v); 
14486 if (ch == (Py_UCS4) -1) 14487 return -1; 14488 if (arg->width == -1 && arg->prec == -1) { 14489 /* Fast path */ 14490 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) 14491 return -1; 14492 return 1; 14493 } 14494 *p_str = PyUnicode_FromOrdinal(ch); 14495 break; 14496 } 14497 14498 default: 14499 PyErr_Format(PyExc_ValueError, 14500 "unsupported format character '%c' (0x%x) " 14501 "at index %zd", 14502 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', 14503 (int)arg->ch, 14504 ctx->fmtpos - 1); 14505 return -1; 14506 } 14507 if (*p_str == NULL) 14508 return -1; 14509 assert (PyUnicode_Check(*p_str)); 14510 return 0; 14511} 14512 14513static int 14514unicode_format_arg_output(struct unicode_formatter_t *ctx, 14515 struct unicode_format_arg_t *arg, 14516 PyObject *str) 14517{ 14518 Py_ssize_t len; 14519 enum PyUnicode_Kind kind; 14520 void *pbuf; 14521 Py_ssize_t pindex; 14522 Py_UCS4 signchar; 14523 Py_ssize_t buflen; 14524 Py_UCS4 maxchar; 14525 Py_ssize_t sublen; 14526 _PyUnicodeWriter *writer = &ctx->writer; 14527 Py_UCS4 fill; 14528 14529 fill = ' '; 14530 if (arg->sign && arg->flags & F_ZERO) 14531 fill = '0'; 14532 14533 if (PyUnicode_READY(str) == -1) 14534 return -1; 14535 14536 len = PyUnicode_GET_LENGTH(str); 14537 if ((arg->width == -1 || arg->width <= len) 14538 && (arg->prec == -1 || arg->prec >= len) 14539 && !(arg->flags & (F_SIGN | F_BLANK))) 14540 { 14541 /* Fast path */ 14542 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) 14543 return -1; 14544 return 0; 14545 } 14546 14547 /* Truncate the string for "s", "r" and "a" formats 14548 if the precision is set */ 14549 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { 14550 if (arg->prec >= 0 && len > arg->prec) 14551 len = arg->prec; 14552 } 14553 14554 /* Adjust sign and width */ 14555 kind = PyUnicode_KIND(str); 14556 pbuf = PyUnicode_DATA(str); 14557 pindex = 0; 14558 signchar = '\0'; 14559 if (arg->sign) { 14560 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 14561 if (ch 
== '-' || ch == '+') { 14562 signchar = ch; 14563 len--; 14564 pindex++; 14565 } 14566 else if (arg->flags & F_SIGN) 14567 signchar = '+'; 14568 else if (arg->flags & F_BLANK) 14569 signchar = ' '; 14570 else 14571 arg->sign = 0; 14572 } 14573 if (arg->width < len) 14574 arg->width = len; 14575 14576 /* Prepare the writer */ 14577 maxchar = writer->maxchar; 14578 if (!(arg->flags & F_LJUST)) { 14579 if (arg->sign) { 14580 if ((arg->width-1) > len) 14581 maxchar = Py_MAX(maxchar, fill); 14582 } 14583 else { 14584 if (arg->width > len) 14585 maxchar = Py_MAX(maxchar, fill); 14586 } 14587 } 14588 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { 14589 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); 14590 maxchar = Py_MAX(maxchar, strmaxchar); 14591 } 14592 14593 buflen = arg->width; 14594 if (arg->sign && len == arg->width) 14595 buflen++; 14596 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) 14597 return -1; 14598 14599 /* Write the sign if needed */ 14600 if (arg->sign) { 14601 if (fill != ' ') { 14602 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14603 writer->pos += 1; 14604 } 14605 if (arg->width > len) 14606 arg->width--; 14607 } 14608 14609 /* Write the numeric prefix for "x", "X" and "o" formats 14610 if the alternate form is used. 14611 For example, write "0x" for the "%#x" format. 
*/ 14612 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14613 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14614 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); 14615 if (fill != ' ') { 14616 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14617 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14618 writer->pos += 2; 14619 pindex += 2; 14620 } 14621 arg->width -= 2; 14622 if (arg->width < 0) 14623 arg->width = 0; 14624 len -= 2; 14625 } 14626 14627 /* Pad left with the fill character if needed */ 14628 if (arg->width > len && !(arg->flags & F_LJUST)) { 14629 sublen = arg->width - len; 14630 FILL(writer->kind, writer->data, fill, writer->pos, sublen); 14631 writer->pos += sublen; 14632 arg->width = len; 14633 } 14634 14635 /* If padding with spaces: write sign if needed and/or numeric prefix if 14636 the alternate form is used */ 14637 if (fill == ' ') { 14638 if (arg->sign) { 14639 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14640 writer->pos += 1; 14641 } 14642 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14643 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14644 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); 14645 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14646 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14647 writer->pos += 2; 14648 pindex += 2; 14649 } 14650 } 14651 14652 /* Write characters */ 14653 if (len) { 14654 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 14655 str, pindex, len); 14656 writer->pos += len; 14657 } 14658 14659 /* Pad right with the fill character if needed */ 14660 if (arg->width > len) { 14661 sublen = arg->width - len; 14662 FILL(writer->kind, writer->data, ' ', writer->pos, sublen); 14663 writer->pos += sublen; 14664 } 14665 return 0; 14666} 14667 14668/* Helper of PyUnicode_Format(): format one arg. 
14669 Return 0 on success, raise an exception and return -1 on error. */ 14670static int 14671unicode_format_arg(struct unicode_formatter_t *ctx) 14672{ 14673 struct unicode_format_arg_t arg; 14674 PyObject *str; 14675 int ret; 14676 14677 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); 14678 arg.flags = 0; 14679 arg.width = -1; 14680 arg.prec = -1; 14681 arg.sign = 0; 14682 str = NULL; 14683 14684 ret = unicode_format_arg_parse(ctx, &arg); 14685 if (ret == -1) 14686 return -1; 14687 14688 ret = unicode_format_arg_format(ctx, &arg, &str); 14689 if (ret == -1) 14690 return -1; 14691 14692 if (ret != 1) { 14693 ret = unicode_format_arg_output(ctx, &arg, str); 14694 Py_DECREF(str); 14695 if (ret == -1) 14696 return -1; 14697 } 14698 14699 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') { 14700 PyErr_SetString(PyExc_TypeError, 14701 "not all arguments converted during string formatting"); 14702 return -1; 14703 } 14704 return 0; 14705} 14706 14707PyObject * 14708PyUnicode_Format(PyObject *format, PyObject *args) 14709{ 14710 struct unicode_formatter_t ctx; 14711 14712 if (format == NULL || args == NULL) { 14713 PyErr_BadInternalCall(); 14714 return NULL; 14715 } 14716 14717 ctx.fmtstr = PyUnicode_FromObject(format); 14718 if (ctx.fmtstr == NULL) 14719 return NULL; 14720 if (PyUnicode_READY(ctx.fmtstr) == -1) { 14721 Py_DECREF(ctx.fmtstr); 14722 return NULL; 14723 } 14724 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 14725 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 14726 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 14727 ctx.fmtpos = 0; 14728 14729 _PyUnicodeWriter_Init(&ctx.writer); 14730 ctx.writer.min_length = ctx.fmtcnt + 100; 14731 ctx.writer.overallocate = 1; 14732 14733 if (PyTuple_Check(args)) { 14734 ctx.arglen = PyTuple_Size(args); 14735 ctx.argidx = 0; 14736 } 14737 else { 14738 ctx.arglen = -1; 14739 ctx.argidx = -2; 14740 } 14741 ctx.args_owned = 0; 14742 if (PyMapping_Check(args) && !PyTuple_Check(args) && 
!PyUnicode_Check(args)) 14743 ctx.dict = args; 14744 else 14745 ctx.dict = NULL; 14746 ctx.args = args; 14747 14748 while (--ctx.fmtcnt >= 0) { 14749 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14750 Py_ssize_t nonfmtpos; 14751 14752 nonfmtpos = ctx.fmtpos++; 14753 while (ctx.fmtcnt >= 0 && 14754 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14755 ctx.fmtpos++; 14756 ctx.fmtcnt--; 14757 } 14758 if (ctx.fmtcnt < 0) { 14759 ctx.fmtpos--; 14760 ctx.writer.overallocate = 0; 14761 } 14762 14763 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, 14764 nonfmtpos, ctx.fmtpos) < 0) 14765 goto onError; 14766 } 14767 else { 14768 ctx.fmtpos++; 14769 if (unicode_format_arg(&ctx) == -1) 14770 goto onError; 14771 } 14772 } 14773 14774 if (ctx.argidx < ctx.arglen && !ctx.dict) { 14775 PyErr_SetString(PyExc_TypeError, 14776 "not all arguments converted during string formatting"); 14777 goto onError; 14778 } 14779 14780 if (ctx.args_owned) { 14781 Py_DECREF(ctx.args); 14782 } 14783 Py_DECREF(ctx.fmtstr); 14784 return _PyUnicodeWriter_Finish(&ctx.writer); 14785 14786 onError: 14787 Py_DECREF(ctx.fmtstr); 14788 _PyUnicodeWriter_Dealloc(&ctx.writer); 14789 if (ctx.args_owned) { 14790 Py_DECREF(ctx.args); 14791 } 14792 return NULL; 14793} 14794 14795static PyObject * 14796unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 14797 14798static PyObject * 14799unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14800{ 14801 PyObject *x = NULL; 14802 static char *kwlist[] = {"object", "encoding", "errors", 0}; 14803 char *encoding = NULL; 14804 char *errors = NULL; 14805 14806 if (type != &PyUnicode_Type) 14807 return unicode_subtype_new(type, args, kwds); 14808 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 14809 kwlist, &x, &encoding, &errors)) 14810 return NULL; 14811 if (x == NULL) 14812 _Py_RETURN_UNICODE_EMPTY(); 14813 if (encoding == NULL && errors == NULL) 14814 return PyObject_Str(x); 14815 
else 14816 return PyUnicode_FromEncodedObject(x, encoding, errors); 14817} 14818 14819static PyObject * 14820unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14821{ 14822 PyObject *unicode, *self; 14823 Py_ssize_t length, char_size; 14824 int share_wstr, share_utf8; 14825 unsigned int kind; 14826 void *data; 14827 14828 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 14829 14830 unicode = unicode_new(&PyUnicode_Type, args, kwds); 14831 if (unicode == NULL) 14832 return NULL; 14833 assert(_PyUnicode_CHECK(unicode)); 14834 if (PyUnicode_READY(unicode) == -1) { 14835 Py_DECREF(unicode); 14836 return NULL; 14837 } 14838 14839 self = type->tp_alloc(type, 0); 14840 if (self == NULL) { 14841 Py_DECREF(unicode); 14842 return NULL; 14843 } 14844 kind = PyUnicode_KIND(unicode); 14845 length = PyUnicode_GET_LENGTH(unicode); 14846 14847 _PyUnicode_LENGTH(self) = length; 14848#ifdef Py_DEBUG 14849 _PyUnicode_HASH(self) = -1; 14850#else 14851 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14852#endif 14853 _PyUnicode_STATE(self).interned = 0; 14854 _PyUnicode_STATE(self).kind = kind; 14855 _PyUnicode_STATE(self).compact = 0; 14856 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14857 _PyUnicode_STATE(self).ready = 1; 14858 _PyUnicode_WSTR(self) = NULL; 14859 _PyUnicode_UTF8_LENGTH(self) = 0; 14860 _PyUnicode_UTF8(self) = NULL; 14861 _PyUnicode_WSTR_LENGTH(self) = 0; 14862 _PyUnicode_DATA_ANY(self) = NULL; 14863 14864 share_utf8 = 0; 14865 share_wstr = 0; 14866 if (kind == PyUnicode_1BYTE_KIND) { 14867 char_size = 1; 14868 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14869 share_utf8 = 1; 14870 } 14871 else if (kind == PyUnicode_2BYTE_KIND) { 14872 char_size = 2; 14873 if (sizeof(wchar_t) == 2) 14874 share_wstr = 1; 14875 } 14876 else { 14877 assert(kind == PyUnicode_4BYTE_KIND); 14878 char_size = 4; 14879 if (sizeof(wchar_t) == 4) 14880 share_wstr = 1; 14881 } 14882 14883 /* Ensure we won't overflow the length. 
*/ 14884 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14885 PyErr_NoMemory(); 14886 goto onError; 14887 } 14888 data = PyObject_MALLOC((length + 1) * char_size); 14889 if (data == NULL) { 14890 PyErr_NoMemory(); 14891 goto onError; 14892 } 14893 14894 _PyUnicode_DATA_ANY(self) = data; 14895 if (share_utf8) { 14896 _PyUnicode_UTF8_LENGTH(self) = length; 14897 _PyUnicode_UTF8(self) = data; 14898 } 14899 if (share_wstr) { 14900 _PyUnicode_WSTR_LENGTH(self) = length; 14901 _PyUnicode_WSTR(self) = (wchar_t *)data; 14902 } 14903 14904 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14905 kind * (length + 1)); 14906 assert(_PyUnicode_CheckConsistency(self, 1)); 14907#ifdef Py_DEBUG 14908 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14909#endif 14910 Py_DECREF(unicode); 14911 return self; 14912 14913onError: 14914 Py_DECREF(unicode); 14915 Py_DECREF(self); 14916 return NULL; 14917} 14918 14919PyDoc_STRVAR(unicode_doc, 14920"str(object='') -> str\n\ 14921str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 14922\n\ 14923Create a new string object from the given object. 
If encoding or\n\ 14924errors is specified, then the object must expose a data buffer\n\ 14925that will be decoded using the given encoding and error handler.\n\ 14926Otherwise, returns the result of object.__str__() (if defined)\n\ 14927or repr(object).\n\ 14928encoding defaults to sys.getdefaultencoding().\n\ 14929errors defaults to 'strict'."); 14930 14931static PyObject *unicode_iter(PyObject *seq); 14932 14933PyTypeObject PyUnicode_Type = { 14934 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14935 "str", /* tp_name */ 14936 sizeof(PyUnicodeObject), /* tp_size */ 14937 0, /* tp_itemsize */ 14938 /* Slots */ 14939 (destructor)unicode_dealloc, /* tp_dealloc */ 14940 0, /* tp_print */ 14941 0, /* tp_getattr */ 14942 0, /* tp_setattr */ 14943 0, /* tp_reserved */ 14944 unicode_repr, /* tp_repr */ 14945 &unicode_as_number, /* tp_as_number */ 14946 &unicode_as_sequence, /* tp_as_sequence */ 14947 &unicode_as_mapping, /* tp_as_mapping */ 14948 (hashfunc) unicode_hash, /* tp_hash*/ 14949 0, /* tp_call*/ 14950 (reprfunc) unicode_str, /* tp_str */ 14951 PyObject_GenericGetAttr, /* tp_getattro */ 14952 0, /* tp_setattro */ 14953 0, /* tp_as_buffer */ 14954 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14955 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14956 unicode_doc, /* tp_doc */ 14957 0, /* tp_traverse */ 14958 0, /* tp_clear */ 14959 PyUnicode_RichCompare, /* tp_richcompare */ 14960 0, /* tp_weaklistoffset */ 14961 unicode_iter, /* tp_iter */ 14962 0, /* tp_iternext */ 14963 unicode_methods, /* tp_methods */ 14964 0, /* tp_members */ 14965 0, /* tp_getset */ 14966 &PyBaseObject_Type, /* tp_base */ 14967 0, /* tp_dict */ 14968 0, /* tp_descr_get */ 14969 0, /* tp_descr_set */ 14970 0, /* tp_dictoffset */ 14971 0, /* tp_init */ 14972 0, /* tp_alloc */ 14973 unicode_new, /* tp_new */ 14974 PyObject_Del, /* tp_free */ 14975}; 14976 14977/* Initialize the Unicode implementation */ 14978 14979int _PyUnicode_Init(void) 14980{ 14981 /* XXX - move this array to unicodectype.c ? 
*/ 14982 Py_UCS2 linebreak[] = { 14983 0x000A, /* LINE FEED */ 14984 0x000D, /* CARRIAGE RETURN */ 14985 0x001C, /* FILE SEPARATOR */ 14986 0x001D, /* GROUP SEPARATOR */ 14987 0x001E, /* RECORD SEPARATOR */ 14988 0x0085, /* NEXT LINE */ 14989 0x2028, /* LINE SEPARATOR */ 14990 0x2029, /* PARAGRAPH SEPARATOR */ 14991 }; 14992 14993 /* Init the implementation */ 14994 _Py_INCREF_UNICODE_EMPTY(); 14995 if (!unicode_empty) 14996 Py_FatalError("Can't create empty string"); 14997 Py_DECREF(unicode_empty); 14998 14999 if (PyType_Ready(&PyUnicode_Type) < 0) 15000 Py_FatalError("Can't initialize 'unicode'"); 15001 15002 /* initialize the linebreak bloom filter */ 15003 bloom_linebreak = make_bloom_mask( 15004 PyUnicode_2BYTE_KIND, linebreak, 15005 Py_ARRAY_LENGTH(linebreak)); 15006 15007 if (PyType_Ready(&EncodingMapType) < 0) 15008 Py_FatalError("Can't initialize encoding map type"); 15009 15010 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 15011 Py_FatalError("Can't initialize field name iterator type"); 15012 15013 if (PyType_Ready(&PyFormatterIter_Type) < 0) 15014 Py_FatalError("Can't initialize formatter iter type"); 15015 15016#ifdef HAVE_MBCS 15017 winver.dwOSVersionInfoSize = sizeof(winver); 15018 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 15019 PyErr_SetFromWindowsErr(0); 15020 return -1; 15021 } 15022#endif 15023 return 0; 15024} 15025 15026/* Finalize the Unicode implementation */ 15027 15028int 15029PyUnicode_ClearFreeList(void) 15030{ 15031 return 0; 15032} 15033 15034void 15035_PyUnicode_Fini(void) 15036{ 15037 int i; 15038 15039 Py_CLEAR(unicode_empty); 15040 15041 for (i = 0; i < 256; i++) 15042 Py_CLEAR(unicode_latin1[i]); 15043 _PyUnicode_ClearStaticStrings(); 15044 (void)PyUnicode_ClearFreeList(); 15045} 15046 15047void 15048PyUnicode_InternInPlace(PyObject **p) 15049{ 15050 PyObject *s = *p; 15051 PyObject *t; 15052#ifdef Py_DEBUG 15053 assert(s != NULL); 15054 assert(_PyUnicode_CHECK(s)); 15055#else 15056 if (s == NULL || !PyUnicode_Check(s)) 15057 
return; 15058#endif 15059 /* If it's a subclass, we don't really know what putting 15060 it in the interned dict might do. */ 15061 if (!PyUnicode_CheckExact(s)) 15062 return; 15063 if (PyUnicode_CHECK_INTERNED(s)) 15064 return; 15065 if (interned == NULL) { 15066 interned = PyDict_New(); 15067 if (interned == NULL) { 15068 PyErr_Clear(); /* Don't leave an exception */ 15069 return; 15070 } 15071 } 15072 /* It might be that the GetItem call fails even 15073 though the key is present in the dictionary, 15074 namely when this happens during a stack overflow. */ 15075 Py_ALLOW_RECURSION 15076 t = PyDict_GetItem(interned, s); 15077 Py_END_ALLOW_RECURSION 15078 15079 if (t) { 15080 Py_INCREF(t); 15081 Py_DECREF(*p); 15082 *p = t; 15083 return; 15084 } 15085 15086 PyThreadState_GET()->recursion_critical = 1; 15087 if (PyDict_SetItem(interned, s, s) < 0) { 15088 PyErr_Clear(); 15089 PyThreadState_GET()->recursion_critical = 0; 15090 return; 15091 } 15092 PyThreadState_GET()->recursion_critical = 0; 15093 /* The two references in interned are not counted by refcnt. 
15094 The deallocator will take care of this */ 15095 Py_REFCNT(s) -= 2; 15096 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 15097} 15098 15099void 15100PyUnicode_InternImmortal(PyObject **p) 15101{ 15102 PyUnicode_InternInPlace(p); 15103 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 15104 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 15105 Py_INCREF(*p); 15106 } 15107} 15108 15109PyObject * 15110PyUnicode_InternFromString(const char *cp) 15111{ 15112 PyObject *s = PyUnicode_FromString(cp); 15113 if (s == NULL) 15114 return NULL; 15115 PyUnicode_InternInPlace(&s); 15116 return s; 15117} 15118 15119void 15120_Py_ReleaseInternedUnicodeStrings(void) 15121{ 15122 PyObject *keys; 15123 PyObject *s; 15124 Py_ssize_t i, n; 15125 Py_ssize_t immortal_size = 0, mortal_size = 0; 15126 15127 if (interned == NULL || !PyDict_Check(interned)) 15128 return; 15129 keys = PyDict_Keys(interned); 15130 if (keys == NULL || !PyList_Check(keys)) { 15131 PyErr_Clear(); 15132 return; 15133 } 15134 15135 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 15136 detector, interned unicode strings are not forcibly deallocated; 15137 rather, we give them their stolen references back, and then clear 15138 and DECREF the interned dict. 
*/ 15139 15140 n = PyList_GET_SIZE(keys); 15141 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 15142 n); 15143 for (i = 0; i < n; i++) { 15144 s = PyList_GET_ITEM(keys, i); 15145 if (PyUnicode_READY(s) == -1) { 15146 assert(0 && "could not ready string"); 15147 fprintf(stderr, "could not ready string\n"); 15148 } 15149 switch (PyUnicode_CHECK_INTERNED(s)) { 15150 case SSTATE_NOT_INTERNED: 15151 /* XXX Shouldn't happen */ 15152 break; 15153 case SSTATE_INTERNED_IMMORTAL: 15154 Py_REFCNT(s) += 1; 15155 immortal_size += PyUnicode_GET_LENGTH(s); 15156 break; 15157 case SSTATE_INTERNED_MORTAL: 15158 Py_REFCNT(s) += 2; 15159 mortal_size += PyUnicode_GET_LENGTH(s); 15160 break; 15161 default: 15162 Py_FatalError("Inconsistent interned string state."); 15163 } 15164 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 15165 } 15166 fprintf(stderr, "total size of all interned strings: " 15167 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 15168 "mortal/immortal\n", mortal_size, immortal_size); 15169 Py_DECREF(keys); 15170 PyDict_Clear(interned); 15171 Py_CLEAR(interned); 15172} 15173 15174 15175/********************* Unicode Iterator **************************/ 15176 15177typedef struct { 15178 PyObject_HEAD 15179 Py_ssize_t it_index; 15180 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 15181} unicodeiterobject; 15182 15183static void 15184unicodeiter_dealloc(unicodeiterobject *it) 15185{ 15186 _PyObject_GC_UNTRACK(it); 15187 Py_XDECREF(it->it_seq); 15188 PyObject_GC_Del(it); 15189} 15190 15191static int 15192unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 15193{ 15194 Py_VISIT(it->it_seq); 15195 return 0; 15196} 15197 15198static PyObject * 15199unicodeiter_next(unicodeiterobject *it) 15200{ 15201 PyObject *seq, *item; 15202 15203 assert(it != NULL); 15204 seq = it->it_seq; 15205 if (seq == NULL) 15206 return NULL; 15207 assert(_PyUnicode_CHECK(seq)); 15208 15209 if (it->it_index < PyUnicode_GET_LENGTH(seq)) 
{ 15210 int kind = PyUnicode_KIND(seq); 15211 void *data = PyUnicode_DATA(seq); 15212 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 15213 item = PyUnicode_FromOrdinal(chr); 15214 if (item != NULL) 15215 ++it->it_index; 15216 return item; 15217 } 15218 15219 Py_DECREF(seq); 15220 it->it_seq = NULL; 15221 return NULL; 15222} 15223 15224static PyObject * 15225unicodeiter_len(unicodeiterobject *it) 15226{ 15227 Py_ssize_t len = 0; 15228 if (it->it_seq) 15229 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 15230 return PyLong_FromSsize_t(len); 15231} 15232 15233PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 15234 15235static PyObject * 15236unicodeiter_reduce(unicodeiterobject *it) 15237{ 15238 if (it->it_seq != NULL) { 15239 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 15240 it->it_seq, it->it_index); 15241 } else { 15242 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 15243 if (u == NULL) 15244 return NULL; 15245 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 15246 } 15247} 15248 15249PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 15250 15251static PyObject * 15252unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 15253{ 15254 Py_ssize_t index = PyLong_AsSsize_t(state); 15255 if (index == -1 && PyErr_Occurred()) 15256 return NULL; 15257 if (it->it_seq != NULL) { 15258 if (index < 0) 15259 index = 0; 15260 else if (index > PyUnicode_GET_LENGTH(it->it_seq)) 15261 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */ 15262 it->it_index = index; 15263 } 15264 Py_RETURN_NONE; 15265} 15266 15267PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 15268 15269static PyMethodDef unicodeiter_methods[] = { 15270 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 15271 length_hint_doc}, 15272 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 15273 reduce_doc}, 15274 {"__setstate__", 
(PyCFunction)unicodeiter_setstate, METH_O, 15275 setstate_doc}, 15276 {NULL, NULL} /* sentinel */ 15277}; 15278 15279PyTypeObject PyUnicodeIter_Type = { 15280 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15281 "str_iterator", /* tp_name */ 15282 sizeof(unicodeiterobject), /* tp_basicsize */ 15283 0, /* tp_itemsize */ 15284 /* methods */ 15285 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 15286 0, /* tp_print */ 15287 0, /* tp_getattr */ 15288 0, /* tp_setattr */ 15289 0, /* tp_reserved */ 15290 0, /* tp_repr */ 15291 0, /* tp_as_number */ 15292 0, /* tp_as_sequence */ 15293 0, /* tp_as_mapping */ 15294 0, /* tp_hash */ 15295 0, /* tp_call */ 15296 0, /* tp_str */ 15297 PyObject_GenericGetAttr, /* tp_getattro */ 15298 0, /* tp_setattro */ 15299 0, /* tp_as_buffer */ 15300 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 15301 0, /* tp_doc */ 15302 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 15303 0, /* tp_clear */ 15304 0, /* tp_richcompare */ 15305 0, /* tp_weaklistoffset */ 15306 PyObject_SelfIter, /* tp_iter */ 15307 (iternextfunc)unicodeiter_next, /* tp_iternext */ 15308 unicodeiter_methods, /* tp_methods */ 15309 0, 15310}; 15311 15312static PyObject * 15313unicode_iter(PyObject *seq) 15314{ 15315 unicodeiterobject *it; 15316 15317 if (!PyUnicode_Check(seq)) { 15318 PyErr_BadInternalCall(); 15319 return NULL; 15320 } 15321 if (PyUnicode_READY(seq) == -1) 15322 return NULL; 15323 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 15324 if (it == NULL) 15325 return NULL; 15326 it->it_index = 0; 15327 Py_INCREF(seq); 15328 it->it_seq = seq; 15329 _PyObject_GC_TRACK(it); 15330 return (PyObject *)it; 15331} 15332 15333 15334size_t 15335Py_UNICODE_strlen(const Py_UNICODE *u) 15336{ 15337 int res = 0; 15338 while(*u++) 15339 res++; 15340 return res; 15341} 15342 15343Py_UNICODE* 15344Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 15345{ 15346 Py_UNICODE *u = s1; 15347 while ((*u++ = *s2++)); 15348 return s1; 15349} 15350 
15351Py_UNICODE* 15352Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15353{ 15354 Py_UNICODE *u = s1; 15355 while ((*u++ = *s2++)) 15356 if (n-- == 0) 15357 break; 15358 return s1; 15359} 15360 15361Py_UNICODE* 15362Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 15363{ 15364 Py_UNICODE *u1 = s1; 15365 u1 += Py_UNICODE_strlen(u1); 15366 Py_UNICODE_strcpy(u1, s2); 15367 return s1; 15368} 15369 15370int 15371Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 15372{ 15373 while (*s1 && *s2 && *s1 == *s2) 15374 s1++, s2++; 15375 if (*s1 && *s2) 15376 return (*s1 < *s2) ? -1 : +1; 15377 if (*s1) 15378 return 1; 15379 if (*s2) 15380 return -1; 15381 return 0; 15382} 15383 15384int 15385Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15386{ 15387 Py_UNICODE u1, u2; 15388 for (; n != 0; n--) { 15389 u1 = *s1; 15390 u2 = *s2; 15391 if (u1 != u2) 15392 return (u1 < u2) ? -1 : +1; 15393 if (u1 == '\0') 15394 return 0; 15395 s1++; 15396 s2++; 15397 } 15398 return 0; 15399} 15400 15401Py_UNICODE* 15402Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 15403{ 15404 const Py_UNICODE *p; 15405 for (p = s; *p; p++) 15406 if (*p == c) 15407 return (Py_UNICODE*)p; 15408 return NULL; 15409} 15410 15411Py_UNICODE* 15412Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 15413{ 15414 const Py_UNICODE *p; 15415 p = s + Py_UNICODE_strlen(s); 15416 while (p != s) { 15417 p--; 15418 if (*p == c) 15419 return (Py_UNICODE*)p; 15420 } 15421 return NULL; 15422} 15423 15424Py_UNICODE* 15425PyUnicode_AsUnicodeCopy(PyObject *unicode) 15426{ 15427 Py_UNICODE *u, *copy; 15428 Py_ssize_t len, size; 15429 15430 if (!PyUnicode_Check(unicode)) { 15431 PyErr_BadArgument(); 15432 return NULL; 15433 } 15434 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 15435 if (u == NULL) 15436 return NULL; 15437 /* Ensure we won't overflow the size. 
*/ 15438 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 15439 PyErr_NoMemory(); 15440 return NULL; 15441 } 15442 size = len + 1; /* copy the null character */ 15443 size *= sizeof(Py_UNICODE); 15444 copy = PyMem_Malloc(size); 15445 if (copy == NULL) { 15446 PyErr_NoMemory(); 15447 return NULL; 15448 } 15449 memcpy(copy, u, size); 15450 return copy; 15451} 15452 15453/* A _string module, to export formatter_parser and formatter_field_name_split 15454 to the string.Formatter class implemented in Python. */ 15455 15456static PyMethodDef _string_methods[] = { 15457 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 15458 METH_O, PyDoc_STR("split the argument as a field name")}, 15459 {"formatter_parser", (PyCFunction) formatter_parser, 15460 METH_O, PyDoc_STR("parse the argument as a format string")}, 15461 {NULL, NULL} 15462}; 15463 15464static struct PyModuleDef _string_module = { 15465 PyModuleDef_HEAD_INIT, 15466 "_string", 15467 PyDoc_STR("string helper module"), 15468 0, 15469 _string_methods, 15470 NULL, 15471 NULL, 15472 NULL, 15473 NULL 15474}; 15475 15476PyMODINIT_FUNC 15477PyInit__string(void) 15478{ 15479 return PyModule_Create(&_string_module); 15480} 15481 15482 15483#ifdef __cplusplus 15484} 15485#endif 15486