unicodeobject.c revision ea71a525c34784d188252947f497ed251f9d4d5c
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/* --- Globals ------------------------------------------------------------ 51 52NOTE: In the interpreter's initialization phase, some globals are currently 53 initialized dynamically as needed. In the process Unicode objects may 54 be created before the Unicode type is ready. 55 56*/ 57 58 59#ifdef __cplusplus 60extern "C" { 61#endif 62 63/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 64#define MAX_UNICODE 0x10ffff 65 66#ifdef Py_DEBUG 67# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 68#else 69# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 70#endif 71 72#define _PyUnicode_UTF8(op) \ 73 (((PyCompactUnicodeObject*)(op))->utf8) 74#define PyUnicode_UTF8(op) \ 75 (assert(_PyUnicode_CHECK(op)), \ 76 assert(PyUnicode_IS_READY(op)), \ 77 PyUnicode_IS_COMPACT_ASCII(op) ? \ 78 ((char*)((PyASCIIObject*)(op) + 1)) : \ 79 _PyUnicode_UTF8(op)) 80#define _PyUnicode_UTF8_LENGTH(op) \ 81 (((PyCompactUnicodeObject*)(op))->utf8_length) 82#define PyUnicode_UTF8_LENGTH(op) \ 83 (assert(_PyUnicode_CHECK(op)), \ 84 assert(PyUnicode_IS_READY(op)), \ 85 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 86 ((PyASCIIObject*)(op))->length : \ 87 _PyUnicode_UTF8_LENGTH(op)) 88#define _PyUnicode_WSTR(op) \ 89 (((PyASCIIObject*)(op))->wstr) 90#define _PyUnicode_WSTR_LENGTH(op) \ 91 (((PyCompactUnicodeObject*)(op))->wstr_length) 92#define _PyUnicode_LENGTH(op) \ 93 (((PyASCIIObject *)(op))->length) 94#define _PyUnicode_STATE(op) \ 95 (((PyASCIIObject *)(op))->state) 96#define _PyUnicode_HASH(op) \ 97 (((PyASCIIObject *)(op))->hash) 98#define _PyUnicode_KIND(op) \ 99 (assert(_PyUnicode_CHECK(op)), \ 100 ((PyASCIIObject *)(op))->state.kind) 101#define _PyUnicode_GET_LENGTH(op) \ 102 (assert(_PyUnicode_CHECK(op)), \ 103 ((PyASCIIObject *)(op))->length) 104#define _PyUnicode_DATA_ANY(op) \ 105 (((PyUnicodeObject*)(op))->data.any) 106 107#undef PyUnicode_READY 108#define PyUnicode_READY(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 (PyUnicode_IS_READY(op) ? \ 111 0 : \ 112 _PyUnicode_Ready(op))) 113 114#define _PyUnicode_SHARE_UTF8(op) \ 115 (assert(_PyUnicode_CHECK(op)), \ 116 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 117 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 118#define _PyUnicode_SHARE_WSTR(op) \ 119 (assert(_PyUnicode_CHECK(op)), \ 120 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 121 122/* true if the Unicode object has an allocated UTF-8 memory block 123 (not shared with other data) */ 124#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 125 (assert(_PyUnicode_CHECK(op)), \ 126 (!PyUnicode_IS_COMPACT_ASCII(op) \ 127 && _PyUnicode_UTF8(op) \ 128 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 129 130/* true if the Unicode object has an allocated wstr memory block 131 (not shared with other data) */ 132#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 133 (assert(_PyUnicode_CHECK(op)), \ 134 (_PyUnicode_WSTR(op) && \ 135 (!PyUnicode_IS_READY(op) || \ 136 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 137 138/* Generic helper macro to convert characters of different types. 
139 from_type and to_type have to be valid type names, begin and end 140 are pointers to the source characters which should be of type 141 "from_type *". to is a pointer of type "to_type *" and points to the 142 buffer where the result characters are written to. */ 143#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 144 do { \ 145 to_type *_to = (to_type *) to; \ 146 const from_type *_iter = (begin); \ 147 const from_type *_end = (end); \ 148 Py_ssize_t n = (_end) - (_iter); \ 149 const from_type *_unrolled_end = \ 150 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 151 while (_iter < (_unrolled_end)) { \ 152 _to[0] = (to_type) _iter[0]; \ 153 _to[1] = (to_type) _iter[1]; \ 154 _to[2] = (to_type) _iter[2]; \ 155 _to[3] = (to_type) _iter[3]; \ 156 _iter += 4; _to += 4; \ 157 } \ 158 while (_iter < (_end)) \ 159 *_to++ = (to_type) *_iter++; \ 160 } while (0) 161 162/* This dictionary holds all interned unicode strings. Note that references 163 to strings in this dictionary are *not* counted in the string's ob_refcnt. 164 When the interned string reaches a refcnt of 0 the string deallocation 165 function will delete the reference from this dictionary. 166 167 Another way to look at this is that to say that the actual reference 168 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 169*/ 170static PyObject *interned = NULL; 171 172/* The empty Unicode object is shared to improve performance. 
*/ 173static PyObject *unicode_empty = NULL; 174 175#define _Py_INCREF_UNICODE_EMPTY() \ 176 do { \ 177 if (unicode_empty != NULL) \ 178 Py_INCREF(unicode_empty); \ 179 else { \ 180 unicode_empty = PyUnicode_New(0, 0); \ 181 if (unicode_empty != NULL) { \ 182 Py_INCREF(unicode_empty); \ 183 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ 184 } \ 185 } \ 186 } while (0) 187 188#define _Py_RETURN_UNICODE_EMPTY() \ 189 do { \ 190 _Py_INCREF_UNICODE_EMPTY(); \ 191 return unicode_empty; \ 192 } while (0) 193 194/* Forward declaration */ 195Py_LOCAL_INLINE(int) 196_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); 197 198/* List of static strings. */ 199static _Py_Identifier *static_strings = NULL; 200 201/* Single character Unicode strings in the Latin-1 range are being 202 shared as well. */ 203static PyObject *unicode_latin1[256] = {NULL}; 204 205/* Fast detection of the most frequent whitespace characters */ 206const unsigned char _Py_ascii_whitespace[] = { 207 0, 0, 0, 0, 0, 0, 0, 0, 208/* case 0x0009: * CHARACTER TABULATION */ 209/* case 0x000A: * LINE FEED */ 210/* case 0x000B: * LINE TABULATION */ 211/* case 0x000C: * FORM FEED */ 212/* case 0x000D: * CARRIAGE RETURN */ 213 0, 1, 1, 1, 1, 1, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0, 215/* case 0x001C: * FILE SEPARATOR */ 216/* case 0x001D: * GROUP SEPARATOR */ 217/* case 0x001E: * RECORD SEPARATOR */ 218/* case 0x001F: * UNIT SEPARATOR */ 219 0, 0, 0, 0, 1, 1, 1, 1, 220/* case 0x0020: * SPACE */ 221 1, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 0, 0, 0, 0, 0, 0, 0, 0, 225 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 0, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, 230 0, 0, 0, 0, 0, 0, 0, 0, 231 0, 0, 0, 0, 0, 0, 0, 0, 232 0, 0, 0, 0, 0, 0, 0, 0, 233 0, 0, 0, 0, 0, 0, 0, 0 234}; 235 236/* forward */ 237static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 238static PyObject* get_latin1_char(unsigned char ch); 239static int 
unicode_modifiable(PyObject *unicode); 240 241 242static PyObject * 243_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size); 244static PyObject * 245_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 246static PyObject * 247_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 248 249static PyObject * 250unicode_encode_call_errorhandler(const char *errors, 251 PyObject **errorHandler,const char *encoding, const char *reason, 252 PyObject *unicode, PyObject **exceptionObject, 253 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 254 255static void 256raise_encode_exception(PyObject **exceptionObject, 257 const char *encoding, 258 PyObject *unicode, 259 Py_ssize_t startpos, Py_ssize_t endpos, 260 const char *reason); 261 262/* Same for linebreaks */ 263static unsigned char ascii_linebreak[] = { 264 0, 0, 0, 0, 0, 0, 0, 0, 265/* 0x000A, * LINE FEED */ 266/* 0x000B, * LINE TABULATION */ 267/* 0x000C, * FORM FEED */ 268/* 0x000D, * CARRIAGE RETURN */ 269 0, 0, 1, 1, 1, 1, 0, 0, 270 0, 0, 0, 0, 0, 0, 0, 0, 271/* 0x001C, * FILE SEPARATOR */ 272/* 0x001D, * GROUP SEPARATOR */ 273/* 0x001E, * RECORD SEPARATOR */ 274 0, 0, 0, 0, 1, 1, 1, 0, 275 0, 0, 0, 0, 0, 0, 0, 0, 276 0, 0, 0, 0, 0, 0, 0, 0, 277 0, 0, 0, 0, 0, 0, 0, 0, 278 0, 0, 0, 0, 0, 0, 0, 0, 279 280 0, 0, 0, 0, 0, 0, 0, 0, 281 0, 0, 0, 0, 0, 0, 0, 0, 282 0, 0, 0, 0, 0, 0, 0, 0, 283 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0, 285 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 287 0, 0, 0, 0, 0, 0, 0, 0 288}; 289 290/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 291 This function is kept for backward compatibility with the old API. */ 292Py_UNICODE 293PyUnicode_GetMax(void) 294{ 295#ifdef Py_UNICODE_WIDE 296 return 0x10FFFF; 297#else 298 /* This is actually an illegal character, so it should 299 not be passed to unichr. 
*/ 300 return 0xFFFF; 301#endif 302} 303 304#ifdef Py_DEBUG 305int 306_PyUnicode_CheckConsistency(PyObject *op, int check_content) 307{ 308 PyASCIIObject *ascii; 309 unsigned int kind; 310 311 assert(PyUnicode_Check(op)); 312 313 ascii = (PyASCIIObject *)op; 314 kind = ascii->state.kind; 315 316 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 317 assert(kind == PyUnicode_1BYTE_KIND); 318 assert(ascii->state.ready == 1); 319 } 320 else { 321 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 322 void *data; 323 324 if (ascii->state.compact == 1) { 325 data = compact + 1; 326 assert(kind == PyUnicode_1BYTE_KIND 327 || kind == PyUnicode_2BYTE_KIND 328 || kind == PyUnicode_4BYTE_KIND); 329 assert(ascii->state.ascii == 0); 330 assert(ascii->state.ready == 1); 331 assert (compact->utf8 != data); 332 } 333 else { 334 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 335 336 data = unicode->data.any; 337 if (kind == PyUnicode_WCHAR_KIND) { 338 assert(ascii->length == 0); 339 assert(ascii->hash == -1); 340 assert(ascii->state.compact == 0); 341 assert(ascii->state.ascii == 0); 342 assert(ascii->state.ready == 0); 343 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 344 assert(ascii->wstr != NULL); 345 assert(data == NULL); 346 assert(compact->utf8 == NULL); 347 } 348 else { 349 assert(kind == PyUnicode_1BYTE_KIND 350 || kind == PyUnicode_2BYTE_KIND 351 || kind == PyUnicode_4BYTE_KIND); 352 assert(ascii->state.compact == 0); 353 assert(ascii->state.ready == 1); 354 assert(data != NULL); 355 if (ascii->state.ascii) { 356 assert (compact->utf8 == data); 357 assert (compact->utf8_length == ascii->length); 358 } 359 else 360 assert (compact->utf8 != data); 361 } 362 } 363 if (kind != PyUnicode_WCHAR_KIND) { 364 if ( 365#if SIZEOF_WCHAR_T == 2 366 kind == PyUnicode_2BYTE_KIND 367#else 368 kind == PyUnicode_4BYTE_KIND 369#endif 370 ) 371 { 372 assert(ascii->wstr == data); 373 assert(compact->wstr_length == ascii->length); 374 } else 375 
assert(ascii->wstr != data); 376 } 377 378 if (compact->utf8 == NULL) 379 assert(compact->utf8_length == 0); 380 if (ascii->wstr == NULL) 381 assert(compact->wstr_length == 0); 382 } 383 /* check that the best kind is used */ 384 if (check_content && kind != PyUnicode_WCHAR_KIND) 385 { 386 Py_ssize_t i; 387 Py_UCS4 maxchar = 0; 388 void *data; 389 Py_UCS4 ch; 390 391 data = PyUnicode_DATA(ascii); 392 for (i=0; i < ascii->length; i++) 393 { 394 ch = PyUnicode_READ(kind, data, i); 395 if (ch > maxchar) 396 maxchar = ch; 397 } 398 if (kind == PyUnicode_1BYTE_KIND) { 399 if (ascii->state.ascii == 0) { 400 assert(maxchar >= 128); 401 assert(maxchar <= 255); 402 } 403 else 404 assert(maxchar < 128); 405 } 406 else if (kind == PyUnicode_2BYTE_KIND) { 407 assert(maxchar >= 0x100); 408 assert(maxchar <= 0xFFFF); 409 } 410 else { 411 assert(maxchar >= 0x10000); 412 assert(maxchar <= MAX_UNICODE); 413 } 414 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 415 } 416 return 1; 417} 418#endif 419 420static PyObject* 421unicode_result_wchar(PyObject *unicode) 422{ 423#ifndef Py_DEBUG 424 Py_ssize_t len; 425 426 len = _PyUnicode_WSTR_LENGTH(unicode); 427 if (len == 0) { 428 Py_DECREF(unicode); 429 _Py_RETURN_UNICODE_EMPTY(); 430 } 431 432 if (len == 1) { 433 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 434 if ((Py_UCS4)ch < 256) { 435 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 436 Py_DECREF(unicode); 437 return latin1_char; 438 } 439 } 440 441 if (_PyUnicode_Ready(unicode) < 0) { 442 Py_DECREF(unicode); 443 return NULL; 444 } 445#else 446 assert(Py_REFCNT(unicode) == 1); 447 448 /* don't make the result ready in debug mode to ensure that the caller 449 makes the string ready before using it */ 450 assert(_PyUnicode_CheckConsistency(unicode, 1)); 451#endif 452 return unicode; 453} 454 455static PyObject* 456unicode_result_ready(PyObject *unicode) 457{ 458 Py_ssize_t length; 459 460 length = PyUnicode_GET_LENGTH(unicode); 461 if (length == 0) { 462 if (unicode 
!= unicode_empty) { 463 Py_DECREF(unicode); 464 _Py_RETURN_UNICODE_EMPTY(); 465 } 466 return unicode_empty; 467 } 468 469 if (length == 1) { 470 void *data = PyUnicode_DATA(unicode); 471 int kind = PyUnicode_KIND(unicode); 472 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 473 if (ch < 256) { 474 PyObject *latin1_char = unicode_latin1[ch]; 475 if (latin1_char != NULL) { 476 if (unicode != latin1_char) { 477 Py_INCREF(latin1_char); 478 Py_DECREF(unicode); 479 } 480 return latin1_char; 481 } 482 else { 483 assert(_PyUnicode_CheckConsistency(unicode, 1)); 484 Py_INCREF(unicode); 485 unicode_latin1[ch] = unicode; 486 return unicode; 487 } 488 } 489 } 490 491 assert(_PyUnicode_CheckConsistency(unicode, 1)); 492 return unicode; 493} 494 495static PyObject* 496unicode_result(PyObject *unicode) 497{ 498 assert(_PyUnicode_CHECK(unicode)); 499 if (PyUnicode_IS_READY(unicode)) 500 return unicode_result_ready(unicode); 501 else 502 return unicode_result_wchar(unicode); 503} 504 505static PyObject* 506unicode_result_unchanged(PyObject *unicode) 507{ 508 if (PyUnicode_CheckExact(unicode)) { 509 if (PyUnicode_READY(unicode) == -1) 510 return NULL; 511 Py_INCREF(unicode); 512 return unicode; 513 } 514 else 515 /* Subtype -- return genuine unicode string with the same value. */ 516 return _PyUnicode_Copy(unicode); 517} 518 519#ifdef HAVE_MBCS 520static OSVERSIONINFOEX winver; 521#endif 522 523/* --- Bloom Filters ----------------------------------------------------- */ 524 525/* stuff to implement simple "bloom filters" for Unicode characters. 526 to keep things simple, we use a single bitmask, using the least 5 527 bits from each unicode characters as the bit index. 
*/ 528 529/* the linebreak mask is set up by Unicode_Init below */ 530 531#if LONG_BIT >= 128 532#define BLOOM_WIDTH 128 533#elif LONG_BIT >= 64 534#define BLOOM_WIDTH 64 535#elif LONG_BIT >= 32 536#define BLOOM_WIDTH 32 537#else 538#error "LONG_BIT is smaller than 32" 539#endif 540 541#define BLOOM_MASK unsigned long 542 543static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 544 545#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 546 547#define BLOOM_LINEBREAK(ch) \ 548 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 549 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 550 551Py_LOCAL_INLINE(BLOOM_MASK) 552make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 553{ 554#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 555 do { \ 556 TYPE *data = (TYPE *)PTR; \ 557 TYPE *end = data + LEN; \ 558 Py_UCS4 ch; \ 559 for (; data != end; data++) { \ 560 ch = *data; \ 561 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 562 } \ 563 break; \ 564 } while (0) 565 566 /* calculate simple bloom-style bitmask for a given unicode string */ 567 568 BLOOM_MASK mask; 569 570 mask = 0; 571 switch (kind) { 572 case PyUnicode_1BYTE_KIND: 573 BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 574 break; 575 case PyUnicode_2BYTE_KIND: 576 BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 577 break; 578 case PyUnicode_4BYTE_KIND: 579 BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 580 break; 581 default: 582 assert(0); 583 } 584 return mask; 585 586#undef BLOOM_UPDATE 587} 588 589/* Compilation of templated routines */ 590 591#include "stringlib/asciilib.h" 592#include "stringlib/fastsearch.h" 593#include "stringlib/partition.h" 594#include "stringlib/split.h" 595#include "stringlib/count.h" 596#include "stringlib/find.h" 597#include "stringlib/find_max_char.h" 598#include "stringlib/localeutil.h" 599#include "stringlib/undef.h" 600 601#include "stringlib/ucs1lib.h" 602#include "stringlib/fastsearch.h" 603#include "stringlib/partition.h" 604#include "stringlib/split.h" 605#include "stringlib/count.h" 
606#include "stringlib/find.h" 607#include "stringlib/replace.h" 608#include "stringlib/find_max_char.h" 609#include "stringlib/localeutil.h" 610#include "stringlib/undef.h" 611 612#include "stringlib/ucs2lib.h" 613#include "stringlib/fastsearch.h" 614#include "stringlib/partition.h" 615#include "stringlib/split.h" 616#include "stringlib/count.h" 617#include "stringlib/find.h" 618#include "stringlib/replace.h" 619#include "stringlib/find_max_char.h" 620#include "stringlib/localeutil.h" 621#include "stringlib/undef.h" 622 623#include "stringlib/ucs4lib.h" 624#include "stringlib/fastsearch.h" 625#include "stringlib/partition.h" 626#include "stringlib/split.h" 627#include "stringlib/count.h" 628#include "stringlib/find.h" 629#include "stringlib/replace.h" 630#include "stringlib/find_max_char.h" 631#include "stringlib/localeutil.h" 632#include "stringlib/undef.h" 633 634#include "stringlib/unicodedefs.h" 635#include "stringlib/fastsearch.h" 636#include "stringlib/count.h" 637#include "stringlib/find.h" 638#include "stringlib/undef.h" 639 640/* --- Unicode Object ----------------------------------------------------- */ 641 642static PyObject * 643fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 644 645Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 646 Py_ssize_t size, Py_UCS4 ch, 647 int direction) 648{ 649 int mode = (direction == 1) ? 
FAST_SEARCH : FAST_RSEARCH; 650 651 switch (kind) { 652 case PyUnicode_1BYTE_KIND: 653 { 654 Py_UCS1 ch1 = (Py_UCS1) ch; 655 if (ch1 == ch) 656 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 657 else 658 return -1; 659 } 660 case PyUnicode_2BYTE_KIND: 661 { 662 Py_UCS2 ch2 = (Py_UCS2) ch; 663 if (ch2 == ch) 664 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 665 else 666 return -1; 667 } 668 case PyUnicode_4BYTE_KIND: 669 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 670 default: 671 assert(0); 672 return -1; 673 } 674} 675 676#ifdef Py_DEBUG 677/* Fill the data of an Unicode string with invalid characters to detect bugs 678 earlier. 679 680 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 681 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 682 invalid character in Unicode 6.0. */ 683static void 684unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 685{ 686 int kind = PyUnicode_KIND(unicode); 687 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 688 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 689 if (length <= old_length) 690 return; 691 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 692} 693#endif 694 695static PyObject* 696resize_compact(PyObject *unicode, Py_ssize_t length) 697{ 698 Py_ssize_t char_size; 699 Py_ssize_t struct_size; 700 Py_ssize_t new_size; 701 int share_wstr; 702 PyObject *new_unicode; 703#ifdef Py_DEBUG 704 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 705#endif 706 707 assert(unicode_modifiable(unicode)); 708 assert(PyUnicode_IS_READY(unicode)); 709 assert(PyUnicode_IS_COMPACT(unicode)); 710 711 char_size = PyUnicode_KIND(unicode); 712 if (PyUnicode_IS_ASCII(unicode)) 713 struct_size = sizeof(PyASCIIObject); 714 else 715 struct_size = sizeof(PyCompactUnicodeObject); 716 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 717 718 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 719 
PyErr_NoMemory(); 720 return NULL; 721 } 722 new_size = (struct_size + (length + 1) * char_size); 723 724 _Py_DEC_REFTOTAL; 725 _Py_ForgetReference(unicode); 726 727 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 728 if (new_unicode == NULL) { 729 _Py_NewReference(unicode); 730 PyErr_NoMemory(); 731 return NULL; 732 } 733 unicode = new_unicode; 734 _Py_NewReference(unicode); 735 736 _PyUnicode_LENGTH(unicode) = length; 737 if (share_wstr) { 738 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 739 if (!PyUnicode_IS_ASCII(unicode)) 740 _PyUnicode_WSTR_LENGTH(unicode) = length; 741 } 742 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 743 PyObject_DEL(_PyUnicode_WSTR(unicode)); 744 _PyUnicode_WSTR(unicode) = NULL; 745 } 746#ifdef Py_DEBUG 747 unicode_fill_invalid(unicode, old_length); 748#endif 749 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 750 length, 0); 751 assert(_PyUnicode_CheckConsistency(unicode, 0)); 752 return unicode; 753} 754 755static int 756resize_inplace(PyObject *unicode, Py_ssize_t length) 757{ 758 wchar_t *wstr; 759 Py_ssize_t new_size; 760 assert(!PyUnicode_IS_COMPACT(unicode)); 761 assert(Py_REFCNT(unicode) == 1); 762 763 if (PyUnicode_IS_READY(unicode)) { 764 Py_ssize_t char_size; 765 int share_wstr, share_utf8; 766 void *data; 767#ifdef Py_DEBUG 768 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 769#endif 770 771 data = _PyUnicode_DATA_ANY(unicode); 772 char_size = PyUnicode_KIND(unicode); 773 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 774 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 775 776 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 777 PyErr_NoMemory(); 778 return -1; 779 } 780 new_size = (length + 1) * char_size; 781 782 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 783 { 784 PyObject_DEL(_PyUnicode_UTF8(unicode)); 785 _PyUnicode_UTF8(unicode) = NULL; 786 _PyUnicode_UTF8_LENGTH(unicode) = 0; 787 } 788 789 data = (PyObject *)PyObject_REALLOC(data, new_size); 790 if (data == 
NULL) { 791 PyErr_NoMemory(); 792 return -1; 793 } 794 _PyUnicode_DATA_ANY(unicode) = data; 795 if (share_wstr) { 796 _PyUnicode_WSTR(unicode) = data; 797 _PyUnicode_WSTR_LENGTH(unicode) = length; 798 } 799 if (share_utf8) { 800 _PyUnicode_UTF8(unicode) = data; 801 _PyUnicode_UTF8_LENGTH(unicode) = length; 802 } 803 _PyUnicode_LENGTH(unicode) = length; 804 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 805#ifdef Py_DEBUG 806 unicode_fill_invalid(unicode, old_length); 807#endif 808 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 809 assert(_PyUnicode_CheckConsistency(unicode, 0)); 810 return 0; 811 } 812 } 813 assert(_PyUnicode_WSTR(unicode) != NULL); 814 815 /* check for integer overflow */ 816 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 817 PyErr_NoMemory(); 818 return -1; 819 } 820 new_size = sizeof(wchar_t) * (length + 1); 821 wstr = _PyUnicode_WSTR(unicode); 822 wstr = PyObject_REALLOC(wstr, new_size); 823 if (!wstr) { 824 PyErr_NoMemory(); 825 return -1; 826 } 827 _PyUnicode_WSTR(unicode) = wstr; 828 _PyUnicode_WSTR(unicode)[length] = 0; 829 _PyUnicode_WSTR_LENGTH(unicode) = length; 830 assert(_PyUnicode_CheckConsistency(unicode, 0)); 831 return 0; 832} 833 834static PyObject* 835resize_copy(PyObject *unicode, Py_ssize_t length) 836{ 837 Py_ssize_t copy_length; 838 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 839 PyObject *copy; 840 841 if (PyUnicode_READY(unicode) == -1) 842 return NULL; 843 844 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 845 if (copy == NULL) 846 return NULL; 847 848 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 849 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 850 return copy; 851 } 852 else { 853 PyObject *w; 854 855 w = (PyObject*)_PyUnicode_New(length); 856 if (w == NULL) 857 return NULL; 858 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 859 copy_length = Py_MIN(copy_length, length); 860 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 
861 copy_length * sizeof(wchar_t)); 862 return w; 863 } 864} 865 866/* We allocate one more byte to make sure the string is 867 Ux0000 terminated; some code (e.g. new_identifier) 868 relies on that. 869 870 XXX This allocator could further be enhanced by assuring that the 871 free list never reduces its size below 1. 872 873*/ 874 875static PyUnicodeObject * 876_PyUnicode_New(Py_ssize_t length) 877{ 878 register PyUnicodeObject *unicode; 879 size_t new_size; 880 881 /* Optimization for empty strings */ 882 if (length == 0 && unicode_empty != NULL) { 883 Py_INCREF(unicode_empty); 884 return (PyUnicodeObject*)unicode_empty; 885 } 886 887 /* Ensure we won't overflow the size. */ 888 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 889 return (PyUnicodeObject *)PyErr_NoMemory(); 890 } 891 if (length < 0) { 892 PyErr_SetString(PyExc_SystemError, 893 "Negative size passed to _PyUnicode_New"); 894 return NULL; 895 } 896 897 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 898 if (unicode == NULL) 899 return NULL; 900 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 901 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 902 if (!_PyUnicode_WSTR(unicode)) { 903 Py_DECREF(unicode); 904 PyErr_NoMemory(); 905 return NULL; 906 } 907 908 /* Initialize the first element to guard against cases where 909 * the caller fails before initializing str -- unicode_resize() 910 * reads str[0], and the Keep-Alive optimization can keep memory 911 * allocated for str alive across a call to unicode_dealloc(unicode). 912 * We don't want unicode_resize to read uninitialized memory in 913 * that case. 
914 */ 915 _PyUnicode_WSTR(unicode)[0] = 0; 916 _PyUnicode_WSTR(unicode)[length] = 0; 917 _PyUnicode_WSTR_LENGTH(unicode) = length; 918 _PyUnicode_HASH(unicode) = -1; 919 _PyUnicode_STATE(unicode).interned = 0; 920 _PyUnicode_STATE(unicode).kind = 0; 921 _PyUnicode_STATE(unicode).compact = 0; 922 _PyUnicode_STATE(unicode).ready = 0; 923 _PyUnicode_STATE(unicode).ascii = 0; 924 _PyUnicode_DATA_ANY(unicode) = NULL; 925 _PyUnicode_LENGTH(unicode) = 0; 926 _PyUnicode_UTF8(unicode) = NULL; 927 _PyUnicode_UTF8_LENGTH(unicode) = 0; 928 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 929 return unicode; 930} 931 932static const char* 933unicode_kind_name(PyObject *unicode) 934{ 935 /* don't check consistency: unicode_kind_name() is called from 936 _PyUnicode_Dump() */ 937 if (!PyUnicode_IS_COMPACT(unicode)) 938 { 939 if (!PyUnicode_IS_READY(unicode)) 940 return "wstr"; 941 switch (PyUnicode_KIND(unicode)) 942 { 943 case PyUnicode_1BYTE_KIND: 944 if (PyUnicode_IS_ASCII(unicode)) 945 return "legacy ascii"; 946 else 947 return "legacy latin1"; 948 case PyUnicode_2BYTE_KIND: 949 return "legacy UCS2"; 950 case PyUnicode_4BYTE_KIND: 951 return "legacy UCS4"; 952 default: 953 return "<legacy invalid kind>"; 954 } 955 } 956 assert(PyUnicode_IS_READY(unicode)); 957 switch (PyUnicode_KIND(unicode)) { 958 case PyUnicode_1BYTE_KIND: 959 if (PyUnicode_IS_ASCII(unicode)) 960 return "ascii"; 961 else 962 return "latin1"; 963 case PyUnicode_2BYTE_KIND: 964 return "UCS2"; 965 case PyUnicode_4BYTE_KIND: 966 return "UCS4"; 967 default: 968 return "<invalid compact kind>"; 969 } 970} 971 972#ifdef Py_DEBUG 973/* Functions wrapping macros for use in debugger */ 974char *_PyUnicode_utf8(void *unicode){ 975 return PyUnicode_UTF8(unicode); 976} 977 978void *_PyUnicode_compact_data(void *unicode) { 979 return _PyUnicode_COMPACT_DATA(unicode); 980} 981void *_PyUnicode_data(void *unicode){ 982 printf("obj %p\n", unicode); 983 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 984 
printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 985 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 986 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 987 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 988 return PyUnicode_DATA(unicode); 989} 990 991void 992_PyUnicode_Dump(PyObject *op) 993{ 994 PyASCIIObject *ascii = (PyASCIIObject *)op; 995 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 996 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 997 void *data; 998 999 if (ascii->state.compact) 1000 { 1001 if (ascii->state.ascii) 1002 data = (ascii + 1); 1003 else 1004 data = (compact + 1); 1005 } 1006 else 1007 data = unicode->data.any; 1008 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 1009 1010 if (ascii->wstr == data) 1011 printf("shared "); 1012 printf("wstr=%p", ascii->wstr); 1013 1014 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 1015 printf(" (%zu), ", compact->wstr_length); 1016 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 1017 printf("shared "); 1018 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 1019 } 1020 printf(", data=%p\n", data); 1021} 1022#endif 1023 1024PyObject * 1025PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 1026{ 1027 PyObject *obj; 1028 PyCompactUnicodeObject *unicode; 1029 void *data; 1030 enum PyUnicode_Kind kind; 1031 int is_sharing, is_ascii; 1032 Py_ssize_t char_size; 1033 Py_ssize_t struct_size; 1034 1035 /* Optimization for empty strings */ 1036 if (size == 0 && unicode_empty != NULL) { 1037 Py_INCREF(unicode_empty); 1038 return unicode_empty; 1039 } 1040 1041 is_ascii = 0; 1042 is_sharing = 0; 1043 struct_size = sizeof(PyCompactUnicodeObject); 1044 if (maxchar < 128) { 1045 kind = PyUnicode_1BYTE_KIND; 1046 char_size = 1; 1047 is_ascii = 1; 1048 struct_size = sizeof(PyASCIIObject); 1049 } 1050 else if (maxchar < 256) { 1051 kind = PyUnicode_1BYTE_KIND; 1052 char_size 
= 1; 1053 } 1054 else if (maxchar < 65536) { 1055 kind = PyUnicode_2BYTE_KIND; 1056 char_size = 2; 1057 if (sizeof(wchar_t) == 2) 1058 is_sharing = 1; 1059 } 1060 else { 1061 if (maxchar > MAX_UNICODE) { 1062 PyErr_SetString(PyExc_SystemError, 1063 "invalid maximum character passed to PyUnicode_New"); 1064 return NULL; 1065 } 1066 kind = PyUnicode_4BYTE_KIND; 1067 char_size = 4; 1068 if (sizeof(wchar_t) == 4) 1069 is_sharing = 1; 1070 } 1071 1072 /* Ensure we won't overflow the size. */ 1073 if (size < 0) { 1074 PyErr_SetString(PyExc_SystemError, 1075 "Negative size passed to PyUnicode_New"); 1076 return NULL; 1077 } 1078 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1079 return PyErr_NoMemory(); 1080 1081 /* Duplicated allocation code from _PyObject_New() instead of a call to 1082 * PyObject_New() so we are able to allocate space for the object and 1083 * it's data buffer. 1084 */ 1085 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1086 if (obj == NULL) 1087 return PyErr_NoMemory(); 1088 obj = PyObject_INIT(obj, &PyUnicode_Type); 1089 if (obj == NULL) 1090 return NULL; 1091 1092 unicode = (PyCompactUnicodeObject *)obj; 1093 if (is_ascii) 1094 data = ((PyASCIIObject*)obj) + 1; 1095 else 1096 data = unicode + 1; 1097 _PyUnicode_LENGTH(unicode) = size; 1098 _PyUnicode_HASH(unicode) = -1; 1099 _PyUnicode_STATE(unicode).interned = 0; 1100 _PyUnicode_STATE(unicode).kind = kind; 1101 _PyUnicode_STATE(unicode).compact = 1; 1102 _PyUnicode_STATE(unicode).ready = 1; 1103 _PyUnicode_STATE(unicode).ascii = is_ascii; 1104 if (is_ascii) { 1105 ((char*)data)[size] = 0; 1106 _PyUnicode_WSTR(unicode) = NULL; 1107 } 1108 else if (kind == PyUnicode_1BYTE_KIND) { 1109 ((char*)data)[size] = 0; 1110 _PyUnicode_WSTR(unicode) = NULL; 1111 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1112 unicode->utf8 = NULL; 1113 unicode->utf8_length = 0; 1114 } 1115 else { 1116 unicode->utf8 = NULL; 1117 unicode->utf8_length = 0; 1118 if (kind == PyUnicode_2BYTE_KIND) 
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.

   A high surrogate immediately followed by a low surrogate is joined into
   a single code point; any other unit (lone surrogates included) is copied
   unchanged.

   `unicode` must already be a 4-byte kind string whose length equals the
   number of code points produced; this is only verified by assertions. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        /* there must still be room for one more code point */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }

    /* the output buffer must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1187} 1188 1189static int 1190_copy_characters(PyObject *to, Py_ssize_t to_start, 1191 PyObject *from, Py_ssize_t from_start, 1192 Py_ssize_t how_many, int check_maxchar) 1193{ 1194 unsigned int from_kind, to_kind; 1195 void *from_data, *to_data; 1196 1197 assert(0 <= how_many); 1198 assert(0 <= from_start); 1199 assert(0 <= to_start); 1200 assert(PyUnicode_Check(from)); 1201 assert(PyUnicode_IS_READY(from)); 1202 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1203 1204 assert(PyUnicode_Check(to)); 1205 assert(PyUnicode_IS_READY(to)); 1206 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1207 1208 if (how_many == 0) 1209 return 0; 1210 1211 from_kind = PyUnicode_KIND(from); 1212 from_data = PyUnicode_DATA(from); 1213 to_kind = PyUnicode_KIND(to); 1214 to_data = PyUnicode_DATA(to); 1215 1216#ifdef Py_DEBUG 1217 if (!check_maxchar 1218 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1219 { 1220 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1221 Py_UCS4 ch; 1222 Py_ssize_t i; 1223 for (i=0; i < how_many; i++) { 1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1225 assert(ch <= to_maxchar); 1226 } 1227 } 1228#endif 1229 1230 if (from_kind == to_kind) { 1231 if (check_maxchar 1232 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1233 { 1234 /* Writing Latin-1 characters into an ASCII string requires to 1235 check that all written characters are pure ASCII */ 1236 Py_UCS4 max_char; 1237 max_char = ucs1lib_find_max_char(from_data, 1238 (Py_UCS1*)from_data + how_many); 1239 if (max_char >= 128) 1240 return -1; 1241 } 1242 Py_MEMCPY((char*)to_data + to_kind * to_start, 1243 (char*)from_data + from_kind * from_start, 1244 to_kind * how_many); 1245 } 1246 else if (from_kind == PyUnicode_1BYTE_KIND 1247 && to_kind == PyUnicode_2BYTE_KIND) 1248 { 1249 _PyUnicode_CONVERT_BYTES( 1250 Py_UCS1, Py_UCS2, 1251 PyUnicode_1BYTE_DATA(from) + from_start, 1252 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1253 
PyUnicode_2BYTE_DATA(to) + to_start 1254 ); 1255 } 1256 else if (from_kind == PyUnicode_1BYTE_KIND 1257 && to_kind == PyUnicode_4BYTE_KIND) 1258 { 1259 _PyUnicode_CONVERT_BYTES( 1260 Py_UCS1, Py_UCS4, 1261 PyUnicode_1BYTE_DATA(from) + from_start, 1262 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1263 PyUnicode_4BYTE_DATA(to) + to_start 1264 ); 1265 } 1266 else if (from_kind == PyUnicode_2BYTE_KIND 1267 && to_kind == PyUnicode_4BYTE_KIND) 1268 { 1269 _PyUnicode_CONVERT_BYTES( 1270 Py_UCS2, Py_UCS4, 1271 PyUnicode_2BYTE_DATA(from) + from_start, 1272 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1273 PyUnicode_4BYTE_DATA(to) + to_start 1274 ); 1275 } 1276 else { 1277 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1278 1279 if (!check_maxchar) { 1280 if (from_kind == PyUnicode_2BYTE_KIND 1281 && to_kind == PyUnicode_1BYTE_KIND) 1282 { 1283 _PyUnicode_CONVERT_BYTES( 1284 Py_UCS2, Py_UCS1, 1285 PyUnicode_2BYTE_DATA(from) + from_start, 1286 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1287 PyUnicode_1BYTE_DATA(to) + to_start 1288 ); 1289 } 1290 else if (from_kind == PyUnicode_4BYTE_KIND 1291 && to_kind == PyUnicode_1BYTE_KIND) 1292 { 1293 _PyUnicode_CONVERT_BYTES( 1294 Py_UCS4, Py_UCS1, 1295 PyUnicode_4BYTE_DATA(from) + from_start, 1296 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1297 PyUnicode_1BYTE_DATA(to) + to_start 1298 ); 1299 } 1300 else if (from_kind == PyUnicode_4BYTE_KIND 1301 && to_kind == PyUnicode_2BYTE_KIND) 1302 { 1303 _PyUnicode_CONVERT_BYTES( 1304 Py_UCS4, Py_UCS2, 1305 PyUnicode_4BYTE_DATA(from) + from_start, 1306 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1307 PyUnicode_2BYTE_DATA(to) + to_start 1308 ); 1309 } 1310 else { 1311 assert(0); 1312 return -1; 1313 } 1314 } 1315 else { 1316 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1317 Py_UCS4 ch; 1318 Py_ssize_t i; 1319 1320 for (i=0; i < how_many; i++) { 1321 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1322 if (ch > 
to_maxchar) 1323 return -1; 1324 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1325 } 1326 } 1327 } 1328 return 0; 1329} 1330 1331void 1332_PyUnicode_FastCopyCharacters( 1333 PyObject *to, Py_ssize_t to_start, 1334 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1335{ 1336 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1337} 1338 1339Py_ssize_t 1340PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1341 PyObject *from, Py_ssize_t from_start, 1342 Py_ssize_t how_many) 1343{ 1344 int err; 1345 1346 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1347 PyErr_BadInternalCall(); 1348 return -1; 1349 } 1350 1351 if (PyUnicode_READY(from) == -1) 1352 return -1; 1353 if (PyUnicode_READY(to) == -1) 1354 return -1; 1355 1356 if (from_start < 0) { 1357 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1358 return -1; 1359 } 1360 if (to_start < 0) { 1361 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1362 return -1; 1363 } 1364 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1365 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1366 PyErr_Format(PyExc_SystemError, 1367 "Cannot write %zi characters at %zi " 1368 "in a string of %zi characters", 1369 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1370 return -1; 1371 } 1372 1373 if (how_many == 0) 1374 return 0; 1375 1376 if (unicode_check_modifiable(to)) 1377 return -1; 1378 1379 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1380 if (err) { 1381 PyErr_Format(PyExc_SystemError, 1382 "Cannot copy %s characters " 1383 "into a string of %s characters", 1384 unicode_kind_name(from), 1385 unicode_kind_name(to)); 1386 return -1; 1387 } 1388 return how_many; 1389} 1390 1391/* Find the maximum code point and count the number of surrogate pairs so a 1392 correct string length can be computed before converting a string to UCS4. 1393 This function counts single surrogates as a character and not as a pair. 
1394 1395 Return 0 on success, or -1 on error. */ 1396static int 1397find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1398 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1399{ 1400 const wchar_t *iter; 1401 Py_UCS4 ch; 1402 1403 assert(num_surrogates != NULL && maxchar != NULL); 1404 *num_surrogates = 0; 1405 *maxchar = 0; 1406 1407 for (iter = begin; iter < end; ) { 1408#if SIZEOF_WCHAR_T == 2 1409 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1410 && (iter+1) < end 1411 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1412 { 1413 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1414 ++(*num_surrogates); 1415 iter += 2; 1416 } 1417 else 1418#endif 1419 { 1420 ch = *iter; 1421 iter++; 1422 } 1423 if (ch > *maxchar) { 1424 *maxchar = ch; 1425 if (*maxchar > MAX_UNICODE) { 1426 PyErr_Format(PyExc_ValueError, 1427 "character U+%x is not in range [U+0000; U+10ffff]", 1428 ch); 1429 return -1; 1430 } 1431 } 1432 } 1433 return 0; 1434} 1435 1436int 1437_PyUnicode_Ready(PyObject *unicode) 1438{ 1439 wchar_t *end; 1440 Py_UCS4 maxchar = 0; 1441 Py_ssize_t num_surrogates; 1442#if SIZEOF_WCHAR_T == 2 1443 Py_ssize_t length_wo_surrogates; 1444#endif 1445 1446 /* _PyUnicode_Ready() is only intended for old-style API usage where 1447 strings were created using _PyObject_New() and where no canonical 1448 representation (the str field) has been set yet aka strings 1449 which are not yet ready. 
*/ 1450 assert(_PyUnicode_CHECK(unicode)); 1451 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1452 assert(_PyUnicode_WSTR(unicode) != NULL); 1453 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1454 assert(_PyUnicode_UTF8(unicode) == NULL); 1455 /* Actually, it should neither be interned nor be anything else: */ 1456 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1457 1458 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1459 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1460 &maxchar, &num_surrogates) == -1) 1461 return -1; 1462 1463 if (maxchar < 256) { 1464 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1465 if (!_PyUnicode_DATA_ANY(unicode)) { 1466 PyErr_NoMemory(); 1467 return -1; 1468 } 1469 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1470 _PyUnicode_WSTR(unicode), end, 1471 PyUnicode_1BYTE_DATA(unicode)); 1472 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1473 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1474 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1475 if (maxchar < 128) { 1476 _PyUnicode_STATE(unicode).ascii = 1; 1477 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1478 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1479 } 1480 else { 1481 _PyUnicode_STATE(unicode).ascii = 0; 1482 _PyUnicode_UTF8(unicode) = NULL; 1483 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1484 } 1485 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1486 _PyUnicode_WSTR(unicode) = NULL; 1487 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1488 } 1489 /* In this case we might have to convert down from 4-byte native 1490 wchar_t to 2-byte unicode. */ 1491 else if (maxchar < 65536) { 1492 assert(num_surrogates == 0 && 1493 "FindMaxCharAndNumSurrogatePairs() messed up"); 1494 1495#if SIZEOF_WCHAR_T == 2 1496 /* We can share representations and are done. 
*/ 1497 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1498 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1499 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1500 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1501 _PyUnicode_UTF8(unicode) = NULL; 1502 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1503#else 1504 /* sizeof(wchar_t) == 4 */ 1505 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1506 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1507 if (!_PyUnicode_DATA_ANY(unicode)) { 1508 PyErr_NoMemory(); 1509 return -1; 1510 } 1511 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1512 _PyUnicode_WSTR(unicode), end, 1513 PyUnicode_2BYTE_DATA(unicode)); 1514 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1515 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1516 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1517 _PyUnicode_UTF8(unicode) = NULL; 1518 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1519 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1520 _PyUnicode_WSTR(unicode) = NULL; 1521 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1522#endif 1523 } 1524 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1525 else { 1526#if SIZEOF_WCHAR_T == 2 1527 /* in case the native representation is 2-bytes, we need to allocate a 1528 new normalized 4-byte version. 
*/ 1529 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1530 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1531 if (!_PyUnicode_DATA_ANY(unicode)) { 1532 PyErr_NoMemory(); 1533 return -1; 1534 } 1535 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1536 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1537 _PyUnicode_UTF8(unicode) = NULL; 1538 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1539 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1540 _PyUnicode_STATE(unicode).ready = 1; 1541 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1542 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1543 _PyUnicode_WSTR(unicode) = NULL; 1544 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1545#else 1546 assert(num_surrogates == 0); 1547 1548 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1549 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1550 _PyUnicode_UTF8(unicode) = NULL; 1551 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1552 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1553#endif 1554 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1555 } 1556 _PyUnicode_STATE(unicode).ready = 1; 1557 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1558 return 0; 1559} 1560 1561static void 1562unicode_dealloc(register PyObject *unicode) 1563{ 1564 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1565 case SSTATE_NOT_INTERNED: 1566 break; 1567 1568 case SSTATE_INTERNED_MORTAL: 1569 /* revive dead object temporarily for DelItem */ 1570 Py_REFCNT(unicode) = 3; 1571 if (PyDict_DelItem(interned, unicode) != 0) 1572 Py_FatalError( 1573 "deletion of interned string failed"); 1574 break; 1575 1576 case SSTATE_INTERNED_IMMORTAL: 1577 Py_FatalError("Immortal interned string died."); 1578 1579 default: 1580 Py_FatalError("Inconsistent interned string state."); 1581 } 1582 1583 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1584 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1585 if 
(_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1586 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1587 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1588 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1589 1590 Py_TYPE(unicode)->tp_free(unicode); 1591} 1592 1593#ifdef Py_DEBUG 1594static int 1595unicode_is_singleton(PyObject *unicode) 1596{ 1597 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1598 if (unicode == unicode_empty) 1599 return 1; 1600 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1601 { 1602 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1603 if (ch < 256 && unicode_latin1[ch] == unicode) 1604 return 1; 1605 } 1606 return 0; 1607} 1608#endif 1609 1610static int 1611unicode_modifiable(PyObject *unicode) 1612{ 1613 assert(_PyUnicode_CHECK(unicode)); 1614 if (Py_REFCNT(unicode) != 1) 1615 return 0; 1616 if (_PyUnicode_HASH(unicode) != -1) 1617 return 0; 1618 if (PyUnicode_CHECK_INTERNED(unicode)) 1619 return 0; 1620 if (!PyUnicode_CheckExact(unicode)) 1621 return 0; 1622#ifdef Py_DEBUG 1623 /* singleton refcount is greater than 1 */ 1624 assert(!unicode_is_singleton(unicode)); 1625#endif 1626 return 1; 1627} 1628 1629static int 1630unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1631{ 1632 PyObject *unicode; 1633 Py_ssize_t old_length; 1634 1635 assert(p_unicode != NULL); 1636 unicode = *p_unicode; 1637 1638 assert(unicode != NULL); 1639 assert(PyUnicode_Check(unicode)); 1640 assert(0 <= length); 1641 1642 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1643 old_length = PyUnicode_WSTR_LENGTH(unicode); 1644 else 1645 old_length = PyUnicode_GET_LENGTH(unicode); 1646 if (old_length == length) 1647 return 0; 1648 1649 if (length == 0) { 1650 _Py_INCREF_UNICODE_EMPTY(); 1651 if (!unicode_empty) 1652 return -1; 1653 Py_DECREF(*p_unicode); 1654 *p_unicode = unicode_empty; 1655 return 0; 1656 } 1657 1658 if (!unicode_modifiable(unicode)) { 1659 PyObject *copy = resize_copy(unicode, length); 1660 if (copy == NULL) 1661 
return -1; 1662 Py_DECREF(*p_unicode); 1663 *p_unicode = copy; 1664 return 0; 1665 } 1666 1667 if (PyUnicode_IS_COMPACT(unicode)) { 1668 PyObject *new_unicode = resize_compact(unicode, length); 1669 if (new_unicode == NULL) 1670 return -1; 1671 *p_unicode = new_unicode; 1672 return 0; 1673 } 1674 return resize_inplace(unicode, length); 1675} 1676 1677int 1678PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1679{ 1680 PyObject *unicode; 1681 if (p_unicode == NULL) { 1682 PyErr_BadInternalCall(); 1683 return -1; 1684 } 1685 unicode = *p_unicode; 1686 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1687 { 1688 PyErr_BadInternalCall(); 1689 return -1; 1690 } 1691 return unicode_resize(p_unicode, length); 1692} 1693 1694/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1695 1696 WARNING: The function doesn't copy the terminating null character and 1697 doesn't check the maximum character (may write a latin1 character in an 1698 ASCII string). */ 1699static void 1700unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1701 const char *str, Py_ssize_t len) 1702{ 1703 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1704 void *data = PyUnicode_DATA(unicode); 1705 const char *end = str + len; 1706 1707 switch (kind) { 1708 case PyUnicode_1BYTE_KIND: { 1709 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1710#ifdef Py_DEBUG 1711 if (PyUnicode_IS_ASCII(unicode)) { 1712 Py_UCS4 maxchar = ucs1lib_find_max_char( 1713 (const Py_UCS1*)str, 1714 (const Py_UCS1*)str + len); 1715 assert(maxchar < 128); 1716 } 1717#endif 1718 memcpy((char *) data + index, str, len); 1719 break; 1720 } 1721 case PyUnicode_2BYTE_KIND: { 1722 Py_UCS2 *start = (Py_UCS2 *)data + index; 1723 Py_UCS2 *ucs2 = start; 1724 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1725 1726 for (; str < end; ++ucs2, ++str) 1727 *ucs2 = (Py_UCS2)*str; 1728 1729 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1730 break; 1731 } 1732 default: { 1733 Py_UCS4 
*start = (Py_UCS4 *)data + index; 1734 Py_UCS4 *ucs4 = start; 1735 assert(kind == PyUnicode_4BYTE_KIND); 1736 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1737 1738 for (; str < end; ++ucs4, ++str) 1739 *ucs4 = (Py_UCS4)*str; 1740 1741 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1742 } 1743 } 1744} 1745 1746 1747static PyObject* 1748get_latin1_char(unsigned char ch) 1749{ 1750 PyObject *unicode = unicode_latin1[ch]; 1751 if (!unicode) { 1752 unicode = PyUnicode_New(1, ch); 1753 if (!unicode) 1754 return NULL; 1755 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1756 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1757 unicode_latin1[ch] = unicode; 1758 } 1759 Py_INCREF(unicode); 1760 return unicode; 1761} 1762 1763PyObject * 1764PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1765{ 1766 PyObject *unicode; 1767 Py_UCS4 maxchar = 0; 1768 Py_ssize_t num_surrogates; 1769 1770 if (u == NULL) 1771 return (PyObject*)_PyUnicode_New(size); 1772 1773 /* If the Unicode data is known at construction time, we can apply 1774 some optimizations which share commonly used objects. 
*/ 1775 1776 /* Optimization for empty strings */ 1777 if (size == 0) 1778 _Py_RETURN_UNICODE_EMPTY(); 1779 1780 /* Single character Unicode objects in the Latin-1 range are 1781 shared when using this constructor */ 1782 if (size == 1 && (Py_UCS4)*u < 256) 1783 return get_latin1_char((unsigned char)*u); 1784 1785 /* If not empty and not single character, copy the Unicode data 1786 into the new object */ 1787 if (find_maxchar_surrogates(u, u + size, 1788 &maxchar, &num_surrogates) == -1) 1789 return NULL; 1790 1791 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1792 if (!unicode) 1793 return NULL; 1794 1795 switch (PyUnicode_KIND(unicode)) { 1796 case PyUnicode_1BYTE_KIND: 1797 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1798 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1799 break; 1800 case PyUnicode_2BYTE_KIND: 1801#if Py_UNICODE_SIZE == 2 1802 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1803#else 1804 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1805 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1806#endif 1807 break; 1808 case PyUnicode_4BYTE_KIND: 1809#if SIZEOF_WCHAR_T == 2 1810 /* This is the only case which has to process surrogates, thus 1811 a simple copy loop is not enough and we need a function. 
*/ 1812 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1813#else 1814 assert(num_surrogates == 0); 1815 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1816#endif 1817 break; 1818 default: 1819 assert(0 && "Impossible state"); 1820 } 1821 1822 return unicode_result(unicode); 1823} 1824 1825PyObject * 1826PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1827{ 1828 if (size < 0) { 1829 PyErr_SetString(PyExc_SystemError, 1830 "Negative size passed to PyUnicode_FromStringAndSize"); 1831 return NULL; 1832 } 1833 if (u != NULL) 1834 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1835 else 1836 return (PyObject *)_PyUnicode_New(size); 1837} 1838 1839PyObject * 1840PyUnicode_FromString(const char *u) 1841{ 1842 size_t size = strlen(u); 1843 if (size > PY_SSIZE_T_MAX) { 1844 PyErr_SetString(PyExc_OverflowError, "input too long"); 1845 return NULL; 1846 } 1847 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1848} 1849 1850PyObject * 1851_PyUnicode_FromId(_Py_Identifier *id) 1852{ 1853 if (!id->object) { 1854 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1855 strlen(id->string), 1856 NULL, NULL); 1857 if (!id->object) 1858 return NULL; 1859 PyUnicode_InternInPlace(&id->object); 1860 assert(!id->next); 1861 id->next = static_strings; 1862 static_strings = id; 1863 } 1864 return id->object; 1865} 1866 1867void 1868_PyUnicode_ClearStaticStrings() 1869{ 1870 _Py_Identifier *tmp, *s = static_strings; 1871 while (s) { 1872 Py_DECREF(s->object); 1873 s->object = NULL; 1874 tmp = s->next; 1875 s->next = NULL; 1876 s = tmp; 1877 } 1878 static_strings = NULL; 1879} 1880 1881/* Internal function, doesn't check maximum character */ 1882 1883PyObject* 1884_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1885{ 1886 const unsigned char *s = (const unsigned char *)buffer; 1887 PyObject *unicode; 1888 if (size == 1) { 1889#ifdef Py_DEBUG 1890 assert((unsigned char)s[0] < 128); 1891#endif 1892 return 
get_latin1_char(s[0]); 1893 } 1894 unicode = PyUnicode_New(size, 127); 1895 if (!unicode) 1896 return NULL; 1897 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1898 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1899 return unicode; 1900} 1901 1902static Py_UCS4 1903kind_maxchar_limit(unsigned int kind) 1904{ 1905 switch (kind) { 1906 case PyUnicode_1BYTE_KIND: 1907 return 0x80; 1908 case PyUnicode_2BYTE_KIND: 1909 return 0x100; 1910 case PyUnicode_4BYTE_KIND: 1911 return 0x10000; 1912 default: 1913 assert(0 && "invalid kind"); 1914 return MAX_UNICODE; 1915 } 1916} 1917 1918Py_LOCAL_INLINE(Py_UCS4) 1919align_maxchar(Py_UCS4 maxchar) 1920{ 1921 if (maxchar <= 127) 1922 return 127; 1923 else if (maxchar <= 255) 1924 return 255; 1925 else if (maxchar <= 65535) 1926 return 65535; 1927 else 1928 return MAX_UNICODE; 1929} 1930 1931static PyObject* 1932_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) 1933{ 1934 PyObject *res; 1935 unsigned char max_char; 1936 1937 if (size == 0) 1938 _Py_RETURN_UNICODE_EMPTY(); 1939 assert(size > 0); 1940 if (size == 1) 1941 return get_latin1_char(u[0]); 1942 1943 max_char = ucs1lib_find_max_char(u, u + size); 1944 res = PyUnicode_New(size, max_char); 1945 if (!res) 1946 return NULL; 1947 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1948 assert(_PyUnicode_CheckConsistency(res, 1)); 1949 return res; 1950} 1951 1952static PyObject* 1953_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1954{ 1955 PyObject *res; 1956 Py_UCS2 max_char; 1957 1958 if (size == 0) 1959 _Py_RETURN_UNICODE_EMPTY(); 1960 assert(size > 0); 1961 if (size == 1) { 1962 Py_UCS4 ch = u[0]; 1963 int kind; 1964 void *data; 1965 if (ch < 256) 1966 return get_latin1_char((unsigned char)ch); 1967 1968 res = PyUnicode_New(1, ch); 1969 if (res == NULL) 1970 return NULL; 1971 kind = PyUnicode_KIND(res); 1972 data = PyUnicode_DATA(res); 1973 PyUnicode_WRITE(kind, data, 0, ch); 1974 assert(_PyUnicode_CheckConsistency(res, 1)); 1975 return res; 1976 } 1977 1978 max_char 
= ucs2lib_find_max_char(u, u + size); 1979 res = PyUnicode_New(size, max_char); 1980 if (!res) 1981 return NULL; 1982 if (max_char >= 256) 1983 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1984 else { 1985 _PyUnicode_CONVERT_BYTES( 1986 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1987 } 1988 assert(_PyUnicode_CheckConsistency(res, 1)); 1989 return res; 1990} 1991 1992static PyObject* 1993_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1994{ 1995 PyObject *res; 1996 Py_UCS4 max_char; 1997 1998 if (size == 0) 1999 _Py_RETURN_UNICODE_EMPTY(); 2000 assert(size > 0); 2001 if (size == 1) { 2002 Py_UCS4 ch = u[0]; 2003 int kind; 2004 void *data; 2005 if (ch < 256) 2006 return get_latin1_char((unsigned char)ch); 2007 2008 res = PyUnicode_New(1, ch); 2009 if (res == NULL) 2010 return NULL; 2011 kind = PyUnicode_KIND(res); 2012 data = PyUnicode_DATA(res); 2013 PyUnicode_WRITE(kind, data, 0, ch); 2014 assert(_PyUnicode_CheckConsistency(res, 1)); 2015 return res; 2016 } 2017 2018 max_char = ucs4lib_find_max_char(u, u + size); 2019 res = PyUnicode_New(size, max_char); 2020 if (!res) 2021 return NULL; 2022 if (max_char < 256) 2023 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2024 PyUnicode_1BYTE_DATA(res)); 2025 else if (max_char < 0x10000) 2026 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2027 PyUnicode_2BYTE_DATA(res)); 2028 else 2029 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2030 assert(_PyUnicode_CheckConsistency(res, 1)); 2031 return res; 2032} 2033 2034PyObject* 2035PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2036{ 2037 if (size < 0) { 2038 PyErr_SetString(PyExc_ValueError, "size must be positive"); 2039 return NULL; 2040 } 2041 switch (kind) { 2042 case PyUnicode_1BYTE_KIND: 2043 return _PyUnicode_FromUCS1(buffer, size); 2044 case PyUnicode_2BYTE_KIND: 2045 return _PyUnicode_FromUCS2(buffer, size); 2046 case PyUnicode_4BYTE_KIND: 2047 return 
_PyUnicode_FromUCS4(buffer, size); 2048 default: 2049 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2050 return NULL; 2051 } 2052} 2053 2054Py_UCS4 2055_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2056{ 2057 enum PyUnicode_Kind kind; 2058 void *startptr, *endptr; 2059 2060 assert(PyUnicode_IS_READY(unicode)); 2061 assert(0 <= start); 2062 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2063 assert(start <= end); 2064 2065 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2066 return PyUnicode_MAX_CHAR_VALUE(unicode); 2067 2068 if (start == end) 2069 return 127; 2070 2071 if (PyUnicode_IS_ASCII(unicode)) 2072 return 127; 2073 2074 kind = PyUnicode_KIND(unicode); 2075 startptr = PyUnicode_DATA(unicode); 2076 endptr = (char *)startptr + end * kind; 2077 startptr = (char *)startptr + start * kind; 2078 switch(kind) { 2079 case PyUnicode_1BYTE_KIND: 2080 return ucs1lib_find_max_char(startptr, endptr); 2081 case PyUnicode_2BYTE_KIND: 2082 return ucs2lib_find_max_char(startptr, endptr); 2083 case PyUnicode_4BYTE_KIND: 2084 return ucs4lib_find_max_char(startptr, endptr); 2085 default: 2086 assert(0); 2087 return 0; 2088 } 2089} 2090 2091/* Ensure that a string uses the most efficient storage, if it is not the 2092 case: create a new string with of the right kind. Write NULL into *p_unicode 2093 on error. 
*/ 2094static void 2095unicode_adjust_maxchar(PyObject **p_unicode) 2096{ 2097 PyObject *unicode, *copy; 2098 Py_UCS4 max_char; 2099 Py_ssize_t len; 2100 unsigned int kind; 2101 2102 assert(p_unicode != NULL); 2103 unicode = *p_unicode; 2104 assert(PyUnicode_IS_READY(unicode)); 2105 if (PyUnicode_IS_ASCII(unicode)) 2106 return; 2107 2108 len = PyUnicode_GET_LENGTH(unicode); 2109 kind = PyUnicode_KIND(unicode); 2110 if (kind == PyUnicode_1BYTE_KIND) { 2111 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2112 max_char = ucs1lib_find_max_char(u, u + len); 2113 if (max_char >= 128) 2114 return; 2115 } 2116 else if (kind == PyUnicode_2BYTE_KIND) { 2117 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2118 max_char = ucs2lib_find_max_char(u, u + len); 2119 if (max_char >= 256) 2120 return; 2121 } 2122 else { 2123 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2124 assert(kind == PyUnicode_4BYTE_KIND); 2125 max_char = ucs4lib_find_max_char(u, u + len); 2126 if (max_char >= 0x10000) 2127 return; 2128 } 2129 copy = PyUnicode_New(len, max_char); 2130 if (copy != NULL) 2131 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2132 Py_DECREF(unicode); 2133 *p_unicode = copy; 2134} 2135 2136PyObject* 2137_PyUnicode_Copy(PyObject *unicode) 2138{ 2139 Py_ssize_t length; 2140 PyObject *copy; 2141 2142 if (!PyUnicode_Check(unicode)) { 2143 PyErr_BadInternalCall(); 2144 return NULL; 2145 } 2146 if (PyUnicode_READY(unicode) == -1) 2147 return NULL; 2148 2149 length = PyUnicode_GET_LENGTH(unicode); 2150 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2151 if (!copy) 2152 return NULL; 2153 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2154 2155 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2156 length * PyUnicode_KIND(unicode)); 2157 assert(_PyUnicode_CheckConsistency(copy, 1)); 2158 return copy; 2159} 2160 2161 2162/* Widen Unicode objects to larger buffers. Don't write terminating null 2163 character. Return NULL on error. 
*/ 2164 2165void* 2166_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2167{ 2168 Py_ssize_t len; 2169 void *result; 2170 unsigned int skind; 2171 2172 if (PyUnicode_READY(s) == -1) 2173 return NULL; 2174 2175 len = PyUnicode_GET_LENGTH(s); 2176 skind = PyUnicode_KIND(s); 2177 if (skind >= kind) { 2178 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2179 return NULL; 2180 } 2181 switch (kind) { 2182 case PyUnicode_2BYTE_KIND: 2183 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 2184 if (!result) 2185 return PyErr_NoMemory(); 2186 assert(skind == PyUnicode_1BYTE_KIND); 2187 _PyUnicode_CONVERT_BYTES( 2188 Py_UCS1, Py_UCS2, 2189 PyUnicode_1BYTE_DATA(s), 2190 PyUnicode_1BYTE_DATA(s) + len, 2191 result); 2192 return result; 2193 case PyUnicode_4BYTE_KIND: 2194 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 2195 if (!result) 2196 return PyErr_NoMemory(); 2197 if (skind == PyUnicode_2BYTE_KIND) { 2198 _PyUnicode_CONVERT_BYTES( 2199 Py_UCS2, Py_UCS4, 2200 PyUnicode_2BYTE_DATA(s), 2201 PyUnicode_2BYTE_DATA(s) + len, 2202 result); 2203 } 2204 else { 2205 assert(skind == PyUnicode_1BYTE_KIND); 2206 _PyUnicode_CONVERT_BYTES( 2207 Py_UCS1, Py_UCS4, 2208 PyUnicode_1BYTE_DATA(s), 2209 PyUnicode_1BYTE_DATA(s) + len, 2210 result); 2211 } 2212 return result; 2213 default: 2214 break; 2215 } 2216 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2217 return NULL; 2218} 2219 2220static Py_UCS4* 2221as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2222 int copy_null) 2223{ 2224 int kind; 2225 void *data; 2226 Py_ssize_t len, targetlen; 2227 if (PyUnicode_READY(string) == -1) 2228 return NULL; 2229 kind = PyUnicode_KIND(string); 2230 data = PyUnicode_DATA(string); 2231 len = PyUnicode_GET_LENGTH(string); 2232 targetlen = len; 2233 if (copy_null) 2234 targetlen++; 2235 if (!target) { 2236 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 2237 PyErr_NoMemory(); 2238 return NULL; 2239 } 2240 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 2241 if 
(!target) { 2242 PyErr_NoMemory(); 2243 return NULL; 2244 } 2245 } 2246 else { 2247 if (targetsize < targetlen) { 2248 PyErr_Format(PyExc_SystemError, 2249 "string is longer than the buffer"); 2250 if (copy_null && 0 < targetsize) 2251 target[0] = 0; 2252 return NULL; 2253 } 2254 } 2255 if (kind == PyUnicode_1BYTE_KIND) { 2256 Py_UCS1 *start = (Py_UCS1 *) data; 2257 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2258 } 2259 else if (kind == PyUnicode_2BYTE_KIND) { 2260 Py_UCS2 *start = (Py_UCS2 *) data; 2261 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2262 } 2263 else { 2264 assert(kind == PyUnicode_4BYTE_KIND); 2265 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2266 } 2267 if (copy_null) 2268 target[len] = 0; 2269 return target; 2270} 2271 2272Py_UCS4* 2273PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2274 int copy_null) 2275{ 2276 if (target == NULL || targetsize < 0) { 2277 PyErr_BadInternalCall(); 2278 return NULL; 2279 } 2280 return as_ucs4(string, target, targetsize, copy_null); 2281} 2282 2283Py_UCS4* 2284PyUnicode_AsUCS4Copy(PyObject *string) 2285{ 2286 return as_ucs4(string, NULL, 0, 1); 2287} 2288 2289#ifdef HAVE_WCHAR_H 2290 2291PyObject * 2292PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2293{ 2294 if (w == NULL) { 2295 if (size == 0) 2296 _Py_RETURN_UNICODE_EMPTY(); 2297 PyErr_BadInternalCall(); 2298 return NULL; 2299 } 2300 2301 if (size == -1) { 2302 size = wcslen(w); 2303 } 2304 2305 return PyUnicode_FromUnicode(w, size); 2306} 2307 2308#endif /* HAVE_WCHAR_H */ 2309 2310static void 2311makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2312 char c) 2313{ 2314 *fmt++ = '%'; 2315 if (longflag) 2316 *fmt++ = 'l'; 2317 else if (longlongflag) { 2318 /* longlongflag should only ever be nonzero on machines with 2319 HAVE_LONG_LONG defined */ 2320#ifdef HAVE_LONG_LONG 2321 char *f = PY_FORMAT_LONG_LONG; 2322 while (*f) 2323 *fmt++ = 
*f++; 2324#else 2325 /* we shouldn't ever get here */ 2326 assert(0); 2327 *fmt++ = 'l'; 2328#endif 2329 } 2330 else if (size_tflag) { 2331 char *f = PY_FORMAT_SIZE_T; 2332 while (*f) 2333 *fmt++ = *f++; 2334 } 2335 *fmt++ = c; 2336 *fmt = '\0'; 2337} 2338 2339/* maximum number of characters required for output of %lld or %p. 2340 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2341 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2342#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2343 2344static int 2345unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, 2346 Py_ssize_t width, Py_ssize_t precision) 2347{ 2348 Py_ssize_t length, fill, arglen; 2349 Py_UCS4 maxchar; 2350 2351 if (PyUnicode_READY(str) == -1) 2352 return -1; 2353 2354 length = PyUnicode_GET_LENGTH(str); 2355 if ((precision == -1 || precision >= length) 2356 && width <= length) 2357 return _PyUnicodeWriter_WriteStr(writer, str); 2358 2359 if (precision != -1) 2360 length = Py_MIN(precision, length); 2361 2362 arglen = Py_MAX(length, width); 2363 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 2364 maxchar = _PyUnicode_FindMaxChar(str, 0, length); 2365 else 2366 maxchar = writer->maxchar; 2367 2368 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) 2369 return -1; 2370 2371 if (width > length) { 2372 fill = width - length; 2373 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) 2374 return -1; 2375 writer->pos += fill; 2376 } 2377 2378 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 2379 str, 0, length); 2380 writer->pos += length; 2381 return 0; 2382} 2383 2384static int 2385unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, 2386 Py_ssize_t width, Py_ssize_t precision) 2387{ 2388 /* UTF-8 */ 2389 Py_ssize_t length; 2390 PyObject *unicode; 2391 int res; 2392 2393 length = strlen(str); 2394 if (precision != -1) 2395 length = Py_MIN(length, precision); 2396 unicode = 
PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); 2397 if (unicode == NULL) 2398 return -1; 2399 2400 res = unicode_fromformat_write_str(writer, unicode, width, -1); 2401 Py_DECREF(unicode); 2402 return res; 2403} 2404 2405static const char* 2406unicode_fromformat_arg(_PyUnicodeWriter *writer, 2407 const char *f, va_list *vargs) 2408{ 2409 const char *p; 2410 Py_ssize_t len; 2411 int zeropad; 2412 Py_ssize_t width; 2413 Py_ssize_t precision; 2414 int longflag; 2415 int longlongflag; 2416 int size_tflag; 2417 Py_ssize_t fill; 2418 2419 p = f; 2420 f++; 2421 zeropad = 0; 2422 if (*f == '0') { 2423 zeropad = 1; 2424 f++; 2425 } 2426 2427 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2428 width = -1; 2429 if (Py_ISDIGIT((unsigned)*f)) { 2430 width = *f - '0'; 2431 f++; 2432 while (Py_ISDIGIT((unsigned)*f)) { 2433 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2434 PyErr_SetString(PyExc_ValueError, 2435 "width too big"); 2436 return NULL; 2437 } 2438 width = (width * 10) + (*f - '0'); 2439 f++; 2440 } 2441 } 2442 precision = -1; 2443 if (*f == '.') { 2444 f++; 2445 if (Py_ISDIGIT((unsigned)*f)) { 2446 precision = (*f - '0'); 2447 f++; 2448 while (Py_ISDIGIT((unsigned)*f)) { 2449 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2450 PyErr_SetString(PyExc_ValueError, 2451 "precision too big"); 2452 return NULL; 2453 } 2454 precision = (precision * 10) + (*f - '0'); 2455 f++; 2456 } 2457 } 2458 if (*f == '%') { 2459 /* "%.3%s" => f points to "3" */ 2460 f--; 2461 } 2462 } 2463 if (*f == '\0') { 2464 /* bogus format "%.123" => go backward, f points to "3" */ 2465 f--; 2466 } 2467 2468 /* Handle %ld, %lu, %lld and %llu. 
*/ 2469 longflag = 0; 2470 longlongflag = 0; 2471 size_tflag = 0; 2472 if (*f == 'l') { 2473 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2474 longflag = 1; 2475 ++f; 2476 } 2477#ifdef HAVE_LONG_LONG 2478 else if (f[1] == 'l' && 2479 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2480 longlongflag = 1; 2481 f += 2; 2482 } 2483#endif 2484 } 2485 /* handle the size_t flag. */ 2486 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2487 size_tflag = 1; 2488 ++f; 2489 } 2490 2491 if (f[1] == '\0') 2492 writer->overallocate = 0; 2493 2494 switch (*f) { 2495 case 'c': 2496 { 2497 int ordinal = va_arg(*vargs, int); 2498 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2499 PyErr_SetString(PyExc_OverflowError, 2500 "character argument not in range(0x110000)"); 2501 return NULL; 2502 } 2503 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) 2504 return NULL; 2505 break; 2506 } 2507 2508 case 'i': 2509 case 'd': 2510 case 'u': 2511 case 'x': 2512 { 2513 /* used by sprintf */ 2514 char fmt[10]; /* should be enough for "%0lld\0" */ 2515 char buffer[MAX_LONG_LONG_CHARS]; 2516 Py_ssize_t arglen; 2517 2518 if (*f == 'u') { 2519 makefmt(fmt, longflag, longlongflag, size_tflag, *f); 2520 2521 if (longflag) 2522 len = sprintf(buffer, fmt, 2523 va_arg(*vargs, unsigned long)); 2524#ifdef HAVE_LONG_LONG 2525 else if (longlongflag) 2526 len = sprintf(buffer, fmt, 2527 va_arg(*vargs, unsigned PY_LONG_LONG)); 2528#endif 2529 else if (size_tflag) 2530 len = sprintf(buffer, fmt, 2531 va_arg(*vargs, size_t)); 2532 else 2533 len = sprintf(buffer, fmt, 2534 va_arg(*vargs, unsigned int)); 2535 } 2536 else if (*f == 'x') { 2537 makefmt(fmt, 0, 0, 0, 'x'); 2538 len = sprintf(buffer, fmt, va_arg(*vargs, int)); 2539 } 2540 else { 2541 makefmt(fmt, longflag, longlongflag, size_tflag, *f); 2542 2543 if (longflag) 2544 len = sprintf(buffer, fmt, 2545 va_arg(*vargs, long)); 2546#ifdef HAVE_LONG_LONG 2547 else if (longlongflag) 2548 len = sprintf(buffer, fmt, 2549 va_arg(*vargs, 
PY_LONG_LONG)); 2550#endif 2551 else if (size_tflag) 2552 len = sprintf(buffer, fmt, 2553 va_arg(*vargs, Py_ssize_t)); 2554 else 2555 len = sprintf(buffer, fmt, 2556 va_arg(*vargs, int)); 2557 } 2558 assert(len >= 0); 2559 2560 if (precision < len) 2561 precision = len; 2562 2563 arglen = Py_MAX(precision, width); 2564 assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127); 2565 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) 2566 return NULL; 2567 2568 if (width > precision) { 2569 Py_UCS4 fillchar; 2570 fill = width - precision; 2571 fillchar = zeropad?'0':' '; 2572 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) 2573 return NULL; 2574 writer->pos += fill; 2575 } 2576 if (precision > len) { 2577 fill = precision - len; 2578 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) 2579 return NULL; 2580 writer->pos += fill; 2581 } 2582 2583 unicode_write_cstr(writer->buffer, writer->pos, buffer, len); 2584 writer->pos += len; 2585 break; 2586 } 2587 2588 case 'p': 2589 { 2590 char number[MAX_LONG_LONG_CHARS]; 2591 2592 len = sprintf(number, "%p", va_arg(*vargs, void*)); 2593 assert(len >= 0); 2594 2595 /* %p is ill-defined: ensure leading 0x. 
*/ 2596 if (number[1] == 'X') 2597 number[1] = 'x'; 2598 else if (number[1] != 'x') { 2599 memmove(number + 2, number, 2600 strlen(number) + 1); 2601 number[0] = '0'; 2602 number[1] = 'x'; 2603 len += 2; 2604 } 2605 2606 assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127); 2607 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 2608 return NULL; 2609 unicode_write_cstr(writer->buffer, writer->pos, number, len); 2610 writer->pos += len; 2611 break; 2612 } 2613 2614 case 's': 2615 { 2616 /* UTF-8 */ 2617 const char *s = va_arg(*vargs, const char*); 2618 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) 2619 return NULL; 2620 break; 2621 } 2622 2623 case 'U': 2624 { 2625 PyObject *obj = va_arg(*vargs, PyObject *); 2626 assert(obj && _PyUnicode_CHECK(obj)); 2627 2628 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2629 return NULL; 2630 break; 2631 } 2632 2633 case 'V': 2634 { 2635 PyObject *obj = va_arg(*vargs, PyObject *); 2636 const char *str = va_arg(*vargs, const char *); 2637 if (obj) { 2638 assert(_PyUnicode_CHECK(obj)); 2639 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2640 return NULL; 2641 } 2642 else { 2643 assert(str != NULL); 2644 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) 2645 return NULL; 2646 } 2647 break; 2648 } 2649 2650 case 'S': 2651 { 2652 PyObject *obj = va_arg(*vargs, PyObject *); 2653 PyObject *str; 2654 assert(obj); 2655 str = PyObject_Str(obj); 2656 if (!str) 2657 return NULL; 2658 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { 2659 Py_DECREF(str); 2660 return NULL; 2661 } 2662 Py_DECREF(str); 2663 break; 2664 } 2665 2666 case 'R': 2667 { 2668 PyObject *obj = va_arg(*vargs, PyObject *); 2669 PyObject *repr; 2670 assert(obj); 2671 repr = PyObject_Repr(obj); 2672 if (!repr) 2673 return NULL; 2674 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { 2675 Py_DECREF(repr); 2676 
return NULL; 2677 } 2678 Py_DECREF(repr); 2679 break; 2680 } 2681 2682 case 'A': 2683 { 2684 PyObject *obj = va_arg(*vargs, PyObject *); 2685 PyObject *ascii; 2686 assert(obj); 2687 ascii = PyObject_ASCII(obj); 2688 if (!ascii) 2689 return NULL; 2690 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { 2691 Py_DECREF(ascii); 2692 return NULL; 2693 } 2694 Py_DECREF(ascii); 2695 break; 2696 } 2697 2698 case '%': 2699 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 2700 return NULL; 2701 break; 2702 2703 default: 2704 /* if we stumble upon an unknown formatting code, copy the rest 2705 of the format string to the output string. (we cannot just 2706 skip the code, since there's no way to know what's in the 2707 argument list) */ 2708 len = strlen(p); 2709 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1) 2710 return NULL; 2711 f = p+len; 2712 return f; 2713 } 2714 2715 f++; 2716 return f; 2717} 2718 2719PyObject * 2720PyUnicode_FromFormatV(const char *format, va_list vargs) 2721{ 2722 va_list vargs2; 2723 const char *f; 2724 _PyUnicodeWriter writer; 2725 2726 _PyUnicodeWriter_Init(&writer); 2727 writer.min_length = strlen(format) + 100; 2728 writer.overallocate = 1; 2729 2730 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). 2731 Copy it to be able to pass a reference to a subfunction. 
*/ 2732 Py_VA_COPY(vargs2, vargs); 2733 2734 for (f = format; *f; ) { 2735 if (*f == '%') { 2736 f = unicode_fromformat_arg(&writer, f, &vargs2); 2737 if (f == NULL) 2738 goto fail; 2739 } 2740 else { 2741 const char *p; 2742 Py_ssize_t len; 2743 2744 p = f; 2745 do 2746 { 2747 if ((unsigned char)*p > 127) { 2748 PyErr_Format(PyExc_ValueError, 2749 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2750 "string, got a non-ASCII byte: 0x%02x", 2751 (unsigned char)*p); 2752 return NULL; 2753 } 2754 p++; 2755 } 2756 while (*p != '\0' && *p != '%'); 2757 len = p - f; 2758 2759 if (*p == '\0') 2760 writer.overallocate = 0; 2761 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) 2762 goto fail; 2763 unicode_write_cstr(writer.buffer, writer.pos, f, len); 2764 writer.pos += len; 2765 2766 f = p; 2767 } 2768 } 2769 return _PyUnicodeWriter_Finish(&writer); 2770 2771 fail: 2772 _PyUnicodeWriter_Dealloc(&writer); 2773 return NULL; 2774} 2775 2776PyObject * 2777PyUnicode_FromFormat(const char *format, ...) 2778{ 2779 PyObject* ret; 2780 va_list vargs; 2781 2782#ifdef HAVE_STDARG_PROTOTYPES 2783 va_start(vargs, format); 2784#else 2785 va_start(vargs); 2786#endif 2787 ret = PyUnicode_FromFormatV(format, vargs); 2788 va_end(vargs); 2789 return ret; 2790} 2791 2792#ifdef HAVE_WCHAR_H 2793 2794/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2795 convert a Unicode object to a wide character string. 2796 2797 - If w is NULL: return the number of wide characters (including the null 2798 character) required to convert the unicode object. Ignore size argument. 2799 2800 - Otherwise: return the number of wide characters (excluding the null 2801 character) written into w. Write at most size wide characters (including 2802 the null character). 
*/ 2803static Py_ssize_t 2804unicode_aswidechar(PyObject *unicode, 2805 wchar_t *w, 2806 Py_ssize_t size) 2807{ 2808 Py_ssize_t res; 2809 const wchar_t *wstr; 2810 2811 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2812 if (wstr == NULL) 2813 return -1; 2814 2815 if (w != NULL) { 2816 if (size > res) 2817 size = res + 1; 2818 else 2819 res = size; 2820 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2821 return res; 2822 } 2823 else 2824 return res + 1; 2825} 2826 2827Py_ssize_t 2828PyUnicode_AsWideChar(PyObject *unicode, 2829 wchar_t *w, 2830 Py_ssize_t size) 2831{ 2832 if (unicode == NULL) { 2833 PyErr_BadInternalCall(); 2834 return -1; 2835 } 2836 return unicode_aswidechar(unicode, w, size); 2837} 2838 2839wchar_t* 2840PyUnicode_AsWideCharString(PyObject *unicode, 2841 Py_ssize_t *size) 2842{ 2843 wchar_t* buffer; 2844 Py_ssize_t buflen; 2845 2846 if (unicode == NULL) { 2847 PyErr_BadInternalCall(); 2848 return NULL; 2849 } 2850 2851 buflen = unicode_aswidechar(unicode, NULL, 0); 2852 if (buflen == -1) 2853 return NULL; 2854 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2855 PyErr_NoMemory(); 2856 return NULL; 2857 } 2858 2859 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2860 if (buffer == NULL) { 2861 PyErr_NoMemory(); 2862 return NULL; 2863 } 2864 buflen = unicode_aswidechar(unicode, buffer, buflen); 2865 if (buflen == -1) { 2866 PyMem_FREE(buffer); 2867 return NULL; 2868 } 2869 if (size != NULL) 2870 *size = buflen; 2871 return buffer; 2872} 2873 2874#endif /* HAVE_WCHAR_H */ 2875 2876PyObject * 2877PyUnicode_FromOrdinal(int ordinal) 2878{ 2879 PyObject *v; 2880 void *data; 2881 int kind; 2882 2883 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2884 PyErr_SetString(PyExc_ValueError, 2885 "chr() arg not in range(0x110000)"); 2886 return NULL; 2887 } 2888 2889 if ((Py_UCS4)ordinal < 256) 2890 return get_latin1_char((unsigned char)ordinal); 2891 2892 v = PyUnicode_New(1, ordinal); 2893 if (v == NULL) 2894 return NULL; 2895 kind = PyUnicode_KIND(v); 2896 data 
= PyUnicode_DATA(v); 2897 PyUnicode_WRITE(kind, data, 0, ordinal); 2898 assert(_PyUnicode_CheckConsistency(v, 1)); 2899 return v; 2900} 2901 2902PyObject * 2903PyUnicode_FromObject(register PyObject *obj) 2904{ 2905 /* XXX Perhaps we should make this API an alias of 2906 PyObject_Str() instead ?! */ 2907 if (PyUnicode_CheckExact(obj)) { 2908 if (PyUnicode_READY(obj) == -1) 2909 return NULL; 2910 Py_INCREF(obj); 2911 return obj; 2912 } 2913 if (PyUnicode_Check(obj)) { 2914 /* For a Unicode subtype that's not a Unicode object, 2915 return a true Unicode object with the same data. */ 2916 return _PyUnicode_Copy(obj); 2917 } 2918 PyErr_Format(PyExc_TypeError, 2919 "Can't convert '%.100s' object to str implicitly", 2920 Py_TYPE(obj)->tp_name); 2921 return NULL; 2922} 2923 2924PyObject * 2925PyUnicode_FromEncodedObject(register PyObject *obj, 2926 const char *encoding, 2927 const char *errors) 2928{ 2929 Py_buffer buffer; 2930 PyObject *v; 2931 2932 if (obj == NULL) { 2933 PyErr_BadInternalCall(); 2934 return NULL; 2935 } 2936 2937 /* Decoding bytes objects is the most common case and should be fast */ 2938 if (PyBytes_Check(obj)) { 2939 if (PyBytes_GET_SIZE(obj) == 0) 2940 _Py_RETURN_UNICODE_EMPTY(); 2941 v = PyUnicode_Decode( 2942 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2943 encoding, errors); 2944 return v; 2945 } 2946 2947 if (PyUnicode_Check(obj)) { 2948 PyErr_SetString(PyExc_TypeError, 2949 "decoding str is not supported"); 2950 return NULL; 2951 } 2952 2953 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2954 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2955 PyErr_Format(PyExc_TypeError, 2956 "coercing to str: need bytes, bytearray " 2957 "or buffer-like object, %.80s found", 2958 Py_TYPE(obj)->tp_name); 2959 return NULL; 2960 } 2961 2962 if (buffer.len == 0) { 2963 PyBuffer_Release(&buffer); 2964 _Py_RETURN_UNICODE_EMPTY(); 2965 } 2966 2967 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2968 
PyBuffer_Release(&buffer); 2969 return v; 2970} 2971 2972/* Convert encoding to lower case and replace '_' with '-' in order to 2973 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2974 1 on success. */ 2975int 2976_Py_normalize_encoding(const char *encoding, 2977 char *lower, 2978 size_t lower_len) 2979{ 2980 const char *e; 2981 char *l; 2982 char *l_end; 2983 2984 if (encoding == NULL) { 2985 strcpy(lower, "utf-8"); 2986 return 1; 2987 } 2988 e = encoding; 2989 l = lower; 2990 l_end = &lower[lower_len - 1]; 2991 while (*e) { 2992 if (l == l_end) 2993 return 0; 2994 if (Py_ISUPPER(*e)) { 2995 *l++ = Py_TOLOWER(*e++); 2996 } 2997 else if (*e == '_') { 2998 *l++ = '-'; 2999 e++; 3000 } 3001 else { 3002 *l++ = *e++; 3003 } 3004 } 3005 *l = '\0'; 3006 return 1; 3007} 3008 3009PyObject * 3010PyUnicode_Decode(const char *s, 3011 Py_ssize_t size, 3012 const char *encoding, 3013 const char *errors) 3014{ 3015 PyObject *buffer = NULL, *unicode; 3016 Py_buffer info; 3017 char lower[11]; /* Enough for any encoding shortcut */ 3018 3019 /* Shortcuts for common default encodings */ 3020 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 3021 if ((strcmp(lower, "utf-8") == 0) || 3022 (strcmp(lower, "utf8") == 0)) 3023 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3024 else if ((strcmp(lower, "latin-1") == 0) || 3025 (strcmp(lower, "latin1") == 0) || 3026 (strcmp(lower, "iso-8859-1") == 0)) 3027 return PyUnicode_DecodeLatin1(s, size, errors); 3028#ifdef HAVE_MBCS 3029 else if (strcmp(lower, "mbcs") == 0) 3030 return PyUnicode_DecodeMBCS(s, size, errors); 3031#endif 3032 else if (strcmp(lower, "ascii") == 0) 3033 return PyUnicode_DecodeASCII(s, size, errors); 3034 else if (strcmp(lower, "utf-16") == 0) 3035 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3036 else if (strcmp(lower, "utf-32") == 0) 3037 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3038 } 3039 3040 /* Decode via the codec registry */ 3041 buffer = NULL; 
3042 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3043 goto onError; 3044 buffer = PyMemoryView_FromBuffer(&info); 3045 if (buffer == NULL) 3046 goto onError; 3047 unicode = PyCodec_Decode(buffer, encoding, errors); 3048 if (unicode == NULL) 3049 goto onError; 3050 if (!PyUnicode_Check(unicode)) { 3051 PyErr_Format(PyExc_TypeError, 3052 "decoder did not return a str object (type=%.400s)", 3053 Py_TYPE(unicode)->tp_name); 3054 Py_DECREF(unicode); 3055 goto onError; 3056 } 3057 Py_DECREF(buffer); 3058 return unicode_result(unicode); 3059 3060 onError: 3061 Py_XDECREF(buffer); 3062 return NULL; 3063} 3064 3065PyObject * 3066PyUnicode_AsDecodedObject(PyObject *unicode, 3067 const char *encoding, 3068 const char *errors) 3069{ 3070 PyObject *v; 3071 3072 if (!PyUnicode_Check(unicode)) { 3073 PyErr_BadArgument(); 3074 goto onError; 3075 } 3076 3077 if (encoding == NULL) 3078 encoding = PyUnicode_GetDefaultEncoding(); 3079 3080 /* Decode via the codec registry */ 3081 v = PyCodec_Decode(unicode, encoding, errors); 3082 if (v == NULL) 3083 goto onError; 3084 return unicode_result(v); 3085 3086 onError: 3087 return NULL; 3088} 3089 3090PyObject * 3091PyUnicode_AsDecodedUnicode(PyObject *unicode, 3092 const char *encoding, 3093 const char *errors) 3094{ 3095 PyObject *v; 3096 3097 if (!PyUnicode_Check(unicode)) { 3098 PyErr_BadArgument(); 3099 goto onError; 3100 } 3101 3102 if (encoding == NULL) 3103 encoding = PyUnicode_GetDefaultEncoding(); 3104 3105 /* Decode via the codec registry */ 3106 v = PyCodec_Decode(unicode, encoding, errors); 3107 if (v == NULL) 3108 goto onError; 3109 if (!PyUnicode_Check(v)) { 3110 PyErr_Format(PyExc_TypeError, 3111 "decoder did not return a str object (type=%.400s)", 3112 Py_TYPE(v)->tp_name); 3113 Py_DECREF(v); 3114 goto onError; 3115 } 3116 return unicode_result(v); 3117 3118 onError: 3119 return NULL; 3120} 3121 3122PyObject * 3123PyUnicode_Encode(const Py_UNICODE *s, 3124 Py_ssize_t size, 3125 const char 
*encoding, 3126 const char *errors) 3127{ 3128 PyObject *v, *unicode; 3129 3130 unicode = PyUnicode_FromUnicode(s, size); 3131 if (unicode == NULL) 3132 return NULL; 3133 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3134 Py_DECREF(unicode); 3135 return v; 3136} 3137 3138PyObject * 3139PyUnicode_AsEncodedObject(PyObject *unicode, 3140 const char *encoding, 3141 const char *errors) 3142{ 3143 PyObject *v; 3144 3145 if (!PyUnicode_Check(unicode)) { 3146 PyErr_BadArgument(); 3147 goto onError; 3148 } 3149 3150 if (encoding == NULL) 3151 encoding = PyUnicode_GetDefaultEncoding(); 3152 3153 /* Encode via the codec registry */ 3154 v = PyCodec_Encode(unicode, encoding, errors); 3155 if (v == NULL) 3156 goto onError; 3157 return v; 3158 3159 onError: 3160 return NULL; 3161} 3162 3163static size_t 3164wcstombs_errorpos(const wchar_t *wstr) 3165{ 3166 size_t len; 3167#if SIZEOF_WCHAR_T == 2 3168 wchar_t buf[3]; 3169#else 3170 wchar_t buf[2]; 3171#endif 3172 char outbuf[MB_LEN_MAX]; 3173 const wchar_t *start, *previous; 3174 3175#if SIZEOF_WCHAR_T == 2 3176 buf[2] = 0; 3177#else 3178 buf[1] = 0; 3179#endif 3180 start = wstr; 3181 while (*wstr != L'\0') 3182 { 3183 previous = wstr; 3184#if SIZEOF_WCHAR_T == 2 3185 if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) 3186 && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) 3187 { 3188 buf[0] = wstr[0]; 3189 buf[1] = wstr[1]; 3190 wstr += 2; 3191 } 3192 else { 3193 buf[0] = *wstr; 3194 buf[1] = 0; 3195 wstr++; 3196 } 3197#else 3198 buf[0] = *wstr; 3199 wstr++; 3200#endif 3201 len = wcstombs(outbuf, buf, sizeof(outbuf)); 3202 if (len == (size_t)-1) 3203 return previous - start; 3204 } 3205 3206 /* failed to find the unencodable character */ 3207 return 0; 3208} 3209 3210static int 3211locale_error_handler(const char *errors, int *surrogateescape) 3212{ 3213 if (errors == NULL) { 3214 *surrogateescape = 0; 3215 return 0; 3216 } 3217 3218 if (strcmp(errors, "strict") == 0) { 3219 *surrogateescape = 0; 3220 return 0; 3221 } 3222 if 
(strcmp(errors, "surrogateescape") == 0) { 3223 *surrogateescape = 1; 3224 return 0; 3225 } 3226 PyErr_Format(PyExc_ValueError, 3227 "only 'strict' and 'surrogateescape' error handlers " 3228 "are supported, not '%s'", 3229 errors); 3230 return -1; 3231} 3232 3233PyObject * 3234PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3235{ 3236 Py_ssize_t wlen, wlen2; 3237 wchar_t *wstr; 3238 PyObject *bytes = NULL; 3239 char *errmsg; 3240 PyObject *reason; 3241 PyObject *exc; 3242 size_t error_pos; 3243 int surrogateescape; 3244 3245 if (locale_error_handler(errors, &surrogateescape) < 0) 3246 return NULL; 3247 3248 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3249 if (wstr == NULL) 3250 return NULL; 3251 3252 wlen2 = wcslen(wstr); 3253 if (wlen2 != wlen) { 3254 PyMem_Free(wstr); 3255 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3256 return NULL; 3257 } 3258 3259 if (surrogateescape) { 3260 /* "surrogateescape" error handler */ 3261 char *str; 3262 3263 str = _Py_wchar2char(wstr, &error_pos); 3264 if (str == NULL) { 3265 if (error_pos == (size_t)-1) { 3266 PyErr_NoMemory(); 3267 PyMem_Free(wstr); 3268 return NULL; 3269 } 3270 else { 3271 goto encode_error; 3272 } 3273 } 3274 PyMem_Free(wstr); 3275 3276 bytes = PyBytes_FromString(str); 3277 PyMem_Free(str); 3278 } 3279 else { 3280 /* strict mode */ 3281 size_t len, len2; 3282 3283 len = wcstombs(NULL, wstr, 0); 3284 if (len == (size_t)-1) { 3285 error_pos = (size_t)-1; 3286 goto encode_error; 3287 } 3288 3289 bytes = PyBytes_FromStringAndSize(NULL, len); 3290 if (bytes == NULL) { 3291 PyMem_Free(wstr); 3292 return NULL; 3293 } 3294 3295 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3296 if (len2 == (size_t)-1 || len2 > len) { 3297 error_pos = (size_t)-1; 3298 goto encode_error; 3299 } 3300 PyMem_Free(wstr); 3301 } 3302 return bytes; 3303 3304encode_error: 3305 errmsg = strerror(errno); 3306 assert(errmsg != NULL); 3307 3308 if (error_pos == (size_t)-1) 3309 error_pos = 
wcstombs_errorpos(wstr); 3310 3311 PyMem_Free(wstr); 3312 Py_XDECREF(bytes); 3313 3314 if (errmsg != NULL) { 3315 size_t errlen; 3316 wstr = _Py_char2wchar(errmsg, &errlen); 3317 if (wstr != NULL) { 3318 reason = PyUnicode_FromWideChar(wstr, errlen); 3319 PyMem_Free(wstr); 3320 } else 3321 errmsg = NULL; 3322 } 3323 if (errmsg == NULL) 3324 reason = PyUnicode_FromString( 3325 "wcstombs() encountered an unencodable " 3326 "wide character"); 3327 if (reason == NULL) 3328 return NULL; 3329 3330 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3331 "locale", unicode, 3332 (Py_ssize_t)error_pos, 3333 (Py_ssize_t)(error_pos+1), 3334 reason); 3335 Py_DECREF(reason); 3336 if (exc != NULL) { 3337 PyCodec_StrictErrors(exc); 3338 Py_XDECREF(exc); 3339 } 3340 return NULL; 3341} 3342 3343PyObject * 3344PyUnicode_EncodeFSDefault(PyObject *unicode) 3345{ 3346#ifdef HAVE_MBCS 3347 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 3348#elif defined(__APPLE__) 3349 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3350#else 3351 PyInterpreterState *interp = PyThreadState_GET()->interp; 3352 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3353 cannot use it to encode and decode filenames before it is loaded. Load 3354 the Python codec requires to encode at least its own filename. Use the C 3355 version of the locale codec until the codec registry is initialized and 3356 the Python codec is loaded. 3357 3358 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3359 cannot only rely on it: check also interp->fscodec_initialized for 3360 subinterpreters. 
*/ 3361 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3362 return PyUnicode_AsEncodedString(unicode, 3363 Py_FileSystemDefaultEncoding, 3364 "surrogateescape"); 3365 } 3366 else { 3367 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3368 } 3369#endif 3370} 3371 3372PyObject * 3373PyUnicode_AsEncodedString(PyObject *unicode, 3374 const char *encoding, 3375 const char *errors) 3376{ 3377 PyObject *v; 3378 char lower[11]; /* Enough for any encoding shortcut */ 3379 3380 if (!PyUnicode_Check(unicode)) { 3381 PyErr_BadArgument(); 3382 return NULL; 3383 } 3384 3385 /* Shortcuts for common default encodings */ 3386 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 3387 if ((strcmp(lower, "utf-8") == 0) || 3388 (strcmp(lower, "utf8") == 0)) 3389 { 3390 if (errors == NULL || strcmp(errors, "strict") == 0) 3391 return _PyUnicode_AsUTF8String(unicode, NULL); 3392 else 3393 return _PyUnicode_AsUTF8String(unicode, errors); 3394 } 3395 else if ((strcmp(lower, "latin-1") == 0) || 3396 (strcmp(lower, "latin1") == 0) || 3397 (strcmp(lower, "iso-8859-1") == 0)) 3398 return _PyUnicode_AsLatin1String(unicode, errors); 3399#ifdef HAVE_MBCS 3400 else if (strcmp(lower, "mbcs") == 0) 3401 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3402#endif 3403 else if (strcmp(lower, "ascii") == 0) 3404 return _PyUnicode_AsASCIIString(unicode, errors); 3405 } 3406 3407 /* Encode via the codec registry */ 3408 v = PyCodec_Encode(unicode, encoding, errors); 3409 if (v == NULL) 3410 return NULL; 3411 3412 /* The normal path */ 3413 if (PyBytes_Check(v)) 3414 return v; 3415 3416 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3417 if (PyByteArray_Check(v)) { 3418 int error; 3419 PyObject *b; 3420 3421 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3422 "encoder %s returned bytearray instead of bytes", 3423 encoding); 3424 if (error) { 3425 Py_DECREF(v); 3426 return NULL; 3427 } 3428 3429 b = 
PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3430 Py_DECREF(v); 3431 return b; 3432 } 3433 3434 PyErr_Format(PyExc_TypeError, 3435 "encoder did not return a bytes object (type=%.400s)", 3436 Py_TYPE(v)->tp_name); 3437 Py_DECREF(v); 3438 return NULL; 3439} 3440 3441PyObject * 3442PyUnicode_AsEncodedUnicode(PyObject *unicode, 3443 const char *encoding, 3444 const char *errors) 3445{ 3446 PyObject *v; 3447 3448 if (!PyUnicode_Check(unicode)) { 3449 PyErr_BadArgument(); 3450 goto onError; 3451 } 3452 3453 if (encoding == NULL) 3454 encoding = PyUnicode_GetDefaultEncoding(); 3455 3456 /* Encode via the codec registry */ 3457 v = PyCodec_Encode(unicode, encoding, errors); 3458 if (v == NULL) 3459 goto onError; 3460 if (!PyUnicode_Check(v)) { 3461 PyErr_Format(PyExc_TypeError, 3462 "encoder did not return an str object (type=%.400s)", 3463 Py_TYPE(v)->tp_name); 3464 Py_DECREF(v); 3465 goto onError; 3466 } 3467 return v; 3468 3469 onError: 3470 return NULL; 3471} 3472 3473static size_t 3474mbstowcs_errorpos(const char *str, size_t len) 3475{ 3476#ifdef HAVE_MBRTOWC 3477 const char *start = str; 3478 mbstate_t mbs; 3479 size_t converted; 3480 wchar_t ch; 3481 3482 memset(&mbs, 0, sizeof mbs); 3483 while (len) 3484 { 3485 converted = mbrtowc(&ch, (char*)str, len, &mbs); 3486 if (converted == 0) 3487 /* Reached end of string */ 3488 break; 3489 if (converted == (size_t)-1 || converted == (size_t)-2) { 3490 /* Conversion error or incomplete character */ 3491 return str - start; 3492 } 3493 else { 3494 str += converted; 3495 len -= converted; 3496 } 3497 } 3498 /* failed to find the undecodable byte sequence */ 3499 return 0; 3500#endif 3501 return 0; 3502} 3503 3504PyObject* 3505PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3506 const char *errors) 3507{ 3508 wchar_t smallbuf[256]; 3509 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3510 wchar_t *wstr; 3511 size_t wlen, wlen2; 3512 PyObject *unicode; 3513 int surrogateescape; 3514 size_t 
error_pos; 3515 char *errmsg; 3516 PyObject *reason, *exc; 3517 3518 if (locale_error_handler(errors, &surrogateescape) < 0) 3519 return NULL; 3520 3521 if (str[len] != '\0' || len != strlen(str)) { 3522 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3523 return NULL; 3524 } 3525 3526 if (surrogateescape) { 3527 /* "surrogateescape" error handler */ 3528 wstr = _Py_char2wchar(str, &wlen); 3529 if (wstr == NULL) { 3530 if (wlen == (size_t)-1) 3531 PyErr_NoMemory(); 3532 else 3533 PyErr_SetFromErrno(PyExc_OSError); 3534 return NULL; 3535 } 3536 3537 unicode = PyUnicode_FromWideChar(wstr, wlen); 3538 PyMem_Free(wstr); 3539 } 3540 else { 3541 /* strict mode */ 3542#ifndef HAVE_BROKEN_MBSTOWCS 3543 wlen = mbstowcs(NULL, str, 0); 3544#else 3545 wlen = len; 3546#endif 3547 if (wlen == (size_t)-1) 3548 goto decode_error; 3549 if (wlen+1 <= smallbuf_len) { 3550 wstr = smallbuf; 3551 } 3552 else { 3553 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) 3554 return PyErr_NoMemory(); 3555 3556 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); 3557 if (!wstr) 3558 return PyErr_NoMemory(); 3559 } 3560 3561 wlen2 = mbstowcs(wstr, str, wlen+1); 3562 if (wlen2 == (size_t)-1) { 3563 if (wstr != smallbuf) 3564 PyMem_Free(wstr); 3565 goto decode_error; 3566 } 3567#ifdef HAVE_BROKEN_MBSTOWCS 3568 assert(wlen2 == wlen); 3569#endif 3570 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3571 if (wstr != smallbuf) 3572 PyMem_Free(wstr); 3573 } 3574 return unicode; 3575 3576decode_error: 3577 errmsg = strerror(errno); 3578 assert(errmsg != NULL); 3579 3580 error_pos = mbstowcs_errorpos(str, len); 3581 if (errmsg != NULL) { 3582 size_t errlen; 3583 wstr = _Py_char2wchar(errmsg, &errlen); 3584 if (wstr != NULL) { 3585 reason = PyUnicode_FromWideChar(wstr, errlen); 3586 PyMem_Free(wstr); 3587 } else 3588 errmsg = NULL; 3589 } 3590 if (errmsg == NULL) 3591 reason = PyUnicode_FromString( 3592 "mbstowcs() encountered an invalid multibyte sequence"); 3593 if (reason == NULL) 3594 return 
NULL; 3595 3596 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3597 "locale", str, len, 3598 (Py_ssize_t)error_pos, 3599 (Py_ssize_t)(error_pos+1), 3600 reason); 3601 Py_DECREF(reason); 3602 if (exc != NULL) { 3603 PyCodec_StrictErrors(exc); 3604 Py_XDECREF(exc); 3605 } 3606 return NULL; 3607} 3608 3609PyObject* 3610PyUnicode_DecodeLocale(const char *str, const char *errors) 3611{ 3612 Py_ssize_t size = (Py_ssize_t)strlen(str); 3613 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3614} 3615 3616 3617PyObject* 3618PyUnicode_DecodeFSDefault(const char *s) { 3619 Py_ssize_t size = (Py_ssize_t)strlen(s); 3620 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3621} 3622 3623PyObject* 3624PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3625{ 3626#ifdef HAVE_MBCS 3627 return PyUnicode_DecodeMBCS(s, size, NULL); 3628#elif defined(__APPLE__) 3629 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3630#else 3631 PyInterpreterState *interp = PyThreadState_GET()->interp; 3632 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3633 cannot use it to encode and decode filenames before it is loaded. Load 3634 the Python codec requires to encode at least its own filename. Use the C 3635 version of the locale codec until the codec registry is initialized and 3636 the Python codec is loaded. 3637 3638 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3639 cannot only rely on it: check also interp->fscodec_initialized for 3640 subinterpreters. 
*/ 3641 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3642 return PyUnicode_Decode(s, size, 3643 Py_FileSystemDefaultEncoding, 3644 "surrogateescape"); 3645 } 3646 else { 3647 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3648 } 3649#endif 3650} 3651 3652 3653int 3654_PyUnicode_HasNULChars(PyObject* str) 3655{ 3656 Py_ssize_t pos; 3657 3658 if (PyUnicode_READY(str) == -1) 3659 return -1; 3660 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str), 3661 PyUnicode_GET_LENGTH(str), '\0', 1); 3662 if (pos == -1) 3663 return 0; 3664 else 3665 return 1; 3666} 3667 3668int 3669PyUnicode_FSConverter(PyObject* arg, void* addr) 3670{ 3671 PyObject *output = NULL; 3672 Py_ssize_t size; 3673 void *data; 3674 if (arg == NULL) { 3675 Py_DECREF(*(PyObject**)addr); 3676 return 1; 3677 } 3678 if (PyBytes_Check(arg)) { 3679 output = arg; 3680 Py_INCREF(output); 3681 } 3682 else { 3683 arg = PyUnicode_FromObject(arg); 3684 if (!arg) 3685 return 0; 3686 output = PyUnicode_EncodeFSDefault(arg); 3687 Py_DECREF(arg); 3688 if (!output) 3689 return 0; 3690 if (!PyBytes_Check(output)) { 3691 Py_DECREF(output); 3692 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3693 return 0; 3694 } 3695 } 3696 size = PyBytes_GET_SIZE(output); 3697 data = PyBytes_AS_STRING(output); 3698 if (size != strlen(data)) { 3699 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3700 Py_DECREF(output); 3701 return 0; 3702 } 3703 *(PyObject**)addr = output; 3704 return Py_CLEANUP_SUPPORTED; 3705} 3706 3707 3708int 3709PyUnicode_FSDecoder(PyObject* arg, void* addr) 3710{ 3711 PyObject *output = NULL; 3712 if (arg == NULL) { 3713 Py_DECREF(*(PyObject**)addr); 3714 return 1; 3715 } 3716 if (PyUnicode_Check(arg)) { 3717 if (PyUnicode_READY(arg) == -1) 3718 return 0; 3719 output = arg; 3720 Py_INCREF(output); 3721 } 3722 else { 3723 arg = PyBytes_FromObject(arg); 3724 if (!arg) 3725 return 0; 3726 output = 
PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3727 PyBytes_GET_SIZE(arg)); 3728 Py_DECREF(arg); 3729 if (!output) 3730 return 0; 3731 if (!PyUnicode_Check(output)) { 3732 Py_DECREF(output); 3733 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3734 return 0; 3735 } 3736 } 3737 if (PyUnicode_READY(output) == -1) { 3738 Py_DECREF(output); 3739 return 0; 3740 } 3741 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3742 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3743 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3744 Py_DECREF(output); 3745 return 0; 3746 } 3747 *(PyObject**)addr = output; 3748 return Py_CLEANUP_SUPPORTED; 3749} 3750 3751 3752char* 3753PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3754{ 3755 PyObject *bytes; 3756 3757 if (!PyUnicode_Check(unicode)) { 3758 PyErr_BadArgument(); 3759 return NULL; 3760 } 3761 if (PyUnicode_READY(unicode) == -1) 3762 return NULL; 3763 3764 if (PyUnicode_UTF8(unicode) == NULL) { 3765 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3766 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3767 if (bytes == NULL) 3768 return NULL; 3769 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3770 if (_PyUnicode_UTF8(unicode) == NULL) { 3771 Py_DECREF(bytes); 3772 return NULL; 3773 } 3774 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3775 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3776 PyBytes_AS_STRING(bytes), 3777 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3778 Py_DECREF(bytes); 3779 } 3780 3781 if (psize) 3782 *psize = PyUnicode_UTF8_LENGTH(unicode); 3783 return PyUnicode_UTF8(unicode); 3784} 3785 3786char* 3787PyUnicode_AsUTF8(PyObject *unicode) 3788{ 3789 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3790} 3791 3792Py_UNICODE * 3793PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3794{ 3795 const unsigned char *one_byte; 3796#if SIZEOF_WCHAR_T == 4 3797 const Py_UCS2 *two_bytes; 3798#else 3799 const Py_UCS4 *four_bytes; 
3800 const Py_UCS4 *ucs4_end; 3801 Py_ssize_t num_surrogates; 3802#endif 3803 wchar_t *w; 3804 wchar_t *wchar_end; 3805 3806 if (!PyUnicode_Check(unicode)) { 3807 PyErr_BadArgument(); 3808 return NULL; 3809 } 3810 if (_PyUnicode_WSTR(unicode) == NULL) { 3811 /* Non-ASCII compact unicode object */ 3812 assert(_PyUnicode_KIND(unicode) != 0); 3813 assert(PyUnicode_IS_READY(unicode)); 3814 3815 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3816#if SIZEOF_WCHAR_T == 2 3817 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3818 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3819 num_surrogates = 0; 3820 3821 for (; four_bytes < ucs4_end; ++four_bytes) { 3822 if (*four_bytes > 0xFFFF) 3823 ++num_surrogates; 3824 } 3825 3826 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3827 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3828 if (!_PyUnicode_WSTR(unicode)) { 3829 PyErr_NoMemory(); 3830 return NULL; 3831 } 3832 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3833 3834 w = _PyUnicode_WSTR(unicode); 3835 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3836 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3837 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3838 if (*four_bytes > 0xFFFF) { 3839 assert(*four_bytes <= MAX_UNICODE); 3840 /* encode surrogate pair in this case */ 3841 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3842 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3843 } 3844 else 3845 *w = *four_bytes; 3846 3847 if (w > wchar_end) { 3848 assert(0 && "Miscalculated string end"); 3849 } 3850 } 3851 *w = 0; 3852#else 3853 /* sizeof(wchar_t) == 4 */ 3854 Py_FatalError("Impossible unicode object state, wstr and str " 3855 "should share memory already."); 3856 return NULL; 3857#endif 3858 } 3859 else { 3860 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3861 (_PyUnicode_LENGTH(unicode) + 1)); 3862 if (!_PyUnicode_WSTR(unicode)) { 3863 PyErr_NoMemory(); 3864 return NULL; 3865 } 
3866 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3867 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3868 w = _PyUnicode_WSTR(unicode); 3869 wchar_end = w + _PyUnicode_LENGTH(unicode); 3870 3871 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3872 one_byte = PyUnicode_1BYTE_DATA(unicode); 3873 for (; w < wchar_end; ++one_byte, ++w) 3874 *w = *one_byte; 3875 /* null-terminate the wstr */ 3876 *w = 0; 3877 } 3878 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3879#if SIZEOF_WCHAR_T == 4 3880 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3881 for (; w < wchar_end; ++two_bytes, ++w) 3882 *w = *two_bytes; 3883 /* null-terminate the wstr */ 3884 *w = 0; 3885#else 3886 /* sizeof(wchar_t) == 2 */ 3887 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3888 _PyUnicode_WSTR(unicode) = NULL; 3889 Py_FatalError("Impossible unicode object state, wstr " 3890 "and str should share memory already."); 3891 return NULL; 3892#endif 3893 } 3894 else { 3895 assert(0 && "This should never happen."); 3896 } 3897 } 3898 } 3899 if (size != NULL) 3900 *size = PyUnicode_WSTR_LENGTH(unicode); 3901 return _PyUnicode_WSTR(unicode); 3902} 3903 3904Py_UNICODE * 3905PyUnicode_AsUnicode(PyObject *unicode) 3906{ 3907 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3908} 3909 3910 3911Py_ssize_t 3912PyUnicode_GetSize(PyObject *unicode) 3913{ 3914 if (!PyUnicode_Check(unicode)) { 3915 PyErr_BadArgument(); 3916 goto onError; 3917 } 3918 return PyUnicode_GET_SIZE(unicode); 3919 3920 onError: 3921 return -1; 3922} 3923 3924Py_ssize_t 3925PyUnicode_GetLength(PyObject *unicode) 3926{ 3927 if (!PyUnicode_Check(unicode)) { 3928 PyErr_BadArgument(); 3929 return -1; 3930 } 3931 if (PyUnicode_READY(unicode) == -1) 3932 return -1; 3933 return PyUnicode_GET_LENGTH(unicode); 3934} 3935 3936Py_UCS4 3937PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3938{ 3939 void *data; 3940 int kind; 3941 3942 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3943 
PyErr_BadArgument(); 3944 return (Py_UCS4)-1; 3945 } 3946 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3947 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3948 return (Py_UCS4)-1; 3949 } 3950 data = PyUnicode_DATA(unicode); 3951 kind = PyUnicode_KIND(unicode); 3952 return PyUnicode_READ(kind, data, index); 3953} 3954 3955int 3956PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3957{ 3958 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3959 PyErr_BadArgument(); 3960 return -1; 3961 } 3962 assert(PyUnicode_IS_READY(unicode)); 3963 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3964 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3965 return -1; 3966 } 3967 if (unicode_check_modifiable(unicode)) 3968 return -1; 3969 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 3970 PyErr_SetString(PyExc_ValueError, "character out of range"); 3971 return -1; 3972 } 3973 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3974 index, ch); 3975 return 0; 3976} 3977 3978const char * 3979PyUnicode_GetDefaultEncoding(void) 3980{ 3981 return "utf-8"; 3982} 3983 3984/* create or adjust a UnicodeDecodeError */ 3985static void 3986make_decode_exception(PyObject **exceptionObject, 3987 const char *encoding, 3988 const char *input, Py_ssize_t length, 3989 Py_ssize_t startpos, Py_ssize_t endpos, 3990 const char *reason) 3991{ 3992 if (*exceptionObject == NULL) { 3993 *exceptionObject = PyUnicodeDecodeError_Create( 3994 encoding, input, length, startpos, endpos, reason); 3995 } 3996 else { 3997 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3998 goto onError; 3999 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 4000 goto onError; 4001 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4002 goto onError; 4003 } 4004 return; 4005 4006onError: 4007 Py_DECREF(*exceptionObject); 4008 *exceptionObject = NULL; 4009} 4010 4011#ifdef HAVE_MBCS 4012/* error 
handling callback helper: 4013 build arguments, call the callback and check the arguments, 4014 if no exception occurred, copy the replacement to the output 4015 and adjust various state variables. 4016 return 0 on success, -1 on error 4017*/ 4018 4019static int 4020unicode_decode_call_errorhandler_wchar( 4021 const char *errors, PyObject **errorHandler, 4022 const char *encoding, const char *reason, 4023 const char **input, const char **inend, Py_ssize_t *startinpos, 4024 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4025 PyObject **output, Py_ssize_t *outpos) 4026{ 4027 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4028 4029 PyObject *restuple = NULL; 4030 PyObject *repunicode = NULL; 4031 Py_ssize_t outsize; 4032 Py_ssize_t insize; 4033 Py_ssize_t requiredsize; 4034 Py_ssize_t newpos; 4035 PyObject *inputobj = NULL; 4036 wchar_t *repwstr; 4037 Py_ssize_t repwlen; 4038 4039 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND); 4040 outsize = _PyUnicode_WSTR_LENGTH(*output); 4041 4042 if (*errorHandler == NULL) { 4043 *errorHandler = PyCodec_LookupError(errors); 4044 if (*errorHandler == NULL) 4045 goto onError; 4046 } 4047 4048 make_decode_exception(exceptionObject, 4049 encoding, 4050 *input, *inend - *input, 4051 *startinpos, *endinpos, 4052 reason); 4053 if (*exceptionObject == NULL) 4054 goto onError; 4055 4056 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4057 if (restuple == NULL) 4058 goto onError; 4059 if (!PyTuple_Check(restuple)) { 4060 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4061 goto onError; 4062 } 4063 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4064 goto onError; 4065 4066 /* Copy back the bytes variables, which might have been modified by the 4067 callback */ 4068 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4069 if (!inputobj) 4070 goto onError; 4071 if (!PyBytes_Check(inputobj)) { 4072 
PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4073 } 4074 *input = PyBytes_AS_STRING(inputobj); 4075 insize = PyBytes_GET_SIZE(inputobj); 4076 *inend = *input + insize; 4077 /* we can DECREF safely, as the exception has another reference, 4078 so the object won't go away. */ 4079 Py_DECREF(inputobj); 4080 4081 if (newpos<0) 4082 newpos = insize+newpos; 4083 if (newpos<0 || newpos>insize) { 4084 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4085 goto onError; 4086 } 4087 4088 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4089 if (repwstr == NULL) 4090 goto onError; 4091 /* need more space? (at least enough for what we 4092 have+the replacement+the rest of the string (starting 4093 at the new input position), so we won't have to check space 4094 when there are no errors in the rest of the string) */ 4095 requiredsize = *outpos + repwlen + insize-newpos; 4096 if (requiredsize > outsize) { 4097 if (requiredsize < 2*outsize) 4098 requiredsize = 2*outsize; 4099 if (unicode_resize(output, requiredsize) < 0) 4100 goto onError; 4101 } 4102 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4103 *outpos += repwlen; 4104 4105 *endinpos = newpos; 4106 *inptr = *input + newpos; 4107 4108 /* we made it! 
*/ 4109 Py_XDECREF(restuple); 4110 return 0; 4111 4112 onError: 4113 Py_XDECREF(restuple); 4114 return -1; 4115} 4116#endif /* HAVE_MBCS */ 4117 4118static int 4119unicode_decode_call_errorhandler_writer( 4120 const char *errors, PyObject **errorHandler, 4121 const char *encoding, const char *reason, 4122 const char **input, const char **inend, Py_ssize_t *startinpos, 4123 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4124 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4125{ 4126 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4127 4128 PyObject *restuple = NULL; 4129 PyObject *repunicode = NULL; 4130 Py_ssize_t insize; 4131 Py_ssize_t newpos; 4132 Py_ssize_t replen; 4133 PyObject *inputobj = NULL; 4134 4135 if (*errorHandler == NULL) { 4136 *errorHandler = PyCodec_LookupError(errors); 4137 if (*errorHandler == NULL) 4138 goto onError; 4139 } 4140 4141 make_decode_exception(exceptionObject, 4142 encoding, 4143 *input, *inend - *input, 4144 *startinpos, *endinpos, 4145 reason); 4146 if (*exceptionObject == NULL) 4147 goto onError; 4148 4149 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4150 if (restuple == NULL) 4151 goto onError; 4152 if (!PyTuple_Check(restuple)) { 4153 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4154 goto onError; 4155 } 4156 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4157 goto onError; 4158 4159 /* Copy back the bytes variables, which might have been modified by the 4160 callback */ 4161 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4162 if (!inputobj) 4163 goto onError; 4164 if (!PyBytes_Check(inputobj)) { 4165 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4166 } 4167 *input = PyBytes_AS_STRING(inputobj); 4168 insize = PyBytes_GET_SIZE(inputobj); 4169 *inend = *input + insize; 4170 /* we can DECREF safely, as the exception has another reference, 
4171 so the object won't go away. */ 4172 Py_DECREF(inputobj); 4173 4174 if (newpos<0) 4175 newpos = insize+newpos; 4176 if (newpos<0 || newpos>insize) { 4177 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4178 goto onError; 4179 } 4180 4181 if (PyUnicode_READY(repunicode) < 0) 4182 goto onError; 4183 replen = PyUnicode_GET_LENGTH(repunicode); 4184 writer->min_length += replen; 4185 if (replen > 1) 4186 writer->overallocate = 1; 4187 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4188 goto onError; 4189 4190 *endinpos = newpos; 4191 *inptr = *input + newpos; 4192 4193 /* we made it! */ 4194 Py_XDECREF(restuple); 4195 return 0; 4196 4197 onError: 4198 Py_XDECREF(restuple); 4199 return -1; 4200} 4201 4202/* --- UTF-7 Codec -------------------------------------------------------- */ 4203 4204/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4205 4206/* Three simple macros defining base-64. */ 4207 4208/* Is c a base-64 character? */ 4209 4210#define IS_BASE64(c) \ 4211 (((c) >= 'A' && (c) <= 'Z') || \ 4212 ((c) >= 'a' && (c) <= 'z') || \ 4213 ((c) >= '0' && (c) <= '9') || \ 4214 (c) == '+' || (c) == '/') 4215 4216/* given that c is a base-64 character, what is its base-64 value? */ 4217 4218#define FROM_BASE64(c) \ 4219 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4220 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4221 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4222 (c) == '+' ? 62 : 63) 4223 4224/* What is the base-64 character of the bottom 6 bits of n? */ 4225 4226#define TO_BASE64(n) \ 4227 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4228 4229/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4230 * decoded as itself. We are permissive on decoding; the only ASCII 4231 * byte not decoding to itself is the + which begins a base64 4232 * string. 
*/ 4233 4234#define DECODE_DIRECT(c) \ 4235 ((c) <= 127 && (c) != '+') 4236 4237/* The UTF-7 encoder treats ASCII characters differently according to 4238 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4239 * the above). See RFC2152. This array identifies these different 4240 * sets: 4241 * 0 : "Set D" 4242 * alphanumeric and '(),-./:? 4243 * 1 : "Set O" 4244 * !"#$%&*;<=>@[]^_`{|} 4245 * 2 : "whitespace" 4246 * ht nl cr sp 4247 * 3 : special (must be base64 encoded) 4248 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 4249 */ 4250 4251static 4252char utf7_category[128] = { 4253/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 4254 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4255/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 4256 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4257/* sp ! " # $ % & ' ( ) * + , - . / */ 4258 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 4259/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 4260 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4261/* @ A B C D E F G H I J K L M N O */ 4262 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4263/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4264 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4265/* ` a b c d e f g h i j k l m n o */ 4266 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4267/* p q r s t u v w x y z { | } ~ del */ 4268 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4269}; 4270 4271/* ENCODE_DIRECT: this character should be encoded as itself. The 4272 * answer depends on whether we are encoding set O as itself, and also 4273 * on whether we are encoding whitespace as itself. RFC2152 makes it 4274 * clear that the answers to these questions vary between 4275 * applications, so this code needs to be flexible. 
*/ 4276 4277#define ENCODE_DIRECT(c, directO, directWS) \ 4278 ((c) < 128 && (c) > 0 && \ 4279 ((utf7_category[(c)] == 0) || \ 4280 (directWS && (utf7_category[(c)] == 2)) || \ 4281 (directO && (utf7_category[(c)] == 1)))) 4282 4283PyObject * 4284PyUnicode_DecodeUTF7(const char *s, 4285 Py_ssize_t size, 4286 const char *errors) 4287{ 4288 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 4289} 4290 4291/* The decoder. The only state we preserve is our read position, 4292 * i.e. how many characters we have consumed. So if we end in the 4293 * middle of a shift sequence we have to back off the read position 4294 * and the output to the beginning of the sequence, otherwise we lose 4295 * all the shift state (seen bits, number of bits seen, high 4296 * surrogate). */ 4297 4298PyObject * 4299PyUnicode_DecodeUTF7Stateful(const char *s, 4300 Py_ssize_t size, 4301 const char *errors, 4302 Py_ssize_t *consumed) 4303{ 4304 const char *starts = s; 4305 Py_ssize_t startinpos; 4306 Py_ssize_t endinpos; 4307 const char *e; 4308 _PyUnicodeWriter writer; 4309 const char *errmsg = ""; 4310 int inShift = 0; 4311 Py_ssize_t shiftOutStart; 4312 unsigned int base64bits = 0; 4313 unsigned long base64buffer = 0; 4314 Py_UCS4 surrogate = 0; 4315 PyObject *errorHandler = NULL; 4316 PyObject *exc = NULL; 4317 4318 if (size == 0) { 4319 if (consumed) 4320 *consumed = 0; 4321 _Py_RETURN_UNICODE_EMPTY(); 4322 } 4323 4324 /* Start off assuming it's all ASCII. Widen later as necessary. 
*/ 4325 _PyUnicodeWriter_Init(&writer); 4326 writer.min_length = size; 4327 4328 shiftOutStart = 0; 4329 e = s + size; 4330 4331 while (s < e) { 4332 Py_UCS4 ch; 4333 restart: 4334 ch = (unsigned char) *s; 4335 4336 if (inShift) { /* in a base-64 section */ 4337 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4338 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4339 base64bits += 6; 4340 s++; 4341 if (base64bits >= 16) { 4342 /* we have enough bits for a UTF-16 value */ 4343 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4344 base64bits -= 16; 4345 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4346 if (surrogate) { 4347 /* expecting a second surrogate */ 4348 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4349 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4350 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0) 4351 goto onError; 4352 surrogate = 0; 4353 continue; 4354 } 4355 else { 4356 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) 4357 goto onError; 4358 surrogate = 0; 4359 } 4360 } 4361 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4362 /* first surrogate */ 4363 surrogate = outCh; 4364 } 4365 else { 4366 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0) 4367 goto onError; 4368 } 4369 } 4370 } 4371 else { /* now leaving a base-64 section */ 4372 inShift = 0; 4373 s++; 4374 if (surrogate) { 4375 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) 4376 goto onError; 4377 surrogate = 0; 4378 } 4379 if (base64bits > 0) { /* left-over bits */ 4380 if (base64bits >= 6) { 4381 /* We've seen at least one base-64 character */ 4382 errmsg = "partial character in shift sequence"; 4383 goto utf7Error; 4384 } 4385 else { 4386 /* Some bits remain; they should be zero */ 4387 if (base64buffer != 0) { 4388 errmsg = "non-zero padding bits in shift sequence"; 4389 goto utf7Error; 4390 } 4391 } 4392 } 4393 if (ch != '-') { 4394 /* '-' is absorbed; other terminating 4395 characters are preserved */ 
4396 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4397 goto onError; 4398 } 4399 } 4400 } 4401 else if ( ch == '+' ) { 4402 startinpos = s-starts; 4403 s++; /* consume '+' */ 4404 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4405 s++; 4406 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0) 4407 goto onError; 4408 } 4409 else { /* begin base64-encoded section */ 4410 inShift = 1; 4411 shiftOutStart = writer.pos; 4412 base64bits = 0; 4413 } 4414 } 4415 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4416 s++; 4417 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4418 goto onError; 4419 } 4420 else { 4421 startinpos = s-starts; 4422 s++; 4423 errmsg = "unexpected special character"; 4424 goto utf7Error; 4425 } 4426 continue; 4427utf7Error: 4428 endinpos = s-starts; 4429 if (unicode_decode_call_errorhandler_writer( 4430 errors, &errorHandler, 4431 "utf7", errmsg, 4432 &starts, &e, &startinpos, &endinpos, &exc, &s, 4433 &writer)) 4434 goto onError; 4435 } 4436 4437 /* end of string */ 4438 4439 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4440 /* if we're in an inconsistent state, that's an error */ 4441 if (surrogate || 4442 (base64bits >= 6) || 4443 (base64bits > 0 && base64buffer != 0)) { 4444 endinpos = size; 4445 if (unicode_decode_call_errorhandler_writer( 4446 errors, &errorHandler, 4447 "utf7", "unterminated shift sequence", 4448 &starts, &e, &startinpos, &endinpos, &exc, &s, 4449 &writer)) 4450 goto onError; 4451 if (s < e) 4452 goto restart; 4453 } 4454 } 4455 4456 /* return state */ 4457 if (consumed) { 4458 if (inShift) { 4459 writer.pos = shiftOutStart; /* back off output */ 4460 *consumed = startinpos; 4461 } 4462 else { 4463 *consumed = s-starts; 4464 } 4465 } 4466 4467 Py_XDECREF(errorHandler); 4468 Py_XDECREF(exc); 4469 return _PyUnicodeWriter_Finish(&writer); 4470 4471 onError: 4472 Py_XDECREF(errorHandler); 4473 Py_XDECREF(exc); 4474 _PyUnicodeWriter_Dealloc(&writer); 4475 return NULL; 
4476} 4477 4478 4479PyObject * 4480_PyUnicode_EncodeUTF7(PyObject *str, 4481 int base64SetO, 4482 int base64WhiteSpace, 4483 const char *errors) 4484{ 4485 int kind; 4486 void *data; 4487 Py_ssize_t len; 4488 PyObject *v; 4489 int inShift = 0; 4490 Py_ssize_t i; 4491 unsigned int base64bits = 0; 4492 unsigned long base64buffer = 0; 4493 char * out; 4494 char * start; 4495 4496 if (PyUnicode_READY(str) == -1) 4497 return NULL; 4498 kind = PyUnicode_KIND(str); 4499 data = PyUnicode_DATA(str); 4500 len = PyUnicode_GET_LENGTH(str); 4501 4502 if (len == 0) 4503 return PyBytes_FromStringAndSize(NULL, 0); 4504 4505 /* It might be possible to tighten this worst case */ 4506 if (len > PY_SSIZE_T_MAX / 8) 4507 return PyErr_NoMemory(); 4508 v = PyBytes_FromStringAndSize(NULL, len * 8); 4509 if (v == NULL) 4510 return NULL; 4511 4512 start = out = PyBytes_AS_STRING(v); 4513 for (i = 0; i < len; ++i) { 4514 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4515 4516 if (inShift) { 4517 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4518 /* shifting out */ 4519 if (base64bits) { /* output remaining bits */ 4520 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4521 base64buffer = 0; 4522 base64bits = 0; 4523 } 4524 inShift = 0; 4525 /* Characters not in the BASE64 set implicitly unshift the sequence 4526 so no '-' is required, except if the character is itself a '-' */ 4527 if (IS_BASE64(ch) || ch == '-') { 4528 *out++ = '-'; 4529 } 4530 *out++ = (char) ch; 4531 } 4532 else { 4533 goto encode_char; 4534 } 4535 } 4536 else { /* not in a shift sequence */ 4537 if (ch == '+') { 4538 *out++ = '+'; 4539 *out++ = '-'; 4540 } 4541 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4542 *out++ = (char) ch; 4543 } 4544 else { 4545 *out++ = '+'; 4546 inShift = 1; 4547 goto encode_char; 4548 } 4549 } 4550 continue; 4551encode_char: 4552 if (ch >= 0x10000) { 4553 assert(ch <= MAX_UNICODE); 4554 4555 /* code first surrogate */ 4556 base64bits += 16; 4557 base64buffer = 
(base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch); 4558 while (base64bits >= 6) { 4559 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4560 base64bits -= 6; 4561 } 4562 /* prepare second surrogate */ 4563 ch = Py_UNICODE_LOW_SURROGATE(ch); 4564 } 4565 base64bits += 16; 4566 base64buffer = (base64buffer << 16) | ch; 4567 while (base64bits >= 6) { 4568 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4569 base64bits -= 6; 4570 } 4571 } 4572 if (base64bits) 4573 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4574 if (inShift) 4575 *out++ = '-'; 4576 if (_PyBytes_Resize(&v, out - start) < 0) 4577 return NULL; 4578 return v; 4579} 4580PyObject * 4581PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4582 Py_ssize_t size, 4583 int base64SetO, 4584 int base64WhiteSpace, 4585 const char *errors) 4586{ 4587 PyObject *result; 4588 PyObject *tmp = PyUnicode_FromUnicode(s, size); 4589 if (tmp == NULL) 4590 return NULL; 4591 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4592 base64WhiteSpace, errors); 4593 Py_DECREF(tmp); 4594 return result; 4595} 4596 4597#undef IS_BASE64 4598#undef FROM_BASE64 4599#undef TO_BASE64 4600#undef DECODE_DIRECT 4601#undef ENCODE_DIRECT 4602 4603/* --- UTF-8 Codec -------------------------------------------------------- */ 4604 4605PyObject * 4606PyUnicode_DecodeUTF8(const char *s, 4607 Py_ssize_t size, 4608 const char *errors) 4609{ 4610 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4611} 4612 4613#include "stringlib/asciilib.h" 4614#include "stringlib/codecs.h" 4615#include "stringlib/undef.h" 4616 4617#include "stringlib/ucs1lib.h" 4618#include "stringlib/codecs.h" 4619#include "stringlib/undef.h" 4620 4621#include "stringlib/ucs2lib.h" 4622#include "stringlib/codecs.h" 4623#include "stringlib/undef.h" 4624 4625#include "stringlib/ucs4lib.h" 4626#include "stringlib/codecs.h" 4627#include "stringlib/undef.h" 4628 4629/* Mask to quickly check whether a C 'long' contains a 4630 non-ASCII, UTF8-encoded char. 
*/ 4631#if (SIZEOF_LONG == 8) 4632# define ASCII_CHAR_MASK 0x8080808080808080UL 4633#elif (SIZEOF_LONG == 4) 4634# define ASCII_CHAR_MASK 0x80808080UL 4635#else 4636# error C 'long' size should be either 4 or 8! 4637#endif 4638 4639static Py_ssize_t 4640ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 4641{ 4642 const char *p = start; 4643 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); 4644 4645 /* 4646 * Issue #17237: m68k is a bit different from most architectures in 4647 * that objects do not use "natural alignment" - for example, int and 4648 * long are only aligned at 2-byte boundaries. Therefore the assert() 4649 * won't work; also, tests have shown that skipping the "optimised 4650 * version" will even speed up m68k. 4651 */ 4652#if !defined(__m68k__) 4653#if SIZEOF_LONG <= SIZEOF_VOID_P 4654 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); 4655 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4656 /* Fast path, see in STRINGLIB(utf8_decode) for 4657 an explanation. */ 4658 /* Help register allocation */ 4659 register const char *_p = p; 4660 register Py_UCS1 * q = dest; 4661 while (_p < aligned_end) { 4662 unsigned long value = *(const unsigned long *) _p; 4663 if (value & ASCII_CHAR_MASK) 4664 break; 4665 *((unsigned long *)q) = value; 4666 _p += SIZEOF_LONG; 4667 q += SIZEOF_LONG; 4668 } 4669 p = _p; 4670 while (p < end) { 4671 if ((unsigned char)*p & 0x80) 4672 break; 4673 *q++ = *p++; 4674 } 4675 return p - start; 4676 } 4677#endif 4678#endif 4679 while (p < end) { 4680 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4681 for an explanation. 
*/ 4682 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4683 /* Help register allocation */ 4684 register const char *_p = p; 4685 while (_p < aligned_end) { 4686 unsigned long value = *(unsigned long *) _p; 4687 if (value & ASCII_CHAR_MASK) 4688 break; 4689 _p += SIZEOF_LONG; 4690 } 4691 p = _p; 4692 if (_p == end) 4693 break; 4694 } 4695 if ((unsigned char)*p & 0x80) 4696 break; 4697 ++p; 4698 } 4699 memcpy(dest, start, p - start); 4700 return p - start; 4701} 4702 4703PyObject * 4704PyUnicode_DecodeUTF8Stateful(const char *s, 4705 Py_ssize_t size, 4706 const char *errors, 4707 Py_ssize_t *consumed) 4708{ 4709 _PyUnicodeWriter writer; 4710 const char *starts = s; 4711 const char *end = s + size; 4712 4713 Py_ssize_t startinpos; 4714 Py_ssize_t endinpos; 4715 const char *errmsg = ""; 4716 PyObject *errorHandler = NULL; 4717 PyObject *exc = NULL; 4718 4719 if (size == 0) { 4720 if (consumed) 4721 *consumed = 0; 4722 _Py_RETURN_UNICODE_EMPTY(); 4723 } 4724 4725 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4726 if (size == 1 && (unsigned char)s[0] < 128) { 4727 if (consumed) 4728 *consumed = 1; 4729 return get_latin1_char((unsigned char)s[0]); 4730 } 4731 4732 _PyUnicodeWriter_Init(&writer); 4733 writer.min_length = size; 4734 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 4735 goto onError; 4736 4737 writer.pos = ascii_decode(s, end, writer.data); 4738 s += writer.pos; 4739 while (s < end) { 4740 Py_UCS4 ch; 4741 int kind = writer.kind; 4742 if (kind == PyUnicode_1BYTE_KIND) { 4743 if (PyUnicode_IS_ASCII(writer.buffer)) 4744 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); 4745 else 4746 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos); 4747 } else if (kind == PyUnicode_2BYTE_KIND) { 4748 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos); 4749 } else { 4750 assert(kind == PyUnicode_4BYTE_KIND); 4751 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos); 4752 } 4753 4754 switch (ch) { 4755 case 0: 4756 if (s == end || consumed) 4757 goto End; 4758 errmsg = "unexpected end of data"; 4759 startinpos = s - starts; 4760 endinpos = end - starts; 4761 break; 4762 case 1: 4763 errmsg = "invalid start byte"; 4764 startinpos = s - starts; 4765 endinpos = startinpos + 1; 4766 break; 4767 case 2: 4768 case 3: 4769 case 4: 4770 errmsg = "invalid continuation byte"; 4771 startinpos = s - starts; 4772 endinpos = startinpos + ch - 1; 4773 break; 4774 default: 4775 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4776 goto onError; 4777 continue; 4778 } 4779 4780 if (unicode_decode_call_errorhandler_writer( 4781 errors, &errorHandler, 4782 "utf-8", errmsg, 4783 &starts, &end, &startinpos, &endinpos, &exc, &s, 4784 &writer)) 4785 goto onError; 4786 } 4787 4788End: 4789 if (consumed) 4790 *consumed = s - starts; 4791 4792 Py_XDECREF(errorHandler); 4793 Py_XDECREF(exc); 4794 return _PyUnicodeWriter_Finish(&writer); 4795 4796onError: 4797 Py_XDECREF(errorHandler); 4798 Py_XDECREF(exc); 4799 
_PyUnicodeWriter_Dealloc(&writer); 4800 return NULL; 4801} 4802 4803#ifdef __APPLE__ 4804 4805/* Simplified UTF-8 decoder using surrogateescape error handler, 4806 used to decode the command line arguments on Mac OS X. 4807 4808 Return a pointer to a newly allocated wide character string (use 4809 PyMem_Free() to free the memory), or NULL on memory allocation error. */ 4810 4811wchar_t* 4812_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4813{ 4814 const char *e; 4815 wchar_t *unicode; 4816 Py_ssize_t outpos; 4817 4818 /* Note: size will always be longer than the resulting Unicode 4819 character count */ 4820 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) 4821 return NULL; 4822 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4823 if (!unicode) 4824 return NULL; 4825 4826 /* Unpack UTF-8 encoded data */ 4827 e = s + size; 4828 outpos = 0; 4829 while (s < e) { 4830 Py_UCS4 ch; 4831#if SIZEOF_WCHAR_T == 4 4832 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4833#else 4834 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 4835#endif 4836 if (ch > 0xFF) { 4837#if SIZEOF_WCHAR_T == 4 4838 assert(0); 4839#else 4840 assert(Py_UNICODE_IS_SURROGATE(ch)); 4841 /* compute and append the two surrogates: */ 4842 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 4843 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 4844#endif 4845 } 4846 else { 4847 if (!ch && s == e) 4848 break; 4849 /* surrogateescape */ 4850 unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 4851 } 4852 } 4853 unicode[outpos] = L'\0'; 4854 return unicode; 4855} 4856 4857#endif /* __APPLE__ */ 4858 4859/* Primary internal function which creates utf8 encoded bytes objects. 4860 4861 Allocation strategy: if the string is short, convert into a stack buffer 4862 and allocate exactly as much space needed at the end. Else allocate the 4863 maximum possible needed (4 result bytes per Unicode character), and return 4864 the excess memory at the end. 
4865*/ 4866PyObject * 4867_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4868{ 4869 enum PyUnicode_Kind kind; 4870 void *data; 4871 Py_ssize_t size; 4872 4873 if (!PyUnicode_Check(unicode)) { 4874 PyErr_BadArgument(); 4875 return NULL; 4876 } 4877 4878 if (PyUnicode_READY(unicode) == -1) 4879 return NULL; 4880 4881 if (PyUnicode_UTF8(unicode)) 4882 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4883 PyUnicode_UTF8_LENGTH(unicode)); 4884 4885 kind = PyUnicode_KIND(unicode); 4886 data = PyUnicode_DATA(unicode); 4887 size = PyUnicode_GET_LENGTH(unicode); 4888 4889 switch (kind) { 4890 default: 4891 assert(0); 4892 case PyUnicode_1BYTE_KIND: 4893 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4894 assert(!PyUnicode_IS_ASCII(unicode)); 4895 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4896 case PyUnicode_2BYTE_KIND: 4897 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4898 case PyUnicode_4BYTE_KIND: 4899 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4900 } 4901} 4902 4903PyObject * 4904PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4905 Py_ssize_t size, 4906 const char *errors) 4907{ 4908 PyObject *v, *unicode; 4909 4910 unicode = PyUnicode_FromUnicode(s, size); 4911 if (unicode == NULL) 4912 return NULL; 4913 v = _PyUnicode_AsUTF8String(unicode, errors); 4914 Py_DECREF(unicode); 4915 return v; 4916} 4917 4918PyObject * 4919PyUnicode_AsUTF8String(PyObject *unicode) 4920{ 4921 return _PyUnicode_AsUTF8String(unicode, NULL); 4922} 4923 4924/* --- UTF-32 Codec ------------------------------------------------------- */ 4925 4926PyObject * 4927PyUnicode_DecodeUTF32(const char *s, 4928 Py_ssize_t size, 4929 const char *errors, 4930 int *byteorder) 4931{ 4932 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4933} 4934 4935PyObject * 4936PyUnicode_DecodeUTF32Stateful(const char *s, 4937 Py_ssize_t size, 4938 const char *errors, 4939 int *byteorder, 4940 Py_ssize_t *consumed) 4941{ 
4942 const char *starts = s; 4943 Py_ssize_t startinpos; 4944 Py_ssize_t endinpos; 4945 _PyUnicodeWriter writer; 4946 const unsigned char *q, *e; 4947 int le, bo = 0; /* assume native ordering by default */ 4948 const char *errmsg = ""; 4949 PyObject *errorHandler = NULL; 4950 PyObject *exc = NULL; 4951 4952 q = (unsigned char *)s; 4953 e = q + size; 4954 4955 if (byteorder) 4956 bo = *byteorder; 4957 4958 /* Check for BOM marks (U+FEFF) in the input and adjust current 4959 byte order setting accordingly. In native mode, the leading BOM 4960 mark is skipped, in all other modes, it is copied to the output 4961 stream as-is (giving a ZWNBSP character). */ 4962 if (bo == 0 && size >= 4) { 4963 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 4964 if (bom == 0x0000FEFF) { 4965 bo = -1; 4966 q += 4; 4967 } 4968 else if (bom == 0xFFFE0000) { 4969 bo = 1; 4970 q += 4; 4971 } 4972 if (byteorder) 4973 *byteorder = bo; 4974 } 4975 4976 if (q == e) { 4977 if (consumed) 4978 *consumed = size; 4979 _Py_RETURN_UNICODE_EMPTY(); 4980 } 4981 4982#ifdef WORDS_BIGENDIAN 4983 le = bo < 0; 4984#else 4985 le = bo <= 0; 4986#endif 4987 4988 _PyUnicodeWriter_Init(&writer); 4989 writer.min_length = (e - q + 3) / 4; 4990 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 4991 goto onError; 4992 4993 while (1) { 4994 Py_UCS4 ch = 0; 4995 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 4996 4997 if (e - q >= 4) { 4998 enum PyUnicode_Kind kind = writer.kind; 4999 void *data = writer.data; 5000 const unsigned char *last = e - 4; 5001 Py_ssize_t pos = writer.pos; 5002 if (le) { 5003 do { 5004 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5005 if (ch > maxch) 5006 break; 5007 PyUnicode_WRITE(kind, data, pos++, ch); 5008 q += 4; 5009 } while (q <= last); 5010 } 5011 else { 5012 do { 5013 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 5014 if (ch > maxch) 5015 break; 5016 PyUnicode_WRITE(kind, data, pos++, ch); 5017 q += 4; 5018 } while (q 
<= last); 5019 } 5020 writer.pos = pos; 5021 } 5022 5023 if (ch <= maxch) { 5024 if (q == e || consumed) 5025 break; 5026 /* remaining bytes at the end? (size should be divisible by 4) */ 5027 errmsg = "truncated data"; 5028 startinpos = ((const char *)q) - starts; 5029 endinpos = ((const char *)e) - starts; 5030 } 5031 else { 5032 if (ch < 0x110000) { 5033 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5034 goto onError; 5035 q += 4; 5036 continue; 5037 } 5038 errmsg = "codepoint not in range(0x110000)"; 5039 startinpos = ((const char *)q) - starts; 5040 endinpos = startinpos + 4; 5041 } 5042 5043 /* The remaining input chars are ignored if the callback 5044 chooses to skip the input */ 5045 if (unicode_decode_call_errorhandler_writer( 5046 errors, &errorHandler, 5047 "utf32", errmsg, 5048 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5049 &writer)) 5050 goto onError; 5051 } 5052 5053 if (consumed) 5054 *consumed = (const char *)q-starts; 5055 5056 Py_XDECREF(errorHandler); 5057 Py_XDECREF(exc); 5058 return _PyUnicodeWriter_Finish(&writer); 5059 5060 onError: 5061 _PyUnicodeWriter_Dealloc(&writer); 5062 Py_XDECREF(errorHandler); 5063 Py_XDECREF(exc); 5064 return NULL; 5065} 5066 5067PyObject * 5068_PyUnicode_EncodeUTF32(PyObject *str, 5069 const char *errors, 5070 int byteorder) 5071{ 5072 int kind; 5073 void *data; 5074 Py_ssize_t len; 5075 PyObject *v; 5076 unsigned char *p; 5077 Py_ssize_t nsize, i; 5078 /* Offsets from p for storing byte pairs in the right order. 
*/ 5079#if PY_LITTLE_ENDIAN 5080 int iorder[] = {0, 1, 2, 3}; 5081#else 5082 int iorder[] = {3, 2, 1, 0}; 5083#endif 5084 5085#define STORECHAR(CH) \ 5086 do { \ 5087 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5088 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5089 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5090 p[iorder[0]] = (CH) & 0xff; \ 5091 p += 4; \ 5092 } while(0) 5093 5094 if (!PyUnicode_Check(str)) { 5095 PyErr_BadArgument(); 5096 return NULL; 5097 } 5098 if (PyUnicode_READY(str) == -1) 5099 return NULL; 5100 kind = PyUnicode_KIND(str); 5101 data = PyUnicode_DATA(str); 5102 len = PyUnicode_GET_LENGTH(str); 5103 5104 nsize = len + (byteorder == 0); 5105 if (nsize > PY_SSIZE_T_MAX / 4) 5106 return PyErr_NoMemory(); 5107 v = PyBytes_FromStringAndSize(NULL, nsize * 4); 5108 if (v == NULL) 5109 return NULL; 5110 5111 p = (unsigned char *)PyBytes_AS_STRING(v); 5112 if (byteorder == 0) 5113 STORECHAR(0xFEFF); 5114 if (len == 0) 5115 goto done; 5116 5117 if (byteorder == -1) { 5118 /* force LE */ 5119 iorder[0] = 0; 5120 iorder[1] = 1; 5121 iorder[2] = 2; 5122 iorder[3] = 3; 5123 } 5124 else if (byteorder == 1) { 5125 /* force BE */ 5126 iorder[0] = 3; 5127 iorder[1] = 2; 5128 iorder[2] = 1; 5129 iorder[3] = 0; 5130 } 5131 5132 for (i = 0; i < len; i++) 5133 STORECHAR(PyUnicode_READ(kind, data, i)); 5134 5135 done: 5136 return v; 5137#undef STORECHAR 5138} 5139 5140PyObject * 5141PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5142 Py_ssize_t size, 5143 const char *errors, 5144 int byteorder) 5145{ 5146 PyObject *result; 5147 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5148 if (tmp == NULL) 5149 return NULL; 5150 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5151 Py_DECREF(tmp); 5152 return result; 5153} 5154 5155PyObject * 5156PyUnicode_AsUTF32String(PyObject *unicode) 5157{ 5158 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5159} 5160 5161/* --- UTF-16 Codec ------------------------------------------------------- */ 5162 5163PyObject * 5164PyUnicode_DecodeUTF16(const 
char *s, 5165 Py_ssize_t size, 5166 const char *errors, 5167 int *byteorder) 5168{ 5169 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5170} 5171 5172PyObject * 5173PyUnicode_DecodeUTF16Stateful(const char *s, 5174 Py_ssize_t size, 5175 const char *errors, 5176 int *byteorder, 5177 Py_ssize_t *consumed) 5178{ 5179 const char *starts = s; 5180 Py_ssize_t startinpos; 5181 Py_ssize_t endinpos; 5182 _PyUnicodeWriter writer; 5183 const unsigned char *q, *e; 5184 int bo = 0; /* assume native ordering by default */ 5185 int native_ordering; 5186 const char *errmsg = ""; 5187 PyObject *errorHandler = NULL; 5188 PyObject *exc = NULL; 5189 5190 q = (unsigned char *)s; 5191 e = q + size; 5192 5193 if (byteorder) 5194 bo = *byteorder; 5195 5196 /* Check for BOM marks (U+FEFF) in the input and adjust current 5197 byte order setting accordingly. In native mode, the leading BOM 5198 mark is skipped, in all other modes, it is copied to the output 5199 stream as-is (giving a ZWNBSP character). 
*/ 5200 if (bo == 0 && size >= 2) { 5201 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5202 if (bom == 0xFEFF) { 5203 q += 2; 5204 bo = -1; 5205 } 5206 else if (bom == 0xFFFE) { 5207 q += 2; 5208 bo = 1; 5209 } 5210 if (byteorder) 5211 *byteorder = bo; 5212 } 5213 5214 if (q == e) { 5215 if (consumed) 5216 *consumed = size; 5217 _Py_RETURN_UNICODE_EMPTY(); 5218 } 5219 5220#if PY_LITTLE_ENDIAN 5221 native_ordering = bo <= 0; 5222#else 5223 native_ordering = bo >= 0; 5224#endif 5225 5226 /* Note: size will always be longer than the resulting Unicode 5227 character count */ 5228 _PyUnicodeWriter_Init(&writer); 5229 writer.min_length = (e - q + 1) / 2; 5230 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5231 goto onError; 5232 5233 while (1) { 5234 Py_UCS4 ch = 0; 5235 if (e - q >= 2) { 5236 int kind = writer.kind; 5237 if (kind == PyUnicode_1BYTE_KIND) { 5238 if (PyUnicode_IS_ASCII(writer.buffer)) 5239 ch = asciilib_utf16_decode(&q, e, 5240 (Py_UCS1*)writer.data, &writer.pos, 5241 native_ordering); 5242 else 5243 ch = ucs1lib_utf16_decode(&q, e, 5244 (Py_UCS1*)writer.data, &writer.pos, 5245 native_ordering); 5246 } else if (kind == PyUnicode_2BYTE_KIND) { 5247 ch = ucs2lib_utf16_decode(&q, e, 5248 (Py_UCS2*)writer.data, &writer.pos, 5249 native_ordering); 5250 } else { 5251 assert(kind == PyUnicode_4BYTE_KIND); 5252 ch = ucs4lib_utf16_decode(&q, e, 5253 (Py_UCS4*)writer.data, &writer.pos, 5254 native_ordering); 5255 } 5256 } 5257 5258 switch (ch) 5259 { 5260 case 0: 5261 /* remaining byte at the end? 
(size should be even) */ 5262 if (q == e || consumed) 5263 goto End; 5264 errmsg = "truncated data"; 5265 startinpos = ((const char *)q) - starts; 5266 endinpos = ((const char *)e) - starts; 5267 break; 5268 /* The remaining input chars are ignored if the callback 5269 chooses to skip the input */ 5270 case 1: 5271 q -= 2; 5272 if (consumed) 5273 goto End; 5274 errmsg = "unexpected end of data"; 5275 startinpos = ((const char *)q) - starts; 5276 endinpos = ((const char *)e) - starts; 5277 break; 5278 case 2: 5279 errmsg = "illegal encoding"; 5280 startinpos = ((const char *)q) - 2 - starts; 5281 endinpos = startinpos + 2; 5282 break; 5283 case 3: 5284 errmsg = "illegal UTF-16 surrogate"; 5285 startinpos = ((const char *)q) - 4 - starts; 5286 endinpos = startinpos + 2; 5287 break; 5288 default: 5289 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5290 goto onError; 5291 continue; 5292 } 5293 5294 if (unicode_decode_call_errorhandler_writer( 5295 errors, 5296 &errorHandler, 5297 "utf16", errmsg, 5298 &starts, 5299 (const char **)&e, 5300 &startinpos, 5301 &endinpos, 5302 &exc, 5303 (const char **)&q, 5304 &writer)) 5305 goto onError; 5306 } 5307 5308End: 5309 if (consumed) 5310 *consumed = (const char *)q-starts; 5311 5312 Py_XDECREF(errorHandler); 5313 Py_XDECREF(exc); 5314 return _PyUnicodeWriter_Finish(&writer); 5315 5316 onError: 5317 _PyUnicodeWriter_Dealloc(&writer); 5318 Py_XDECREF(errorHandler); 5319 Py_XDECREF(exc); 5320 return NULL; 5321} 5322 5323PyObject * 5324_PyUnicode_EncodeUTF16(PyObject *str, 5325 const char *errors, 5326 int byteorder) 5327{ 5328 enum PyUnicode_Kind kind; 5329 const void *data; 5330 Py_ssize_t len; 5331 PyObject *v; 5332 unsigned short *out; 5333 Py_ssize_t bytesize; 5334 Py_ssize_t pairs; 5335#if PY_BIG_ENDIAN 5336 int native_ordering = byteorder >= 0; 5337#else 5338 int native_ordering = byteorder <= 0; 5339#endif 5340 5341 if (!PyUnicode_Check(str)) { 5342 PyErr_BadArgument(); 5343 return NULL; 5344 } 5345 if 
(PyUnicode_READY(str) == -1) 5346 return NULL; 5347 kind = PyUnicode_KIND(str); 5348 data = PyUnicode_DATA(str); 5349 len = PyUnicode_GET_LENGTH(str); 5350 5351 pairs = 0; 5352 if (kind == PyUnicode_4BYTE_KIND) { 5353 const Py_UCS4 *in = (const Py_UCS4 *)data; 5354 const Py_UCS4 *end = in + len; 5355 while (in < end) 5356 if (*in++ >= 0x10000) 5357 pairs++; 5358 } 5359 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5360 return PyErr_NoMemory(); 5361 bytesize = (len + pairs + (byteorder == 0)) * 2; 5362 v = PyBytes_FromStringAndSize(NULL, bytesize); 5363 if (v == NULL) 5364 return NULL; 5365 5366 /* output buffer is 2-bytes aligned */ 5367 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5368 out = (unsigned short *)PyBytes_AS_STRING(v); 5369 if (byteorder == 0) 5370 *out++ = 0xFEFF; 5371 if (len == 0) 5372 goto done; 5373 5374 switch (kind) { 5375 case PyUnicode_1BYTE_KIND: { 5376 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); 5377 break; 5378 } 5379 case PyUnicode_2BYTE_KIND: { 5380 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); 5381 break; 5382 } 5383 case PyUnicode_4BYTE_KIND: { 5384 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); 5385 break; 5386 } 5387 default: 5388 assert(0); 5389 } 5390 5391 done: 5392 return v; 5393} 5394 5395PyObject * 5396PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5397 Py_ssize_t size, 5398 const char *errors, 5399 int byteorder) 5400{ 5401 PyObject *result; 5402 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5403 if (tmp == NULL) 5404 return NULL; 5405 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5406 Py_DECREF(tmp); 5407 return result; 5408} 5409 5410PyObject * 5411PyUnicode_AsUTF16String(PyObject *unicode) 5412{ 5413 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5414} 5415 5416/* --- Unicode Escape Codec ----------------------------------------------- */ 5417 5418/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5419 if 
all the escapes in the string make it still a valid ASCII string. 5420 Returns -1 if any escapes were found which cause the string to 5421 pop out of ASCII range. Otherwise returns the length of the 5422 required buffer to hold the string. 5423 */ 5424static Py_ssize_t 5425length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5426{ 5427 const unsigned char *p = (const unsigned char *)s; 5428 const unsigned char *end = p + size; 5429 Py_ssize_t length = 0; 5430 5431 if (size < 0) 5432 return -1; 5433 5434 for (; p < end; ++p) { 5435 if (*p > 127) { 5436 /* Non-ASCII */ 5437 return -1; 5438 } 5439 else if (*p != '\\') { 5440 /* Normal character */ 5441 ++length; 5442 } 5443 else { 5444 /* Backslash-escape, check next char */ 5445 ++p; 5446 /* Escape sequence reaches till end of string or 5447 non-ASCII follow-up. */ 5448 if (p >= end || *p > 127) 5449 return -1; 5450 switch (*p) { 5451 case '\n': 5452 /* backslash + \n result in zero characters */ 5453 break; 5454 case '\\': case '\'': case '\"': 5455 case 'b': case 'f': case 't': 5456 case 'n': case 'r': case 'v': case 'a': 5457 ++length; 5458 break; 5459 case '0': case '1': case '2': case '3': 5460 case '4': case '5': case '6': case '7': 5461 case 'x': case 'u': case 'U': case 'N': 5462 /* these do not guarantee ASCII characters */ 5463 return -1; 5464 default: 5465 /* count the backslash + the other character */ 5466 length += 2; 5467 } 5468 } 5469 } 5470 return length; 5471} 5472 5473static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5474 5475PyObject * 5476PyUnicode_DecodeUnicodeEscape(const char *s, 5477 Py_ssize_t size, 5478 const char *errors) 5479{ 5480 const char *starts = s; 5481 Py_ssize_t startinpos; 5482 Py_ssize_t endinpos; 5483 _PyUnicodeWriter writer; 5484 const char *end; 5485 char* message; 5486 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5487 PyObject *errorHandler = NULL; 5488 PyObject *exc = NULL; 5489 Py_ssize_t len; 5490 5491 len = length_of_escaped_ascii_string(s, 
size); 5492 if (len == 0) 5493 _Py_RETURN_UNICODE_EMPTY(); 5494 5495 /* After length_of_escaped_ascii_string() there are two alternatives, 5496 either the string is pure ASCII with named escapes like \n, etc. 5497 and we determined it's exact size (common case) 5498 or it contains \x, \u, ... escape sequences. then we create a 5499 legacy wchar string and resize it at the end of this function. */ 5500 _PyUnicodeWriter_Init(&writer); 5501 if (len > 0) { 5502 writer.min_length = len; 5503 } 5504 else { 5505 /* Escaped strings will always be longer than the resulting 5506 Unicode string, so we start with size here and then reduce the 5507 length after conversion to the true value. 5508 (but if the error callback returns a long replacement string 5509 we'll have to allocate more space) */ 5510 writer.min_length = size; 5511 } 5512 5513 if (size == 0) 5514 return _PyUnicodeWriter_Finish(&writer); 5515 end = s + size; 5516 5517 while (s < end) { 5518 unsigned char c; 5519 Py_UCS4 x; 5520 int digits; 5521 5522 /* Non-escape characters are interpreted as Unicode ordinals */ 5523 if (*s != '\\') { 5524 x = (unsigned char)*s; 5525 s++; 5526 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 5527 goto onError; 5528 continue; 5529 } 5530 5531 startinpos = s-starts; 5532 /* \ - Escapes */ 5533 s++; 5534 c = *s++; 5535 if (s > end) 5536 c = '\0'; /* Invalid after \ */ 5537 5538 switch (c) { 5539 5540 /* \x escapes */ 5541#define WRITECHAR(ch) \ 5542 do { \ 5543 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \ 5544 goto onError; \ 5545 } while(0) 5546 5547 case '\n': break; 5548 case '\\': WRITECHAR('\\'); break; 5549 case '\'': WRITECHAR('\''); break; 5550 case '\"': WRITECHAR('\"'); break; 5551 case 'b': WRITECHAR('\b'); break; 5552 /* FF */ 5553 case 'f': WRITECHAR('\014'); break; 5554 case 't': WRITECHAR('\t'); break; 5555 case 'n': WRITECHAR('\n'); break; 5556 case 'r': WRITECHAR('\r'); break; 5557 /* VT */ 5558 case 'v': WRITECHAR('\013'); break; 5559 /* BEL, 
not classic C */ 5560 case 'a': WRITECHAR('\007'); break; 5561 5562 /* \OOO (octal) escapes */ 5563 case '0': case '1': case '2': case '3': 5564 case '4': case '5': case '6': case '7': 5565 x = s[-1] - '0'; 5566 if (s < end && '0' <= *s && *s <= '7') { 5567 x = (x<<3) + *s++ - '0'; 5568 if (s < end && '0' <= *s && *s <= '7') 5569 x = (x<<3) + *s++ - '0'; 5570 } 5571 WRITECHAR(x); 5572 break; 5573 5574 /* hex escapes */ 5575 /* \xXX */ 5576 case 'x': 5577 digits = 2; 5578 message = "truncated \\xXX escape"; 5579 goto hexescape; 5580 5581 /* \uXXXX */ 5582 case 'u': 5583 digits = 4; 5584 message = "truncated \\uXXXX escape"; 5585 goto hexescape; 5586 5587 /* \UXXXXXXXX */ 5588 case 'U': 5589 digits = 8; 5590 message = "truncated \\UXXXXXXXX escape"; 5591 hexescape: 5592 chr = 0; 5593 if (end - s < digits) { 5594 /* count only hex digits */ 5595 for (; s < end; ++s) { 5596 c = (unsigned char)*s; 5597 if (!Py_ISXDIGIT(c)) 5598 goto error; 5599 } 5600 goto error; 5601 } 5602 for (; digits--; ++s) { 5603 c = (unsigned char)*s; 5604 if (!Py_ISXDIGIT(c)) 5605 goto error; 5606 chr = (chr<<4) & ~0xF; 5607 if (c >= '0' && c <= '9') 5608 chr += c - '0'; 5609 else if (c >= 'a' && c <= 'f') 5610 chr += 10 + c - 'a'; 5611 else 5612 chr += 10 + c - 'A'; 5613 } 5614 if (chr == 0xffffffff && PyErr_Occurred()) 5615 /* _decoding_error will have already written into the 5616 target buffer. 
*/ 5617 break; 5618 store: 5619 /* when we get here, chr is a 32-bit unicode character */ 5620 message = "illegal Unicode character"; 5621 if (chr > MAX_UNICODE) 5622 goto error; 5623 WRITECHAR(chr); 5624 break; 5625 5626 /* \N{name} */ 5627 case 'N': 5628 message = "malformed \\N character escape"; 5629 if (ucnhash_CAPI == NULL) { 5630 /* load the unicode data module */ 5631 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5632 PyUnicodeData_CAPSULE_NAME, 1); 5633 if (ucnhash_CAPI == NULL) 5634 goto ucnhashError; 5635 } 5636 if (*s == '{') { 5637 const char *start = s+1; 5638 /* look for the closing brace */ 5639 while (*s != '}' && s < end) 5640 s++; 5641 if (s > start && s < end && *s == '}') { 5642 /* found a name. look it up in the unicode database */ 5643 message = "unknown Unicode character name"; 5644 s++; 5645 if (s - start - 1 <= INT_MAX && 5646 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5647 &chr, 0)) 5648 goto store; 5649 } 5650 } 5651 goto error; 5652 5653 default: 5654 if (s > end) { 5655 message = "\\ at end of string"; 5656 s--; 5657 goto error; 5658 } 5659 else { 5660 WRITECHAR('\\'); 5661 WRITECHAR((unsigned char)s[-1]); 5662 } 5663 break; 5664 } 5665 continue; 5666 5667 error: 5668 endinpos = s-starts; 5669 if (unicode_decode_call_errorhandler_writer( 5670 errors, &errorHandler, 5671 "unicodeescape", message, 5672 &starts, &end, &startinpos, &endinpos, &exc, &s, 5673 &writer)) 5674 goto onError; 5675 continue; 5676 } 5677#undef WRITECHAR 5678 5679 Py_XDECREF(errorHandler); 5680 Py_XDECREF(exc); 5681 return _PyUnicodeWriter_Finish(&writer); 5682 5683 ucnhashError: 5684 PyErr_SetString( 5685 PyExc_UnicodeError, 5686 "\\N escapes not supported (can't load unicodedata module)" 5687 ); 5688 _PyUnicodeWriter_Dealloc(&writer); 5689 Py_XDECREF(errorHandler); 5690 Py_XDECREF(exc); 5691 return NULL; 5692 5693 onError: 5694 _PyUnicodeWriter_Dealloc(&writer); 5695 Py_XDECREF(errorHandler); 5696 Py_XDECREF(exc); 5697 return NULL; 5698} 5699 
5700/* Return a Unicode-Escape string version of the Unicode object. 5701 5702 If quotes is true, the string is enclosed in u"" or u'' quotes as 5703 appropriate. 5704 5705*/ 5706 5707PyObject * 5708PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5709{ 5710 Py_ssize_t i, len; 5711 PyObject *repr; 5712 char *p; 5713 int kind; 5714 void *data; 5715 Py_ssize_t expandsize = 0; 5716 5717 /* Initial allocation is based on the longest-possible character 5718 escape. 5719 5720 For UCS1 strings it's '\xxx', 4 bytes per source character. 5721 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 5722 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 5723 */ 5724 5725 if (!PyUnicode_Check(unicode)) { 5726 PyErr_BadArgument(); 5727 return NULL; 5728 } 5729 if (PyUnicode_READY(unicode) == -1) 5730 return NULL; 5731 len = PyUnicode_GET_LENGTH(unicode); 5732 kind = PyUnicode_KIND(unicode); 5733 data = PyUnicode_DATA(unicode); 5734 switch (kind) { 5735 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5736 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5737 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5738 } 5739 5740 if (len == 0) 5741 return PyBytes_FromStringAndSize(NULL, 0); 5742 5743 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5744 return PyErr_NoMemory(); 5745 5746 repr = PyBytes_FromStringAndSize(NULL, 5747 2 5748 + expandsize*len 5749 + 1); 5750 if (repr == NULL) 5751 return NULL; 5752 5753 p = PyBytes_AS_STRING(repr); 5754 5755 for (i = 0; i < len; i++) { 5756 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5757 5758 /* Escape backslashes */ 5759 if (ch == '\\') { 5760 *p++ = '\\'; 5761 *p++ = (char) ch; 5762 continue; 5763 } 5764 5765 /* Map 21-bit characters to '\U00xxxxxx' */ 5766 else if (ch >= 0x10000) { 5767 assert(ch <= MAX_UNICODE); 5768 *p++ = '\\'; 5769 *p++ = 'U'; 5770 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5771 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5772 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5773 *p++ 
= Py_hexdigits[(ch >> 16) & 0x0000000F]; 5774 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5775 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5776 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5777 *p++ = Py_hexdigits[ch & 0x0000000F]; 5778 continue; 5779 } 5780 5781 /* Map 16-bit characters to '\uxxxx' */ 5782 if (ch >= 256) { 5783 *p++ = '\\'; 5784 *p++ = 'u'; 5785 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5786 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5787 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5788 *p++ = Py_hexdigits[ch & 0x000F]; 5789 } 5790 5791 /* Map special whitespace to '\t', \n', '\r' */ 5792 else if (ch == '\t') { 5793 *p++ = '\\'; 5794 *p++ = 't'; 5795 } 5796 else if (ch == '\n') { 5797 *p++ = '\\'; 5798 *p++ = 'n'; 5799 } 5800 else if (ch == '\r') { 5801 *p++ = '\\'; 5802 *p++ = 'r'; 5803 } 5804 5805 /* Map non-printable US ASCII to '\xhh' */ 5806 else if (ch < ' ' || ch >= 0x7F) { 5807 *p++ = '\\'; 5808 *p++ = 'x'; 5809 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5810 *p++ = Py_hexdigits[ch & 0x000F]; 5811 } 5812 5813 /* Copy everything else as-is */ 5814 else 5815 *p++ = (char) ch; 5816 } 5817 5818 assert(p - PyBytes_AS_STRING(repr) > 0); 5819 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5820 return NULL; 5821 return repr; 5822} 5823 5824PyObject * 5825PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5826 Py_ssize_t size) 5827{ 5828 PyObject *result; 5829 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5830 if (tmp == NULL) 5831 return NULL; 5832 result = PyUnicode_AsUnicodeEscapeString(tmp); 5833 Py_DECREF(tmp); 5834 return result; 5835} 5836 5837/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5838 5839PyObject * 5840PyUnicode_DecodeRawUnicodeEscape(const char *s, 5841 Py_ssize_t size, 5842 const char *errors) 5843{ 5844 const char *starts = s; 5845 Py_ssize_t startinpos; 5846 Py_ssize_t endinpos; 5847 _PyUnicodeWriter writer; 5848 const char *end; 5849 const char *bs; 5850 PyObject *errorHandler = 
NULL; 5851 PyObject *exc = NULL; 5852 5853 if (size == 0) 5854 _Py_RETURN_UNICODE_EMPTY(); 5855 5856 /* Escaped strings will always be longer than the resulting 5857 Unicode string, so we start with size here and then reduce the 5858 length after conversion to the true value. (But decoding error 5859 handler might have to resize the string) */ 5860 _PyUnicodeWriter_Init(&writer); 5861 writer.min_length = size; 5862 5863 end = s + size; 5864 while (s < end) { 5865 unsigned char c; 5866 Py_UCS4 x; 5867 int i; 5868 int count; 5869 5870 /* Non-escape characters are interpreted as Unicode ordinals */ 5871 if (*s != '\\') { 5872 x = (unsigned char)*s++; 5873 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 5874 goto onError; 5875 continue; 5876 } 5877 startinpos = s-starts; 5878 5879 /* \u-escapes are only interpreted iff the number of leading 5880 backslashes if odd */ 5881 bs = s; 5882 for (;s < end;) { 5883 if (*s != '\\') 5884 break; 5885 x = (unsigned char)*s++; 5886 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 5887 goto onError; 5888 } 5889 if (((s - bs) & 1) == 0 || 5890 s >= end || 5891 (*s != 'u' && *s != 'U')) { 5892 continue; 5893 } 5894 writer.pos--; 5895 count = *s=='u' ? 
4 : 8; 5896 s++; 5897 5898 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5899 for (x = 0, i = 0; i < count; ++i, ++s) { 5900 c = (unsigned char)*s; 5901 if (!Py_ISXDIGIT(c)) { 5902 endinpos = s-starts; 5903 if (unicode_decode_call_errorhandler_writer( 5904 errors, &errorHandler, 5905 "rawunicodeescape", "truncated \\uXXXX", 5906 &starts, &end, &startinpos, &endinpos, &exc, &s, 5907 &writer)) 5908 goto onError; 5909 goto nextByte; 5910 } 5911 x = (x<<4) & ~0xF; 5912 if (c >= '0' && c <= '9') 5913 x += c - '0'; 5914 else if (c >= 'a' && c <= 'f') 5915 x += 10 + c - 'a'; 5916 else 5917 x += 10 + c - 'A'; 5918 } 5919 if (x <= MAX_UNICODE) { 5920 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 5921 goto onError; 5922 } 5923 else { 5924 endinpos = s-starts; 5925 if (unicode_decode_call_errorhandler_writer( 5926 errors, &errorHandler, 5927 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5928 &starts, &end, &startinpos, &endinpos, &exc, &s, 5929 &writer)) 5930 goto onError; 5931 } 5932 nextByte: 5933 ; 5934 } 5935 Py_XDECREF(errorHandler); 5936 Py_XDECREF(exc); 5937 return _PyUnicodeWriter_Finish(&writer); 5938 5939 onError: 5940 _PyUnicodeWriter_Dealloc(&writer); 5941 Py_XDECREF(errorHandler); 5942 Py_XDECREF(exc); 5943 return NULL; 5944} 5945 5946 5947PyObject * 5948PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 5949{ 5950 PyObject *repr; 5951 char *p; 5952 char *q; 5953 Py_ssize_t expandsize, pos; 5954 int kind; 5955 void *data; 5956 Py_ssize_t len; 5957 5958 if (!PyUnicode_Check(unicode)) { 5959 PyErr_BadArgument(); 5960 return NULL; 5961 } 5962 if (PyUnicode_READY(unicode) == -1) 5963 return NULL; 5964 kind = PyUnicode_KIND(unicode); 5965 data = PyUnicode_DATA(unicode); 5966 len = PyUnicode_GET_LENGTH(unicode); 5967 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 5968 bytes, and 1 byte characters 4. 
*/ 5969 expandsize = kind * 2 + 2; 5970 5971 if (len > PY_SSIZE_T_MAX / expandsize) 5972 return PyErr_NoMemory(); 5973 5974 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 5975 if (repr == NULL) 5976 return NULL; 5977 if (len == 0) 5978 return repr; 5979 5980 p = q = PyBytes_AS_STRING(repr); 5981 for (pos = 0; pos < len; pos++) { 5982 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 5983 /* Map 32-bit characters to '\Uxxxxxxxx' */ 5984 if (ch >= 0x10000) { 5985 assert(ch <= MAX_UNICODE); 5986 *p++ = '\\'; 5987 *p++ = 'U'; 5988 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 5989 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 5990 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 5991 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 5992 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 5993 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 5994 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 5995 *p++ = Py_hexdigits[ch & 15]; 5996 } 5997 /* Map 16-bit characters to '\uxxxx' */ 5998 else if (ch >= 256) { 5999 *p++ = '\\'; 6000 *p++ = 'u'; 6001 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6002 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6003 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6004 *p++ = Py_hexdigits[ch & 15]; 6005 } 6006 /* Copy everything else as-is */ 6007 else 6008 *p++ = (char) ch; 6009 } 6010 6011 assert(p > q); 6012 if (_PyBytes_Resize(&repr, p - q) < 0) 6013 return NULL; 6014 return repr; 6015} 6016 6017PyObject * 6018PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6019 Py_ssize_t size) 6020{ 6021 PyObject *result; 6022 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6023 if (tmp == NULL) 6024 return NULL; 6025 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6026 Py_DECREF(tmp); 6027 return result; 6028} 6029 6030/* --- Unicode Internal Codec ------------------------------------------- */ 6031 6032PyObject * 6033_PyUnicode_DecodeUnicodeInternal(const char *s, 6034 Py_ssize_t size, 6035 const char *errors) 6036{ 6037 const char *starts = s; 6038 Py_ssize_t startinpos; 6039 Py_ssize_t endinpos; 6040 _PyUnicodeWriter writer; 
6041 const char *end; 6042 const char *reason; 6043 PyObject *errorHandler = NULL; 6044 PyObject *exc = NULL; 6045 6046 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6047 "unicode_internal codec has been deprecated", 6048 1)) 6049 return NULL; 6050 6051 if (size == 0) 6052 _Py_RETURN_UNICODE_EMPTY(); 6053 6054 _PyUnicodeWriter_Init(&writer); 6055 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { 6056 PyErr_NoMemory(); 6057 goto onError; 6058 } 6059 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; 6060 6061 end = s + size; 6062 while (s < end) { 6063 Py_UNICODE uch; 6064 Py_UCS4 ch; 6065 if (end - s < Py_UNICODE_SIZE) { 6066 endinpos = end-starts; 6067 reason = "truncated input"; 6068 goto error; 6069 } 6070 /* We copy the raw representation one byte at a time because the 6071 pointer may be unaligned (see test_codeccallbacks). */ 6072 ((char *) &uch)[0] = s[0]; 6073 ((char *) &uch)[1] = s[1]; 6074#ifdef Py_UNICODE_WIDE 6075 ((char *) &uch)[2] = s[2]; 6076 ((char *) &uch)[3] = s[3]; 6077#endif 6078 ch = uch; 6079#ifdef Py_UNICODE_WIDE 6080 /* We have to sanity check the raw data, otherwise doom looms for 6081 some malformed UCS-4 data. 
*/ 6082 if (ch > 0x10ffff) { 6083 endinpos = s - starts + Py_UNICODE_SIZE; 6084 reason = "illegal code point (> 0x10FFFF)"; 6085 goto error; 6086 } 6087#endif 6088 s += Py_UNICODE_SIZE; 6089#ifndef Py_UNICODE_WIDE 6090 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE) 6091 { 6092 Py_UNICODE uch2; 6093 ((char *) &uch2)[0] = s[0]; 6094 ((char *) &uch2)[1] = s[1]; 6095 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6096 { 6097 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6098 s += Py_UNICODE_SIZE; 6099 } 6100 } 6101#endif 6102 6103 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 6104 goto onError; 6105 continue; 6106 6107 error: 6108 startinpos = s - starts; 6109 if (unicode_decode_call_errorhandler_writer( 6110 errors, &errorHandler, 6111 "unicode_internal", reason, 6112 &starts, &end, &startinpos, &endinpos, &exc, &s, 6113 &writer)) 6114 goto onError; 6115 } 6116 6117 Py_XDECREF(errorHandler); 6118 Py_XDECREF(exc); 6119 return _PyUnicodeWriter_Finish(&writer); 6120 6121 onError: 6122 _PyUnicodeWriter_Dealloc(&writer); 6123 Py_XDECREF(errorHandler); 6124 Py_XDECREF(exc); 6125 return NULL; 6126} 6127 6128/* --- Latin-1 Codec ------------------------------------------------------ */ 6129 6130PyObject * 6131PyUnicode_DecodeLatin1(const char *s, 6132 Py_ssize_t size, 6133 const char *errors) 6134{ 6135 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6136 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6137} 6138 6139/* create or adjust a UnicodeEncodeError */ 6140static void 6141make_encode_exception(PyObject **exceptionObject, 6142 const char *encoding, 6143 PyObject *unicode, 6144 Py_ssize_t startpos, Py_ssize_t endpos, 6145 const char *reason) 6146{ 6147 if (*exceptionObject == NULL) { 6148 *exceptionObject = PyObject_CallFunction( 6149 PyExc_UnicodeEncodeError, "sOnns", 6150 encoding, unicode, startpos, endpos, reason); 6151 } 6152 else { 6153 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6154 goto onError; 6155 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6156 goto onError; 6157 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6158 goto onError; 6159 return; 6160 onError: 6161 Py_DECREF(*exceptionObject); 6162 *exceptionObject = NULL; 6163 } 6164} 6165 6166/* raises a UnicodeEncodeError */ 6167static void 6168raise_encode_exception(PyObject **exceptionObject, 6169 const char *encoding, 6170 PyObject *unicode, 6171 Py_ssize_t startpos, Py_ssize_t endpos, 6172 const char *reason) 6173{ 6174 make_encode_exception(exceptionObject, 6175 encoding, unicode, startpos, endpos, reason); 6176 if (*exceptionObject != NULL) 6177 PyCodec_StrictErrors(*exceptionObject); 6178} 6179 6180/* error handling callback helper: 6181 build arguments, call the callback and check the arguments, 6182 put the result into newpos and return the replacement string, which 6183 has to be freed by the caller */ 6184static PyObject * 6185unicode_encode_call_errorhandler(const char *errors, 6186 PyObject **errorHandler, 6187 const char *encoding, const char *reason, 6188 PyObject *unicode, PyObject **exceptionObject, 6189 Py_ssize_t startpos, Py_ssize_t endpos, 6190 Py_ssize_t *newpos) 6191{ 6192 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6193 Py_ssize_t len; 6194 PyObject *restuple; 6195 PyObject *resunicode; 6196 6197 if (*errorHandler == NULL) 
{ 6198 *errorHandler = PyCodec_LookupError(errors); 6199 if (*errorHandler == NULL) 6200 return NULL; 6201 } 6202 6203 if (PyUnicode_READY(unicode) == -1) 6204 return NULL; 6205 len = PyUnicode_GET_LENGTH(unicode); 6206 6207 make_encode_exception(exceptionObject, 6208 encoding, unicode, startpos, endpos, reason); 6209 if (*exceptionObject == NULL) 6210 return NULL; 6211 6212 restuple = PyObject_CallFunctionObjArgs( 6213 *errorHandler, *exceptionObject, NULL); 6214 if (restuple == NULL) 6215 return NULL; 6216 if (!PyTuple_Check(restuple)) { 6217 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6218 Py_DECREF(restuple); 6219 return NULL; 6220 } 6221 if (!PyArg_ParseTuple(restuple, argparse, 6222 &resunicode, newpos)) { 6223 Py_DECREF(restuple); 6224 return NULL; 6225 } 6226 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6227 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6228 Py_DECREF(restuple); 6229 return NULL; 6230 } 6231 if (*newpos<0) 6232 *newpos = len + *newpos; 6233 if (*newpos<0 || *newpos>len) { 6234 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6235 Py_DECREF(restuple); 6236 return NULL; 6237 } 6238 Py_INCREF(resunicode); 6239 Py_DECREF(restuple); 6240 return resunicode; 6241} 6242 6243static PyObject * 6244unicode_encode_ucs1(PyObject *unicode, 6245 const char *errors, 6246 unsigned int limit) 6247{ 6248 /* input state */ 6249 Py_ssize_t pos=0, size; 6250 int kind; 6251 void *data; 6252 /* output object */ 6253 PyObject *res; 6254 /* pointer into the output */ 6255 char *str; 6256 /* current output position */ 6257 Py_ssize_t ressize; 6258 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6259 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6260 PyObject *errorHandler = NULL; 6261 PyObject *exc = NULL; 6262 /* the following variable is used for caching string comparisons 6263 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6264 int known_errorHandler = -1; 6265 6266 if (PyUnicode_READY(unicode) == -1) 6267 return NULL; 6268 size = PyUnicode_GET_LENGTH(unicode); 6269 kind = PyUnicode_KIND(unicode); 6270 data = PyUnicode_DATA(unicode); 6271 /* allocate enough for a simple encoding without 6272 replacements, if we need more, we'll resize */ 6273 if (size == 0) 6274 return PyBytes_FromStringAndSize(NULL, 0); 6275 res = PyBytes_FromStringAndSize(NULL, size); 6276 if (res == NULL) 6277 return NULL; 6278 str = PyBytes_AS_STRING(res); 6279 ressize = size; 6280 6281 while (pos < size) { 6282 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6283 6284 /* can we encode this? */ 6285 if (c<limit) { 6286 /* no overflow check, because we know that the space is enough */ 6287 *str++ = (char)c; 6288 ++pos; 6289 } 6290 else { 6291 Py_ssize_t requiredsize; 6292 PyObject *repunicode; 6293 Py_ssize_t repsize, newpos, respos, i; 6294 /* startpos for collecting unencodable chars */ 6295 Py_ssize_t collstart = pos; 6296 Py_ssize_t collend = pos; 6297 /* find all unecodable characters */ 6298 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) 6299 ++collend; 6300 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6301 if (known_errorHandler==-1) { 6302 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6303 known_errorHandler = 1; 6304 else if (!strcmp(errors, "replace")) 6305 known_errorHandler = 2; 6306 else if (!strcmp(errors, "ignore")) 6307 known_errorHandler = 3; 6308 else if (!strcmp(errors, "xmlcharrefreplace")) 6309 known_errorHandler = 4; 6310 else 6311 known_errorHandler = 0; 6312 } 6313 switch (known_errorHandler) { 6314 case 1: /* strict */ 6315 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6316 goto onError; 6317 case 2: /* replace */ 6318 while (collstart++<collend) 6319 *str++ = '?'; /* fall through */ 6320 case 3: /* ignore */ 6321 pos = collend; 6322 break; 6323 case 4: /* xmlcharrefreplace */ 6324 respos = str - PyBytes_AS_STRING(res); 6325 /* determine replacement size */ 6326 for (i = collstart, repsize = 0; i < collend; ++i) { 6327 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6328 if (ch < 10) 6329 repsize += 2+1+1; 6330 else if (ch < 100) 6331 repsize += 2+2+1; 6332 else if (ch < 1000) 6333 repsize += 2+3+1; 6334 else if (ch < 10000) 6335 repsize += 2+4+1; 6336 else if (ch < 100000) 6337 repsize += 2+5+1; 6338 else if (ch < 1000000) 6339 repsize += 2+6+1; 6340 else { 6341 assert(ch <= MAX_UNICODE); 6342 repsize += 2+7+1; 6343 } 6344 } 6345 requiredsize = respos+repsize+(size-collend); 6346 if (requiredsize > ressize) { 6347 if (requiredsize<2*ressize) 6348 requiredsize = 2*ressize; 6349 if (_PyBytes_Resize(&res, requiredsize)) 6350 goto onError; 6351 str = PyBytes_AS_STRING(res) + respos; 6352 ressize = requiredsize; 6353 } 6354 /* generate replacement */ 6355 for (i = collstart; i < collend; ++i) { 6356 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6357 } 6358 pos = collend; 6359 break; 6360 default: 6361 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6362 encoding, reason, unicode, &exc, 6363 collstart, collend, &newpos); 6364 if (repunicode == NULL || 
(PyUnicode_Check(repunicode) && 6365 PyUnicode_READY(repunicode) == -1)) 6366 goto onError; 6367 if (PyBytes_Check(repunicode)) { 6368 /* Directly copy bytes result to output. */ 6369 repsize = PyBytes_Size(repunicode); 6370 if (repsize > 1) { 6371 /* Make room for all additional bytes. */ 6372 respos = str - PyBytes_AS_STRING(res); 6373 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6374 Py_DECREF(repunicode); 6375 goto onError; 6376 } 6377 str = PyBytes_AS_STRING(res) + respos; 6378 ressize += repsize-1; 6379 } 6380 memcpy(str, PyBytes_AsString(repunicode), repsize); 6381 str += repsize; 6382 pos = newpos; 6383 Py_DECREF(repunicode); 6384 break; 6385 } 6386 /* need more space? (at least enough for what we 6387 have+the replacement+the rest of the string, so 6388 we won't have to check space for encodable characters) */ 6389 respos = str - PyBytes_AS_STRING(res); 6390 repsize = PyUnicode_GET_LENGTH(repunicode); 6391 requiredsize = respos+repsize+(size-collend); 6392 if (requiredsize > ressize) { 6393 if (requiredsize<2*ressize) 6394 requiredsize = 2*ressize; 6395 if (_PyBytes_Resize(&res, requiredsize)) { 6396 Py_DECREF(repunicode); 6397 goto onError; 6398 } 6399 str = PyBytes_AS_STRING(res) + respos; 6400 ressize = requiredsize; 6401 } 6402 /* check if there is anything unencodable in the replacement 6403 and copy it to the output */ 6404 for (i = 0; repsize-->0; ++i, ++str) { 6405 c = PyUnicode_READ_CHAR(repunicode, i); 6406 if (c >= limit) { 6407 raise_encode_exception(&exc, encoding, unicode, 6408 pos, pos+1, reason); 6409 Py_DECREF(repunicode); 6410 goto onError; 6411 } 6412 *str = (char)c; 6413 } 6414 pos = newpos; 6415 Py_DECREF(repunicode); 6416 } 6417 } 6418 } 6419 /* Resize if we allocated to much */ 6420 size = str - PyBytes_AS_STRING(res); 6421 if (size < ressize) { /* If this falls res will be NULL */ 6422 assert(size >= 0); 6423 if (_PyBytes_Resize(&res, size) < 0) 6424 goto onError; 6425 } 6426 6427 Py_XDECREF(errorHandler); 6428 Py_XDECREF(exc); 
6429 return res; 6430 6431 onError: 6432 Py_XDECREF(res); 6433 Py_XDECREF(errorHandler); 6434 Py_XDECREF(exc); 6435 return NULL; 6436} 6437 6438/* Deprecated */ 6439PyObject * 6440PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6441 Py_ssize_t size, 6442 const char *errors) 6443{ 6444 PyObject *result; 6445 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6446 if (unicode == NULL) 6447 return NULL; 6448 result = unicode_encode_ucs1(unicode, errors, 256); 6449 Py_DECREF(unicode); 6450 return result; 6451} 6452 6453PyObject * 6454_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6455{ 6456 if (!PyUnicode_Check(unicode)) { 6457 PyErr_BadArgument(); 6458 return NULL; 6459 } 6460 if (PyUnicode_READY(unicode) == -1) 6461 return NULL; 6462 /* Fast path: if it is a one-byte string, construct 6463 bytes object directly. */ 6464 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6465 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6466 PyUnicode_GET_LENGTH(unicode)); 6467 /* Non-Latin-1 characters present. Defer to above function to 6468 raise the exception. */ 6469 return unicode_encode_ucs1(unicode, errors, 256); 6470} 6471 6472PyObject* 6473PyUnicode_AsLatin1String(PyObject *unicode) 6474{ 6475 return _PyUnicode_AsLatin1String(unicode, NULL); 6476} 6477 6478/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6479 6480PyObject * 6481PyUnicode_DecodeASCII(const char *s, 6482 Py_ssize_t size, 6483 const char *errors) 6484{ 6485 const char *starts = s; 6486 _PyUnicodeWriter writer; 6487 int kind; 6488 void *data; 6489 Py_ssize_t startinpos; 6490 Py_ssize_t endinpos; 6491 Py_ssize_t outpos; 6492 const char *e; 6493 PyObject *errorHandler = NULL; 6494 PyObject *exc = NULL; 6495 6496 if (size == 0) 6497 _Py_RETURN_UNICODE_EMPTY(); 6498 6499 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 6500 if (size == 1 && (unsigned char)s[0] < 128) 6501 return get_latin1_char((unsigned char)s[0]); 6502 6503 _PyUnicodeWriter_Init(&writer); 6504 writer.min_length = size; 6505 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) 6506 return NULL; 6507 6508 e = s + size; 6509 data = writer.data; 6510 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6511 writer.pos = outpos; 6512 if (writer.pos == size) 6513 return _PyUnicodeWriter_Finish(&writer); 6514 6515 s += writer.pos; 6516 kind = writer.kind; 6517 while (s < e) { 6518 register unsigned char c = (unsigned char)*s; 6519 if (c < 128) { 6520 PyUnicode_WRITE(kind, data, writer.pos, c); 6521 writer.pos++; 6522 ++s; 6523 } 6524 else { 6525 startinpos = s-starts; 6526 endinpos = startinpos + 1; 6527 if (unicode_decode_call_errorhandler_writer( 6528 errors, &errorHandler, 6529 "ascii", "ordinal not in range(128)", 6530 &starts, &e, &startinpos, &endinpos, &exc, &s, 6531 &writer)) 6532 goto onError; 6533 kind = writer.kind; 6534 data = writer.data; 6535 } 6536 } 6537 Py_XDECREF(errorHandler); 6538 Py_XDECREF(exc); 6539 return _PyUnicodeWriter_Finish(&writer); 6540 6541 onError: 6542 _PyUnicodeWriter_Dealloc(&writer); 6543 Py_XDECREF(errorHandler); 6544 Py_XDECREF(exc); 6545 return NULL; 6546} 6547 6548/* Deprecated */ 6549PyObject * 6550PyUnicode_EncodeASCII(const Py_UNICODE *p, 6551 Py_ssize_t size, 6552 const char *errors) 6553{ 6554 PyObject *result; 6555 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6556 if (unicode == NULL) 6557 return NULL; 6558 result = unicode_encode_ucs1(unicode, errors, 128); 6559 Py_DECREF(unicode); 6560 return result; 6561} 6562 6563PyObject * 6564_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6565{ 6566 if (!PyUnicode_Check(unicode)) { 6567 PyErr_BadArgument(); 6568 return NULL; 6569 } 6570 if (PyUnicode_READY(unicode) == -1) 6571 return NULL; 6572 /* Fast path: if it is an ASCII-only string, construct bytes object 6573 directly. 
Else defer to above function to raise the exception. */ 6574 if (PyUnicode_IS_ASCII(unicode)) 6575 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6576 PyUnicode_GET_LENGTH(unicode)); 6577 return unicode_encode_ucs1(unicode, errors, 128); 6578} 6579 6580PyObject * 6581PyUnicode_AsASCIIString(PyObject *unicode) 6582{ 6583 return _PyUnicode_AsASCIIString(unicode, NULL); 6584} 6585 6586#ifdef HAVE_MBCS 6587 6588/* --- MBCS codecs for Windows -------------------------------------------- */ 6589 6590#if SIZEOF_INT < SIZEOF_SIZE_T 6591#define NEED_RETRY 6592#endif 6593 6594#ifndef WC_ERR_INVALID_CHARS 6595# define WC_ERR_INVALID_CHARS 0x0080 6596#endif 6597 6598static char* 6599code_page_name(UINT code_page, PyObject **obj) 6600{ 6601 *obj = NULL; 6602 if (code_page == CP_ACP) 6603 return "mbcs"; 6604 if (code_page == CP_UTF7) 6605 return "CP_UTF7"; 6606 if (code_page == CP_UTF8) 6607 return "CP_UTF8"; 6608 6609 *obj = PyBytes_FromFormat("cp%u", code_page); 6610 if (*obj == NULL) 6611 return NULL; 6612 return PyBytes_AS_STRING(*obj); 6613} 6614 6615static int 6616is_dbcs_lead_byte(UINT code_page, const char *s, int offset) 6617{ 6618 const char *curr = s + offset; 6619 const char *prev; 6620 6621 if (!IsDBCSLeadByteEx(code_page, *curr)) 6622 return 0; 6623 6624 prev = CharPrevExA(code_page, s, curr, 0); 6625 if (prev == curr) 6626 return 1; 6627 /* FIXME: This code is limited to "true" double-byte encodings, 6628 as it assumes an incomplete character consists of a single 6629 byte. */ 6630 if (curr - prev == 2) 6631 return 1; 6632 if (!IsDBCSLeadByteEx(code_page, *prev)) 6633 return 1; 6634 return 0; 6635} 6636 6637static DWORD 6638decode_code_page_flags(UINT code_page) 6639{ 6640 if (code_page == CP_UTF7) { 6641 /* The CP_UTF7 decoder only supports flags=0 */ 6642 return 0; 6643 } 6644 else 6645 return MB_ERR_INVALID_CHARS; 6646} 6647 6648/* 6649 * Decode a byte string from a Windows code page into unicode object in strict 6650 * mode. 
6651 * 6652 * Returns consumed size if succeed, returns -2 on decode error, or raise an 6653 * OSError and returns -1 on other error. 6654 */ 6655static int 6656decode_code_page_strict(UINT code_page, 6657 PyObject **v, 6658 const char *in, 6659 int insize) 6660{ 6661 const DWORD flags = decode_code_page_flags(code_page); 6662 wchar_t *out; 6663 DWORD outsize; 6664 6665 /* First get the size of the result */ 6666 assert(insize > 0); 6667 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6668 if (outsize <= 0) 6669 goto error; 6670 6671 if (*v == NULL) { 6672 /* Create unicode object */ 6673 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6674 *v = (PyObject*)_PyUnicode_New(outsize); 6675 if (*v == NULL) 6676 return -1; 6677 out = PyUnicode_AS_UNICODE(*v); 6678 } 6679 else { 6680 /* Extend unicode object */ 6681 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6682 if (unicode_resize(v, n + outsize) < 0) 6683 return -1; 6684 out = PyUnicode_AS_UNICODE(*v) + n; 6685 } 6686 6687 /* Do the conversion */ 6688 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6689 if (outsize <= 0) 6690 goto error; 6691 return insize; 6692 6693error: 6694 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6695 return -2; 6696 PyErr_SetFromWindowsErr(0); 6697 return -1; 6698} 6699 6700/* 6701 * Decode a byte string from a code page into unicode object with an error 6702 * handler. 6703 * 6704 * Returns consumed size if succeed, or raise an OSError or 6705 * UnicodeDecodeError exception and returns -1 on error. 6706 */ 6707static int 6708decode_code_page_errors(UINT code_page, 6709 PyObject **v, 6710 const char *in, const int size, 6711 const char *errors) 6712{ 6713 const char *startin = in; 6714 const char *endin = in + size; 6715 const DWORD flags = decode_code_page_flags(code_page); 6716 /* Ideally, we should get reason from FormatMessage. This is the Windows 6717 2000 English version of the message. 
*/ 6718 const char *reason = "No mapping for the Unicode character exists " 6719 "in the target code page."; 6720 /* each step cannot decode more than 1 character, but a character can be 6721 represented as a surrogate pair */ 6722 wchar_t buffer[2], *startout, *out; 6723 int insize; 6724 Py_ssize_t outsize; 6725 PyObject *errorHandler = NULL; 6726 PyObject *exc = NULL; 6727 PyObject *encoding_obj = NULL; 6728 char *encoding; 6729 DWORD err; 6730 int ret = -1; 6731 6732 assert(size > 0); 6733 6734 encoding = code_page_name(code_page, &encoding_obj); 6735 if (encoding == NULL) 6736 return -1; 6737 6738 if (errors == NULL || strcmp(errors, "strict") == 0) { 6739 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6740 UnicodeDecodeError. */ 6741 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6742 if (exc != NULL) { 6743 PyCodec_StrictErrors(exc); 6744 Py_CLEAR(exc); 6745 } 6746 goto error; 6747 } 6748 6749 if (*v == NULL) { 6750 /* Create unicode object */ 6751 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6752 PyErr_NoMemory(); 6753 goto error; 6754 } 6755 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6756 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6757 if (*v == NULL) 6758 goto error; 6759 startout = PyUnicode_AS_UNICODE(*v); 6760 } 6761 else { 6762 /* Extend unicode object */ 6763 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6764 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6765 PyErr_NoMemory(); 6766 goto error; 6767 } 6768 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6769 goto error; 6770 startout = PyUnicode_AS_UNICODE(*v) + n; 6771 } 6772 6773 /* Decode the byte string character per character */ 6774 out = startout; 6775 while (in < endin) 6776 { 6777 /* Decode a character */ 6778 insize = 1; 6779 do 6780 { 6781 outsize = MultiByteToWideChar(code_page, flags, 6782 in, insize, 6783 buffer, Py_ARRAY_LENGTH(buffer)); 6784 if 
(outsize > 0) 6785 break; 6786 err = GetLastError(); 6787 if (err != ERROR_NO_UNICODE_TRANSLATION 6788 && err != ERROR_INSUFFICIENT_BUFFER) 6789 { 6790 PyErr_SetFromWindowsErr(0); 6791 goto error; 6792 } 6793 insize++; 6794 } 6795 /* 4=maximum length of a UTF-8 sequence */ 6796 while (insize <= 4 && (in + insize) <= endin); 6797 6798 if (outsize <= 0) { 6799 Py_ssize_t startinpos, endinpos, outpos; 6800 6801 startinpos = in - startin; 6802 endinpos = startinpos + 1; 6803 outpos = out - PyUnicode_AS_UNICODE(*v); 6804 if (unicode_decode_call_errorhandler_wchar( 6805 errors, &errorHandler, 6806 encoding, reason, 6807 &startin, &endin, &startinpos, &endinpos, &exc, &in, 6808 v, &outpos)) 6809 { 6810 goto error; 6811 } 6812 out = PyUnicode_AS_UNICODE(*v) + outpos; 6813 } 6814 else { 6815 in += insize; 6816 memcpy(out, buffer, outsize * sizeof(wchar_t)); 6817 out += outsize; 6818 } 6819 } 6820 6821 /* write a NUL character at the end */ 6822 *out = 0; 6823 6824 /* Extend unicode object */ 6825 outsize = out - startout; 6826 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 6827 if (unicode_resize(v, outsize) < 0) 6828 goto error; 6829 ret = size; 6830 6831error: 6832 Py_XDECREF(encoding_obj); 6833 Py_XDECREF(errorHandler); 6834 Py_XDECREF(exc); 6835 return ret; 6836} 6837 6838static PyObject * 6839decode_code_page_stateful(int code_page, 6840 const char *s, Py_ssize_t size, 6841 const char *errors, Py_ssize_t *consumed) 6842{ 6843 PyObject *v = NULL; 6844 int chunk_size, final, converted, done; 6845 6846 if (code_page < 0) { 6847 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 6848 return NULL; 6849 } 6850 6851 if (consumed) 6852 *consumed = 0; 6853 6854 do 6855 { 6856#ifdef NEED_RETRY 6857 if (size > INT_MAX) { 6858 chunk_size = INT_MAX; 6859 final = 0; 6860 done = 0; 6861 } 6862 else 6863#endif 6864 { 6865 chunk_size = (int)size; 6866 final = (consumed == NULL); 6867 done = 1; 6868 } 6869 6870 /* Skip trailing lead-byte unless 'final' is set */ 6871 if 
(!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) 6872 --chunk_size; 6873 6874 if (chunk_size == 0 && done) { 6875 if (v != NULL) 6876 break; 6877 _Py_RETURN_UNICODE_EMPTY(); 6878 } 6879 6880 6881 converted = decode_code_page_strict(code_page, &v, 6882 s, chunk_size); 6883 if (converted == -2) 6884 converted = decode_code_page_errors(code_page, &v, 6885 s, chunk_size, 6886 errors); 6887 assert(converted != 0); 6888 6889 if (converted < 0) { 6890 Py_XDECREF(v); 6891 return NULL; 6892 } 6893 6894 if (consumed) 6895 *consumed += converted; 6896 6897 s += converted; 6898 size -= converted; 6899 } while (!done); 6900 6901 return unicode_result(v); 6902} 6903 6904PyObject * 6905PyUnicode_DecodeCodePageStateful(int code_page, 6906 const char *s, 6907 Py_ssize_t size, 6908 const char *errors, 6909 Py_ssize_t *consumed) 6910{ 6911 return decode_code_page_stateful(code_page, s, size, errors, consumed); 6912} 6913 6914PyObject * 6915PyUnicode_DecodeMBCSStateful(const char *s, 6916 Py_ssize_t size, 6917 const char *errors, 6918 Py_ssize_t *consumed) 6919{ 6920 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 6921} 6922 6923PyObject * 6924PyUnicode_DecodeMBCS(const char *s, 6925 Py_ssize_t size, 6926 const char *errors) 6927{ 6928 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 6929} 6930 6931static DWORD 6932encode_code_page_flags(UINT code_page, const char *errors) 6933{ 6934 if (code_page == CP_UTF8) { 6935 if (winver.dwMajorVersion >= 6) 6936 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 6937 and later */ 6938 return WC_ERR_INVALID_CHARS; 6939 else 6940 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 6941 return 0; 6942 } 6943 else if (code_page == CP_UTF7) { 6944 /* CP_UTF7 only supports flags=0 */ 6945 return 0; 6946 } 6947 else { 6948 if (errors != NULL && strcmp(errors, "replace") == 0) 6949 return 0; 6950 else 6951 return WC_NO_BEST_FIT_CHARS; 6952 } 6953} 6954 6955/* 6956 * Encode a Unicode string 
to a Windows code page into a byte string in strict 6957 * mode. 6958 * 6959 * Returns consumed characters if succeed, returns -2 on encode error, or raise 6960 * an OSError and returns -1 on other error. 6961 */ 6962static int 6963encode_code_page_strict(UINT code_page, PyObject **outbytes, 6964 PyObject *unicode, Py_ssize_t offset, int len, 6965 const char* errors) 6966{ 6967 BOOL usedDefaultChar = FALSE; 6968 BOOL *pusedDefaultChar = &usedDefaultChar; 6969 int outsize; 6970 PyObject *exc = NULL; 6971 wchar_t *p; 6972 Py_ssize_t size; 6973 const DWORD flags = encode_code_page_flags(code_page, NULL); 6974 char *out; 6975 /* Create a substring so that we can get the UTF-16 representation 6976 of just the slice under consideration. */ 6977 PyObject *substring; 6978 6979 assert(len > 0); 6980 6981 if (code_page != CP_UTF8 && code_page != CP_UTF7) 6982 pusedDefaultChar = &usedDefaultChar; 6983 else 6984 pusedDefaultChar = NULL; 6985 6986 substring = PyUnicode_Substring(unicode, offset, offset+len); 6987 if (substring == NULL) 6988 return -1; 6989 p = PyUnicode_AsUnicodeAndSize(substring, &size); 6990 if (p == NULL) { 6991 Py_DECREF(substring); 6992 return -1; 6993 } 6994 assert(size <= INT_MAX); 6995 6996 /* First get the size of the result */ 6997 outsize = WideCharToMultiByte(code_page, flags, 6998 p, (int)size, 6999 NULL, 0, 7000 NULL, pusedDefaultChar); 7001 if (outsize <= 0) 7002 goto error; 7003 /* If we used a default char, then we failed! 
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * Encodes the insize characters of unicode starting at unicode_offset, one
 * code point at a time. The bytes are appended to *outbytes, which is
 * created here when it is NULL and extended otherwise.
 *
 * When errors is NULL or "strict" the function never encodes anything: it
 * builds a UnicodeEncodeError, hands it to PyCodec_StrictErrors() and
 * returns -1.
 *
 * Returns 0 on success and -1 on error (the failing call is expected to
 * have set an exception).
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    /* encoding_obj is released on every exit path below */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* No "used default char" flag is requested for UTF-8 and UTF-7 */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Budget sizeof(buffer) output bytes per input character, refusing
       sizes whose product would overflow Py_ssize_t */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        /* start writing after the n bytes that were already present */
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Code points from 0x10000 up are passed to Windows as a
           surrogate pair */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Accept the bytes unless the default-character flag was set;
               in that case fall through to the error handler below */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            /* Any failure other than "no translation" is reported as a
               Windows error */
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character at pos could not be encoded: ask the error handler
           for a replacement and for the position to resume from */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied to the output verbatim */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Resize the output by (replacement length - 1); out is
                   re-derived from its offset because the resize may move
                   the buffer */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* Otherwise rep is read as a str; only characters up to 127
               are accepted, each written as a single byte */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* same resize-and-rebase step as for a bytes replacement */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* NOTE(review): pos already holds newpos here, so the
                       reported range is relative to the resume position,
                       not to the character that failed -- confirm this is
                       intended */
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the output to the number of bytes actually written */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Shared exit: reached with ret == 0 on success, ret == -1 otherwise */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
/* Decode the size bytes at s through a str lookup table and append the
   result to writer.

   Each input byte b is translated to the character stored at index b of
   mapping. A byte that lies beyond the end of the table, or whose entry is
   U+FFFE, is treated as undefined and passed to the error handler selected
   by errors.

   Returns 0 on success, -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* x exceeds the writer's current maxchar: have the writer
                   prepared for characters up to 0xff, then reload the
                   cached maxchar and data pointer from it */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL on this path, so there is
           nothing to release */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a full 2-byte table. They store straight
               into the writer's buffer and leave via "goto Error" as soon
               as a value cannot be stored that way; x and s then describe
               the offending byte. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: one byte per outer-loop iteration */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
        /* Reached by falling through with the x just looked up, or by a
           jump from the tight loops above with the x they could not store */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is passed by address to the error handler call and is not
               advanced here */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
*/ 7485 PyErr_Clear(); 7486 goto Undefined; 7487 } else 7488 goto onError; 7489 } 7490 7491 /* Apply mapping */ 7492 if (item == Py_None) 7493 goto Undefined; 7494 if (PyLong_Check(item)) { 7495 long value = PyLong_AS_LONG(item); 7496 if (value == 0xFFFE) 7497 goto Undefined; 7498 if (value < 0 || value > MAX_UNICODE) { 7499 PyErr_Format(PyExc_TypeError, 7500 "character mapping must be in range(0x%lx)", 7501 (unsigned long)MAX_UNICODE + 1); 7502 goto onError; 7503 } 7504 7505 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7506 goto onError; 7507 } 7508 else if (PyUnicode_Check(item)) { 7509 if (PyUnicode_READY(item) == -1) 7510 goto onError; 7511 if (PyUnicode_GET_LENGTH(item) == 1) { 7512 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 7513 if (value == 0xFFFE) 7514 goto Undefined; 7515 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7516 goto onError; 7517 } 7518 else { 7519 writer->overallocate = 1; 7520 if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 7521 goto onError; 7522 } 7523 } 7524 else { 7525 /* wrong return value */ 7526 PyErr_SetString(PyExc_TypeError, 7527 "character mapping must return integer, None or str"); 7528 goto onError; 7529 } 7530 Py_CLEAR(item); 7531 ++s; 7532 continue; 7533 7534Undefined: 7535 /* undefined mapping */ 7536 Py_CLEAR(item); 7537 startinpos = s-starts; 7538 endinpos = startinpos+1; 7539 if (unicode_decode_call_errorhandler_writer( 7540 errors, &errorHandler, 7541 "charmap", "character maps to <undefined>", 7542 &starts, &e, &startinpos, &endinpos, &exc, &s, 7543 writer)) { 7544 goto onError; 7545 } 7546 } 7547 Py_XDECREF(errorHandler); 7548 Py_XDECREF(exc); 7549 return 0; 7550 7551onError: 7552 Py_XDECREF(item); 7553 Py_XDECREF(errorHandler); 7554 Py_XDECREF(exc); 7555 return -1; 7556} 7557 7558PyObject * 7559PyUnicode_DecodeCharmap(const char *s, 7560 Py_ssize_t size, 7561 PyObject *mapping, 7562 const char *errors) 7563{ 7564 _PyUnicodeWriter writer; 7565 7566 /* Default to Latin-1 */ 7567 if (mapping == 
NULL) 7568 return PyUnicode_DecodeLatin1(s, size, errors); 7569 7570 if (size == 0) 7571 _Py_RETURN_UNICODE_EMPTY(); 7572 _PyUnicodeWriter_Init(&writer); 7573 writer.min_length = size; 7574 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 7575 goto onError; 7576 7577 if (PyUnicode_CheckExact(mapping)) { 7578 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 7579 goto onError; 7580 } 7581 else { 7582 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 7583 goto onError; 7584 } 7585 return _PyUnicodeWriter_Finish(&writer); 7586 7587 onError: 7588 _PyUnicodeWriter_Dealloc(&writer); 7589 return NULL; 7590} 7591 7592/* Charmap encoding: the lookup table */ 7593 7594struct encoding_map { 7595 PyObject_HEAD 7596 unsigned char level1[32]; 7597 int count2, count3; 7598 unsigned char level23[1]; 7599}; 7600 7601static PyObject* 7602encoding_map_size(PyObject *obj, PyObject* args) 7603{ 7604 struct encoding_map *map = (struct encoding_map*)obj; 7605 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7606 128*map->count3); 7607} 7608 7609static PyMethodDef encoding_map_methods[] = { 7610 {"size", encoding_map_size, METH_NOARGS, 7611 PyDoc_STR("Return the size (in bytes) of this object") }, 7612 { 0 } 7613}; 7614 7615static void 7616encoding_map_dealloc(PyObject* o) 7617{ 7618 PyObject_FREE(o); 7619} 7620 7621static PyTypeObject EncodingMapType = { 7622 PyVarObject_HEAD_INIT(NULL, 0) 7623 "EncodingMap", /*tp_name*/ 7624 sizeof(struct encoding_map), /*tp_basicsize*/ 7625 0, /*tp_itemsize*/ 7626 /* methods */ 7627 encoding_map_dealloc, /*tp_dealloc*/ 7628 0, /*tp_print*/ 7629 0, /*tp_getattr*/ 7630 0, /*tp_setattr*/ 7631 0, /*tp_reserved*/ 7632 0, /*tp_repr*/ 7633 0, /*tp_as_number*/ 7634 0, /*tp_as_sequence*/ 7635 0, /*tp_as_mapping*/ 7636 0, /*tp_hash*/ 7637 0, /*tp_call*/ 7638 0, /*tp_str*/ 7639 0, /*tp_getattro*/ 7640 0, /*tp_setattro*/ 7641 0, /*tp_as_buffer*/ 7642 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7643 0, /*tp_doc*/ 
/* Build an encoding table from a decoding table.

   string is a str whose character at index i is the character that byte i
   decodes to; only the first 256 characters are considered. The result
   maps characters back to byte values and is either

   - an EncodingMap object (a compact three-level trie), or
   - a dict {code point: byte value}, used when index 0 is not U+0000, when
     a later entry is U+0000 or lies outside the BMP, or when the trie
     would need 0xFF or more second- or third-level blocks.

   Returns a new reference, or NULL with an exception set on error.

   NOTE(review): PyUnicode_READY() is not called here; presumably callers
   pass a ready string -- confirm. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* Scratch tables for the sizing pass; 0xFF marks "no block yet" */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;
    int kind;
    void *data;
    Py_ssize_t length;
    Py_UCS4 ch;

    /* An empty string is rejected as well as a non-str object */
    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    length = PyUnicode_GET_LENGTH(string);
    length = Py_MIN(length, 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0)
        need_dict = 1;
    /* Sizing pass: count the distinct second-level (ch >> 11) and
       third-level (ch >> 7) blocks the trie will need */
    for (i = 1; i < length; i++) {
        int l1, l2;
        ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* Block numbers are stored in an unsigned char and 0xFF is the
       "unused" marker, so counts that reach 0xFF cannot be represented */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Fallback: plain dict from code point to byte value */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < length; i++) {
            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
            value = PyLong_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        /* either of key/value may be NULL here, hence XDECREF */
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* The allocation covers the struct plus 16 bytes per second-level
       block and 128 bytes per third-level block; the -1 accounts for the
       one-byte level23 placeholder already inside the struct */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    /* third-level blocks are stored right after the second-level blocks */
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    /* third-level entries start out as 0 and are overwritten with the byte
       value of each mapped character below */
    memset(mlevel3, 0, 128*count3);
    /* Fill pass: third-level block numbers are handed out again from 0, in
       order of first use */
    count3 = 0;
    for (i = 1; i < length; i++) {
        int o1, o2, o3, i2, i3;
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        /* ch is split as: bits 11 and up -> level 1 index, bits 7..10 ->
           offset inside the second-level block, bits 0..6 -> offset inside
           the third-level block */
        o1 = ch>>11;
        o2 = (ch>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = ch & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
*/ 7822 PyErr_Clear(); 7823 x = Py_None; 7824 Py_INCREF(x); 7825 return x; 7826 } else 7827 return NULL; 7828 } 7829 else if (x == Py_None) 7830 return x; 7831 else if (PyLong_Check(x)) { 7832 long value = PyLong_AS_LONG(x); 7833 if (value < 0 || value > 255) { 7834 PyErr_SetString(PyExc_TypeError, 7835 "character mapping must be in range(256)"); 7836 Py_DECREF(x); 7837 return NULL; 7838 } 7839 return x; 7840 } 7841 else if (PyBytes_Check(x)) 7842 return x; 7843 else { 7844 /* wrong return value */ 7845 PyErr_Format(PyExc_TypeError, 7846 "character mapping must return integer, bytes or None, not %.400s", 7847 x->ob_type->tp_name); 7848 Py_DECREF(x); 7849 return NULL; 7850 } 7851} 7852 7853static int 7854charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7855{ 7856 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7857 /* exponentially overallocate to minimize reallocations */ 7858 if (requiredsize < 2*outsize) 7859 requiredsize = 2*outsize; 7860 if (_PyBytes_Resize(outobj, requiredsize)) 7861 return -1; 7862 return 0; 7863} 7864 7865typedef enum charmapencode_result { 7866 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7867} charmapencode_result; 7868/* lookup the character, put the result in the output string and adjust 7869 various state variables. Resize the output bytes object if not enough 7870 space is available. Return a new reference to the object that 7871 was put in the output buffer, or Py_None, if the mapping was undefined 7872 (in which case no character was written) or NULL, if a 7873 reallocation error occurred. 
/* handle an error in PyUnicode_EncodeCharmap

   Called when the character at *inpos has no mapping. The run of
   consecutive unencodable characters starting there is collected and then
   handled according to errors; the replacement is written to *res at
   *respos and *inpos is moved to where encoding should resume.

   *known_errorHandler caches the result of comparing errors against the
   built-in handler names: -1=not initialized, 0=unknown, 1=strict,
   2=replace, 3=ignore, 4=xmlcharrefreplace.

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            /* EncodingMap objects are queried directly */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        /* Any other mapping: Py_None means "still unencodable", NULL means
           the lookup itself failed */
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*known_errorHandler==-1) {
        if ((errors==NULL) || (!strcmp(errors, "strict")))
            *known_errorHandler = 1;
        else if (!strcmp(errors, "replace"))
            *known_errorHandler = 2;
        else if (!strcmp(errors, "ignore"))
            *known_errorHandler = 3;
        else if (!strcmp(errors, "xmlcharrefreplace"))
            *known_errorHandler = 4;
        else
            *known_errorHandler = 0;
    }
    switch (*known_errorHandler) {
    case 1: /* strict */
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;
    case 2: /* replace */
        /* one '?' per unencodable character; '?' itself goes through the
           mapping and the whole run is reported if it has no mapping */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case 3: /* ignore */
        /* skip the whole run */
        *inpos = collendpos;
        break;
    case 4: /* xmlcharrefreplace */
        /* generate the "&#NNN;" replacement for each character of the run
           and push it through the mapping one character at a time */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;
    default:
        /* any other handler name: call the registered error callback; it
           supplies both the replacement and the position to resume from */
        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement */
        /* a str replacement is itself encoded through the mapping,
           character by character */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
PyUnicode_GET_LENGTH(unicode); 8108 data = PyUnicode_DATA(unicode); 8109 kind = PyUnicode_KIND(unicode); 8110 8111 /* Default to Latin-1 */ 8112 if (mapping == NULL) 8113 return unicode_encode_ucs1(unicode, errors, 256); 8114 8115 /* allocate enough for a simple encoding without 8116 replacements, if we need more, we'll resize */ 8117 res = PyBytes_FromStringAndSize(NULL, size); 8118 if (res == NULL) 8119 goto onError; 8120 if (size == 0) 8121 return res; 8122 8123 while (inpos<size) { 8124 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8125 /* try to encode it */ 8126 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8127 if (x==enc_EXCEPTION) /* error */ 8128 goto onError; 8129 if (x==enc_FAILED) { /* unencodable character */ 8130 if (charmap_encoding_error(unicode, &inpos, mapping, 8131 &exc, 8132 &known_errorHandler, &errorHandler, errors, 8133 &res, &respos)) { 8134 goto onError; 8135 } 8136 } 8137 else 8138 /* done with this character => adjust input position */ 8139 ++inpos; 8140 } 8141 8142 /* Resize if we allocated to much */ 8143 if (respos<PyBytes_GET_SIZE(res)) 8144 if (_PyBytes_Resize(&res, respos) < 0) 8145 goto onError; 8146 8147 Py_XDECREF(exc); 8148 Py_XDECREF(errorHandler); 8149 return res; 8150 8151 onError: 8152 Py_XDECREF(res); 8153 Py_XDECREF(exc); 8154 Py_XDECREF(errorHandler); 8155 return NULL; 8156} 8157 8158/* Deprecated */ 8159PyObject * 8160PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8161 Py_ssize_t size, 8162 PyObject *mapping, 8163 const char *errors) 8164{ 8165 PyObject *result; 8166 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8167 if (unicode == NULL) 8168 return NULL; 8169 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8170 Py_DECREF(unicode); 8171 return result; 8172} 8173 8174PyObject * 8175PyUnicode_AsCharmapString(PyObject *unicode, 8176 PyObject *mapping) 8177{ 8178 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8179 PyErr_BadArgument(); 8180 return NULL; 8181 } 8182 return 
_PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8183} 8184 8185/* create or adjust a UnicodeTranslateError */ 8186static void 8187make_translate_exception(PyObject **exceptionObject, 8188 PyObject *unicode, 8189 Py_ssize_t startpos, Py_ssize_t endpos, 8190 const char *reason) 8191{ 8192 if (*exceptionObject == NULL) { 8193 *exceptionObject = _PyUnicodeTranslateError_Create( 8194 unicode, startpos, endpos, reason); 8195 } 8196 else { 8197 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8198 goto onError; 8199 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8200 goto onError; 8201 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8202 goto onError; 8203 return; 8204 onError: 8205 Py_DECREF(*exceptionObject); 8206 *exceptionObject = NULL; 8207 } 8208} 8209 8210/* error handling callback helper: 8211 build arguments, call the callback and check the arguments, 8212 put the result into newpos and return the replacement string, which 8213 has to be freed by the caller */ 8214static PyObject * 8215unicode_translate_call_errorhandler(const char *errors, 8216 PyObject **errorHandler, 8217 const char *reason, 8218 PyObject *unicode, PyObject **exceptionObject, 8219 Py_ssize_t startpos, Py_ssize_t endpos, 8220 Py_ssize_t *newpos) 8221{ 8222 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8223 8224 Py_ssize_t i_newpos; 8225 PyObject *restuple; 8226 PyObject *resunicode; 8227 8228 if (*errorHandler == NULL) { 8229 *errorHandler = PyCodec_LookupError(errors); 8230 if (*errorHandler == NULL) 8231 return NULL; 8232 } 8233 8234 make_translate_exception(exceptionObject, 8235 unicode, startpos, endpos, reason); 8236 if (*exceptionObject == NULL) 8237 return NULL; 8238 8239 restuple = PyObject_CallFunctionObjArgs( 8240 *errorHandler, *exceptionObject, NULL); 8241 if (restuple == NULL) 8242 return NULL; 8243 if (!PyTuple_Check(restuple)) { 8244 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8245 
Py_DECREF(restuple); 8246 return NULL; 8247 } 8248 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8249 &resunicode, &i_newpos)) { 8250 Py_DECREF(restuple); 8251 return NULL; 8252 } 8253 if (i_newpos<0) 8254 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8255 else 8256 *newpos = i_newpos; 8257 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8258 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8259 Py_DECREF(restuple); 8260 return NULL; 8261 } 8262 Py_INCREF(resunicode); 8263 Py_DECREF(restuple); 8264 return resunicode; 8265} 8266 8267/* Lookup the character ch in the mapping and put the result in result, 8268 which must be decrefed by the caller. 8269 Return 0 on success, -1 on error */ 8270static int 8271charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8272{ 8273 PyObject *w = PyLong_FromLong((long)c); 8274 PyObject *x; 8275 8276 if (w == NULL) 8277 return -1; 8278 x = PyObject_GetItem(mapping, w); 8279 Py_DECREF(w); 8280 if (x == NULL) { 8281 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8282 /* No mapping found means: use 1:1 mapping. 
*/ 8283 PyErr_Clear(); 8284 *result = NULL; 8285 return 0; 8286 } else 8287 return -1; 8288 } 8289 else if (x == Py_None) { 8290 *result = x; 8291 return 0; 8292 } 8293 else if (PyLong_Check(x)) { 8294 long value = PyLong_AS_LONG(x); 8295 long max = PyUnicode_GetMax(); 8296 if (value < 0 || value > max) { 8297 PyErr_Format(PyExc_TypeError, 8298 "character mapping must be in range(0x%x)", max+1); 8299 Py_DECREF(x); 8300 return -1; 8301 } 8302 *result = x; 8303 return 0; 8304 } 8305 else if (PyUnicode_Check(x)) { 8306 *result = x; 8307 return 0; 8308 } 8309 else { 8310 /* wrong return value */ 8311 PyErr_SetString(PyExc_TypeError, 8312 "character mapping must return integer, None or str"); 8313 Py_DECREF(x); 8314 return -1; 8315 } 8316} 8317/* ensure that *outobj is at least requiredsize characters long, 8318 if not reallocate and adjust various state variables. 8319 Return 0 on success, -1 on error */ 8320static int 8321charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8322 Py_ssize_t requiredsize) 8323{ 8324 Py_ssize_t oldsize = *psize; 8325 Py_UCS4 *new_outobj; 8326 if (requiredsize > oldsize) { 8327 /* exponentially overallocate to minimize reallocations */ 8328 if (requiredsize < 2 * oldsize) 8329 requiredsize = 2 * oldsize; 8330 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8331 if (new_outobj == 0) 8332 return -1; 8333 *outobj = new_outobj; 8334 *psize = requiredsize; 8335 } 8336 return 0; 8337} 8338/* lookup the character, put the result in the output string and adjust 8339 various state variables. Return a new reference to the object that 8340 was put in the output buffer in *result, or Py_None, if the mapping was 8341 undefined (in which case no character was written). 8342 The called must decref result. 8343 Return 0 on success, -1 on error. 
*/ 8344static int 8345charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8346 PyObject *mapping, Py_UCS4 **output, 8347 Py_ssize_t *osize, Py_ssize_t *opos, 8348 PyObject **res) 8349{ 8350 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8351 if (charmaptranslate_lookup(curinp, mapping, res)) 8352 return -1; 8353 if (*res==NULL) { 8354 /* not found => default to 1:1 mapping */ 8355 (*output)[(*opos)++] = curinp; 8356 } 8357 else if (*res==Py_None) 8358 ; 8359 else if (PyLong_Check(*res)) { 8360 /* no overflow check, because we know that the space is enough */ 8361 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8362 } 8363 else if (PyUnicode_Check(*res)) { 8364 Py_ssize_t repsize; 8365 if (PyUnicode_READY(*res) == -1) 8366 return -1; 8367 repsize = PyUnicode_GET_LENGTH(*res); 8368 if (repsize==1) { 8369 /* no overflow check, because we know that the space is enough */ 8370 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8371 } 8372 else if (repsize!=0) { 8373 /* more than one character */ 8374 Py_ssize_t requiredsize = *opos + 8375 (PyUnicode_GET_LENGTH(input) - ipos) + 8376 repsize - 1; 8377 Py_ssize_t i; 8378 if (charmaptranslate_makespace(output, osize, requiredsize)) 8379 return -1; 8380 for(i = 0; i < repsize; i++) 8381 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8382 } 8383 } 8384 else 8385 return -1; 8386 return 0; 8387} 8388 8389PyObject * 8390_PyUnicode_TranslateCharmap(PyObject *input, 8391 PyObject *mapping, 8392 const char *errors) 8393{ 8394 /* input object */ 8395 char *idata; 8396 Py_ssize_t size, i; 8397 int kind; 8398 /* output buffer */ 8399 Py_UCS4 *output = NULL; 8400 Py_ssize_t osize; 8401 PyObject *res; 8402 /* current output position */ 8403 Py_ssize_t opos; 8404 char *reason = "character maps to <undefined>"; 8405 PyObject *errorHandler = NULL; 8406 PyObject *exc = NULL; 8407 /* the following variable is used for caching string comparisons 8408 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8409 * 
3=ignore, 4=xmlcharrefreplace */ 8410 int known_errorHandler = -1; 8411 8412 if (mapping == NULL) { 8413 PyErr_BadArgument(); 8414 return NULL; 8415 } 8416 8417 if (PyUnicode_READY(input) == -1) 8418 return NULL; 8419 idata = (char*)PyUnicode_DATA(input); 8420 kind = PyUnicode_KIND(input); 8421 size = PyUnicode_GET_LENGTH(input); 8422 i = 0; 8423 8424 if (size == 0) { 8425 Py_INCREF(input); 8426 return input; 8427 } 8428 8429 /* allocate enough for a simple 1:1 translation without 8430 replacements, if we need more, we'll resize */ 8431 osize = size; 8432 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8433 opos = 0; 8434 if (output == NULL) { 8435 PyErr_NoMemory(); 8436 goto onError; 8437 } 8438 8439 while (i<size) { 8440 /* try to encode it */ 8441 PyObject *x = NULL; 8442 if (charmaptranslate_output(input, i, mapping, 8443 &output, &osize, &opos, &x)) { 8444 Py_XDECREF(x); 8445 goto onError; 8446 } 8447 Py_XDECREF(x); 8448 if (x!=Py_None) /* it worked => adjust input pointer */ 8449 ++i; 8450 else { /* untranslatable character */ 8451 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8452 Py_ssize_t repsize; 8453 Py_ssize_t newpos; 8454 Py_ssize_t uni2; 8455 /* startpos for collecting untranslatable chars */ 8456 Py_ssize_t collstart = i; 8457 Py_ssize_t collend = i+1; 8458 Py_ssize_t coll; 8459 8460 /* find all untranslatable characters */ 8461 while (collend < size) { 8462 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8463 goto onError; 8464 Py_XDECREF(x); 8465 if (x!=Py_None) 8466 break; 8467 ++collend; 8468 } 8469 /* cache callback name lookup 8470 * (if not done yet, i.e. 
it's the first error) */ 8471 if (known_errorHandler==-1) { 8472 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8473 known_errorHandler = 1; 8474 else if (!strcmp(errors, "replace")) 8475 known_errorHandler = 2; 8476 else if (!strcmp(errors, "ignore")) 8477 known_errorHandler = 3; 8478 else if (!strcmp(errors, "xmlcharrefreplace")) 8479 known_errorHandler = 4; 8480 else 8481 known_errorHandler = 0; 8482 } 8483 switch (known_errorHandler) { 8484 case 1: /* strict */ 8485 make_translate_exception(&exc, 8486 input, collstart, collend, reason); 8487 if (exc != NULL) 8488 PyCodec_StrictErrors(exc); 8489 goto onError; 8490 case 2: /* replace */ 8491 /* No need to check for space, this is a 1:1 replacement */ 8492 for (coll = collstart; coll<collend; coll++) 8493 output[opos++] = '?'; 8494 /* fall through */ 8495 case 3: /* ignore */ 8496 i = collend; 8497 break; 8498 case 4: /* xmlcharrefreplace */ 8499 /* generate replacement (temporarily (mis)uses i) */ 8500 for (i = collstart; i < collend; ++i) { 8501 char buffer[2+29+1+1]; 8502 char *cp; 8503 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8504 if (charmaptranslate_makespace(&output, &osize, 8505 opos+strlen(buffer)+(size-collend))) 8506 goto onError; 8507 for (cp = buffer; *cp; ++cp) 8508 output[opos++] = *cp; 8509 } 8510 i = collend; 8511 break; 8512 default: 8513 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8514 reason, input, &exc, 8515 collstart, collend, &newpos); 8516 if (repunicode == NULL) 8517 goto onError; 8518 if (PyUnicode_READY(repunicode) == -1) { 8519 Py_DECREF(repunicode); 8520 goto onError; 8521 } 8522 /* generate replacement */ 8523 repsize = PyUnicode_GET_LENGTH(repunicode); 8524 if (charmaptranslate_makespace(&output, &osize, 8525 opos+repsize+(size-collend))) { 8526 Py_DECREF(repunicode); 8527 goto onError; 8528 } 8529 for (uni2 = 0; repsize-->0; ++uni2) 8530 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8531 i = newpos; 8532 
Py_DECREF(repunicode); 8533 } 8534 } 8535 } 8536 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8537 if (!res) 8538 goto onError; 8539 PyMem_Free(output); 8540 Py_XDECREF(exc); 8541 Py_XDECREF(errorHandler); 8542 return res; 8543 8544 onError: 8545 PyMem_Free(output); 8546 Py_XDECREF(exc); 8547 Py_XDECREF(errorHandler); 8548 return NULL; 8549} 8550 8551/* Deprecated. Use PyUnicode_Translate instead. */ 8552PyObject * 8553PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8554 Py_ssize_t size, 8555 PyObject *mapping, 8556 const char *errors) 8557{ 8558 PyObject *result; 8559 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8560 if (!unicode) 8561 return NULL; 8562 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8563 Py_DECREF(unicode); 8564 return result; 8565} 8566 8567PyObject * 8568PyUnicode_Translate(PyObject *str, 8569 PyObject *mapping, 8570 const char *errors) 8571{ 8572 PyObject *result; 8573 8574 str = PyUnicode_FromObject(str); 8575 if (str == NULL) 8576 return NULL; 8577 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8578 Py_DECREF(str); 8579 return result; 8580} 8581 8582static Py_UCS4 8583fix_decimal_and_space_to_ascii(PyObject *self) 8584{ 8585 /* No need to call PyUnicode_READY(self) because this function is only 8586 called as a callback from fixup() which does it already. 
*/ 8587 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8588 const int kind = PyUnicode_KIND(self); 8589 void *data = PyUnicode_DATA(self); 8590 Py_UCS4 maxchar = 127, ch, fixed; 8591 int modified = 0; 8592 Py_ssize_t i; 8593 8594 for (i = 0; i < len; ++i) { 8595 ch = PyUnicode_READ(kind, data, i); 8596 fixed = 0; 8597 if (ch > 127) { 8598 if (Py_UNICODE_ISSPACE(ch)) 8599 fixed = ' '; 8600 else { 8601 const int decimal = Py_UNICODE_TODECIMAL(ch); 8602 if (decimal >= 0) 8603 fixed = '0' + decimal; 8604 } 8605 if (fixed != 0) { 8606 modified = 1; 8607 maxchar = Py_MAX(maxchar, fixed); 8608 PyUnicode_WRITE(kind, data, i, fixed); 8609 } 8610 else 8611 maxchar = Py_MAX(maxchar, ch); 8612 } 8613 } 8614 8615 return (modified) ? maxchar : 0; 8616} 8617 8618PyObject * 8619_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8620{ 8621 if (!PyUnicode_Check(unicode)) { 8622 PyErr_BadInternalCall(); 8623 return NULL; 8624 } 8625 if (PyUnicode_READY(unicode) == -1) 8626 return NULL; 8627 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8628 /* If the string is already ASCII, just return the same string */ 8629 Py_INCREF(unicode); 8630 return unicode; 8631 } 8632 return fixup(unicode, fix_decimal_and_space_to_ascii); 8633} 8634 8635PyObject * 8636PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8637 Py_ssize_t length) 8638{ 8639 PyObject *decimal; 8640 Py_ssize_t i; 8641 Py_UCS4 maxchar; 8642 enum PyUnicode_Kind kind; 8643 void *data; 8644 8645 maxchar = 127; 8646 for (i = 0; i < length; i++) { 8647 Py_UNICODE ch = s[i]; 8648 if (ch > 127) { 8649 int decimal = Py_UNICODE_TODECIMAL(ch); 8650 if (decimal >= 0) 8651 ch = '0' + decimal; 8652 maxchar = Py_MAX(maxchar, ch); 8653 } 8654 } 8655 8656 /* Copy to a new string */ 8657 decimal = PyUnicode_New(length, maxchar); 8658 if (decimal == NULL) 8659 return decimal; 8660 kind = PyUnicode_KIND(decimal); 8661 data = PyUnicode_DATA(decimal); 8662 /* Iterate over code points */ 8663 for (i = 0; i < length; i++) { 8664 
Py_UNICODE ch = s[i]; 8665 if (ch > 127) { 8666 int decimal = Py_UNICODE_TODECIMAL(ch); 8667 if (decimal >= 0) 8668 ch = '0' + decimal; 8669 } 8670 PyUnicode_WRITE(kind, data, i, ch); 8671 } 8672 return unicode_result(decimal); 8673} 8674/* --- Decimal Encoder ---------------------------------------------------- */ 8675 8676int 8677PyUnicode_EncodeDecimal(Py_UNICODE *s, 8678 Py_ssize_t length, 8679 char *output, 8680 const char *errors) 8681{ 8682 PyObject *unicode; 8683 Py_ssize_t i; 8684 enum PyUnicode_Kind kind; 8685 void *data; 8686 8687 if (output == NULL) { 8688 PyErr_BadArgument(); 8689 return -1; 8690 } 8691 8692 unicode = PyUnicode_FromUnicode(s, length); 8693 if (unicode == NULL) 8694 return -1; 8695 8696 if (PyUnicode_READY(unicode) == -1) { 8697 Py_DECREF(unicode); 8698 return -1; 8699 } 8700 kind = PyUnicode_KIND(unicode); 8701 data = PyUnicode_DATA(unicode); 8702 8703 for (i=0; i < length; ) { 8704 PyObject *exc; 8705 Py_UCS4 ch; 8706 int decimal; 8707 Py_ssize_t startpos; 8708 8709 ch = PyUnicode_READ(kind, data, i); 8710 8711 if (Py_UNICODE_ISSPACE(ch)) { 8712 *output++ = ' '; 8713 i++; 8714 continue; 8715 } 8716 decimal = Py_UNICODE_TODECIMAL(ch); 8717 if (decimal >= 0) { 8718 *output++ = '0' + decimal; 8719 i++; 8720 continue; 8721 } 8722 if (0 < ch && ch < 256) { 8723 *output++ = (char)ch; 8724 i++; 8725 continue; 8726 } 8727 8728 startpos = i; 8729 exc = NULL; 8730 raise_encode_exception(&exc, "decimal", unicode, 8731 startpos, startpos+1, 8732 "invalid decimal Unicode string"); 8733 Py_XDECREF(exc); 8734 Py_DECREF(unicode); 8735 return -1; 8736 } 8737 /* 0-terminate the output string */ 8738 *output++ = '\0'; 8739 Py_DECREF(unicode); 8740 return 0; 8741} 8742 8743/* --- Helpers ------------------------------------------------------------ */ 8744 8745static Py_ssize_t 8746any_find_slice(int direction, PyObject* s1, PyObject* s2, 8747 Py_ssize_t start, 8748 Py_ssize_t end) 8749{ 8750 int kind1, kind2, kind; 8751 void *buf1, *buf2; 8752 Py_ssize_t 
len1, len2, result; 8753 8754 kind1 = PyUnicode_KIND(s1); 8755 kind2 = PyUnicode_KIND(s2); 8756 kind = kind1 > kind2 ? kind1 : kind2; 8757 buf1 = PyUnicode_DATA(s1); 8758 buf2 = PyUnicode_DATA(s2); 8759 if (kind1 != kind) 8760 buf1 = _PyUnicode_AsKind(s1, kind); 8761 if (!buf1) 8762 return -2; 8763 if (kind2 != kind) 8764 buf2 = _PyUnicode_AsKind(s2, kind); 8765 if (!buf2) { 8766 if (kind1 != kind) PyMem_Free(buf1); 8767 return -2; 8768 } 8769 len1 = PyUnicode_GET_LENGTH(s1); 8770 len2 = PyUnicode_GET_LENGTH(s2); 8771 8772 if (direction > 0) { 8773 switch (kind) { 8774 case PyUnicode_1BYTE_KIND: 8775 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8776 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 8777 else 8778 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 8779 break; 8780 case PyUnicode_2BYTE_KIND: 8781 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 8782 break; 8783 case PyUnicode_4BYTE_KIND: 8784 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 8785 break; 8786 default: 8787 assert(0); result = -2; 8788 } 8789 } 8790 else { 8791 switch (kind) { 8792 case PyUnicode_1BYTE_KIND: 8793 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8794 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 8795 else 8796 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8797 break; 8798 case PyUnicode_2BYTE_KIND: 8799 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8800 break; 8801 case PyUnicode_4BYTE_KIND: 8802 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8803 break; 8804 default: 8805 assert(0); result = -2; 8806 } 8807 } 8808 8809 if (kind1 != kind) 8810 PyMem_Free(buf1); 8811 if (kind2 != kind) 8812 PyMem_Free(buf2); 8813 8814 return result; 8815} 8816 8817Py_ssize_t 8818_PyUnicode_InsertThousandsGrouping( 8819 PyObject *unicode, Py_ssize_t index, 8820 Py_ssize_t n_buffer, 8821 void *digits, Py_ssize_t n_digits, 8822 Py_ssize_t 
min_width, 8823 const char *grouping, PyObject *thousands_sep, 8824 Py_UCS4 *maxchar) 8825{ 8826 unsigned int kind, thousands_sep_kind; 8827 char *data, *thousands_sep_data; 8828 Py_ssize_t thousands_sep_len; 8829 Py_ssize_t len; 8830 8831 if (unicode != NULL) { 8832 kind = PyUnicode_KIND(unicode); 8833 data = (char *) PyUnicode_DATA(unicode) + index * kind; 8834 } 8835 else { 8836 kind = PyUnicode_1BYTE_KIND; 8837 data = NULL; 8838 } 8839 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 8840 thousands_sep_data = PyUnicode_DATA(thousands_sep); 8841 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 8842 if (unicode != NULL && thousands_sep_kind != kind) { 8843 if (thousands_sep_kind < kind) { 8844 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 8845 if (!thousands_sep_data) 8846 return -1; 8847 } 8848 else { 8849 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 8850 if (!data) 8851 return -1; 8852 } 8853 } 8854 8855 switch (kind) { 8856 case PyUnicode_1BYTE_KIND: 8857 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 8858 len = asciilib_InsertThousandsGrouping( 8859 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 8860 min_width, grouping, 8861 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8862 else 8863 len = ucs1lib_InsertThousandsGrouping( 8864 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8865 min_width, grouping, 8866 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8867 break; 8868 case PyUnicode_2BYTE_KIND: 8869 len = ucs2lib_InsertThousandsGrouping( 8870 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 8871 min_width, grouping, 8872 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 8873 break; 8874 case PyUnicode_4BYTE_KIND: 8875 len = ucs4lib_InsertThousandsGrouping( 8876 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 8877 min_width, grouping, 8878 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 8879 break; 8880 default: 8881 assert(0); 8882 return -1; 8883 } 8884 if (unicode 
!= NULL && thousands_sep_kind != kind) { 8885 if (thousands_sep_kind < kind) 8886 PyMem_Free(thousands_sep_data); 8887 else 8888 PyMem_Free(data); 8889 } 8890 if (unicode == NULL) { 8891 *maxchar = 127; 8892 if (len != n_digits) { 8893 *maxchar = Py_MAX(*maxchar, 8894 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 8895 } 8896 } 8897 return len; 8898} 8899 8900 8901/* helper macro to fixup start/end slice values */ 8902#define ADJUST_INDICES(start, end, len) \ 8903 if (end > len) \ 8904 end = len; \ 8905 else if (end < 0) { \ 8906 end += len; \ 8907 if (end < 0) \ 8908 end = 0; \ 8909 } \ 8910 if (start < 0) { \ 8911 start += len; \ 8912 if (start < 0) \ 8913 start = 0; \ 8914 } 8915 8916Py_ssize_t 8917PyUnicode_Count(PyObject *str, 8918 PyObject *substr, 8919 Py_ssize_t start, 8920 Py_ssize_t end) 8921{ 8922 Py_ssize_t result; 8923 PyObject* str_obj; 8924 PyObject* sub_obj; 8925 int kind1, kind2, kind; 8926 void *buf1 = NULL, *buf2 = NULL; 8927 Py_ssize_t len1, len2; 8928 8929 str_obj = PyUnicode_FromObject(str); 8930 if (!str_obj) 8931 return -1; 8932 sub_obj = PyUnicode_FromObject(substr); 8933 if (!sub_obj) { 8934 Py_DECREF(str_obj); 8935 return -1; 8936 } 8937 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 8938 Py_DECREF(sub_obj); 8939 Py_DECREF(str_obj); 8940 return -1; 8941 } 8942 8943 kind1 = PyUnicode_KIND(str_obj); 8944 kind2 = PyUnicode_KIND(sub_obj); 8945 kind = kind1; 8946 buf1 = PyUnicode_DATA(str_obj); 8947 buf2 = PyUnicode_DATA(sub_obj); 8948 if (kind2 != kind) { 8949 if (kind2 > kind) { 8950 Py_DECREF(sub_obj); 8951 Py_DECREF(str_obj); 8952 return 0; 8953 } 8954 buf2 = _PyUnicode_AsKind(sub_obj, kind); 8955 } 8956 if (!buf2) 8957 goto onError; 8958 len1 = PyUnicode_GET_LENGTH(str_obj); 8959 len2 = PyUnicode_GET_LENGTH(sub_obj); 8960 8961 ADJUST_INDICES(start, end, len1); 8962 switch (kind) { 8963 case PyUnicode_1BYTE_KIND: 8964 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 8965 result = asciilib_count( 8966 
((Py_UCS1*)buf1) + start, end - start, 8967 buf2, len2, PY_SSIZE_T_MAX 8968 ); 8969 else 8970 result = ucs1lib_count( 8971 ((Py_UCS1*)buf1) + start, end - start, 8972 buf2, len2, PY_SSIZE_T_MAX 8973 ); 8974 break; 8975 case PyUnicode_2BYTE_KIND: 8976 result = ucs2lib_count( 8977 ((Py_UCS2*)buf1) + start, end - start, 8978 buf2, len2, PY_SSIZE_T_MAX 8979 ); 8980 break; 8981 case PyUnicode_4BYTE_KIND: 8982 result = ucs4lib_count( 8983 ((Py_UCS4*)buf1) + start, end - start, 8984 buf2, len2, PY_SSIZE_T_MAX 8985 ); 8986 break; 8987 default: 8988 assert(0); result = 0; 8989 } 8990 8991 Py_DECREF(sub_obj); 8992 Py_DECREF(str_obj); 8993 8994 if (kind2 != kind) 8995 PyMem_Free(buf2); 8996 8997 return result; 8998 onError: 8999 Py_DECREF(sub_obj); 9000 Py_DECREF(str_obj); 9001 if (kind2 != kind && buf2) 9002 PyMem_Free(buf2); 9003 return -1; 9004} 9005 9006Py_ssize_t 9007PyUnicode_Find(PyObject *str, 9008 PyObject *sub, 9009 Py_ssize_t start, 9010 Py_ssize_t end, 9011 int direction) 9012{ 9013 Py_ssize_t result; 9014 9015 str = PyUnicode_FromObject(str); 9016 if (!str) 9017 return -2; 9018 sub = PyUnicode_FromObject(sub); 9019 if (!sub) { 9020 Py_DECREF(str); 9021 return -2; 9022 } 9023 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 9024 Py_DECREF(sub); 9025 Py_DECREF(str); 9026 return -2; 9027 } 9028 9029 result = any_find_slice(direction, 9030 str, sub, start, end 9031 ); 9032 9033 Py_DECREF(str); 9034 Py_DECREF(sub); 9035 9036 return result; 9037} 9038 9039Py_ssize_t 9040PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9041 Py_ssize_t start, Py_ssize_t end, 9042 int direction) 9043{ 9044 int kind; 9045 Py_ssize_t result; 9046 if (PyUnicode_READY(str) == -1) 9047 return -2; 9048 if (start < 0 || end < 0) { 9049 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9050 return -2; 9051 } 9052 if (end > PyUnicode_GET_LENGTH(str)) 9053 end = PyUnicode_GET_LENGTH(str); 9054 kind = PyUnicode_KIND(str); 9055 result = findchar(PyUnicode_1BYTE_DATA(str) + 
kind*start, 9056 kind, end-start, ch, direction); 9057 if (result == -1) 9058 return -1; 9059 else 9060 return start + result; 9061} 9062 9063static int 9064tailmatch(PyObject *self, 9065 PyObject *substring, 9066 Py_ssize_t start, 9067 Py_ssize_t end, 9068 int direction) 9069{ 9070 int kind_self; 9071 int kind_sub; 9072 void *data_self; 9073 void *data_sub; 9074 Py_ssize_t offset; 9075 Py_ssize_t i; 9076 Py_ssize_t end_sub; 9077 9078 if (PyUnicode_READY(self) == -1 || 9079 PyUnicode_READY(substring) == -1) 9080 return -1; 9081 9082 if (PyUnicode_GET_LENGTH(substring) == 0) 9083 return 1; 9084 9085 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9086 end -= PyUnicode_GET_LENGTH(substring); 9087 if (end < start) 9088 return 0; 9089 9090 kind_self = PyUnicode_KIND(self); 9091 data_self = PyUnicode_DATA(self); 9092 kind_sub = PyUnicode_KIND(substring); 9093 data_sub = PyUnicode_DATA(substring); 9094 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9095 9096 if (direction > 0) 9097 offset = end; 9098 else 9099 offset = start; 9100 9101 if (PyUnicode_READ(kind_self, data_self, offset) == 9102 PyUnicode_READ(kind_sub, data_sub, 0) && 9103 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9104 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9105 /* If both are of the same kind, memcmp is sufficient */ 9106 if (kind_self == kind_sub) { 9107 return ! memcmp((char *)data_self + 9108 (offset * PyUnicode_KIND(substring)), 9109 data_sub, 9110 PyUnicode_GET_LENGTH(substring) * 9111 PyUnicode_KIND(substring)); 9112 } 9113 /* otherwise we have to compare each character by first accesing it */ 9114 else { 9115 /* We do not need to compare 0 and len(substring)-1 because 9116 the if statement above ensured already that they are equal 9117 when we end up here. 
*/ 9118 for (i = 1; i < end_sub; ++i) { 9119 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9120 PyUnicode_READ(kind_sub, data_sub, i)) 9121 return 0; 9122 } 9123 return 1; 9124 } 9125 } 9126 9127 return 0; 9128} 9129 9130Py_ssize_t 9131PyUnicode_Tailmatch(PyObject *str, 9132 PyObject *substr, 9133 Py_ssize_t start, 9134 Py_ssize_t end, 9135 int direction) 9136{ 9137 Py_ssize_t result; 9138 9139 str = PyUnicode_FromObject(str); 9140 if (str == NULL) 9141 return -1; 9142 substr = PyUnicode_FromObject(substr); 9143 if (substr == NULL) { 9144 Py_DECREF(str); 9145 return -1; 9146 } 9147 9148 result = tailmatch(str, substr, 9149 start, end, direction); 9150 Py_DECREF(str); 9151 Py_DECREF(substr); 9152 return result; 9153} 9154 9155/* Apply fixfct filter to the Unicode object self and return a 9156 reference to the modified object */ 9157 9158static PyObject * 9159fixup(PyObject *self, 9160 Py_UCS4 (*fixfct)(PyObject *s)) 9161{ 9162 PyObject *u; 9163 Py_UCS4 maxchar_old, maxchar_new = 0; 9164 PyObject *v; 9165 9166 u = _PyUnicode_Copy(self); 9167 if (u == NULL) 9168 return NULL; 9169 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9170 9171 /* fix functions return the new maximum character in a string, 9172 if the kind of the resulting unicode object does not change, 9173 everything is fine. Otherwise we need to change the string kind 9174 and re-run the fix function. */ 9175 maxchar_new = fixfct(u); 9176 9177 if (maxchar_new == 0) { 9178 /* no changes */; 9179 if (PyUnicode_CheckExact(self)) { 9180 Py_DECREF(u); 9181 Py_INCREF(self); 9182 return self; 9183 } 9184 else 9185 return u; 9186 } 9187 9188 maxchar_new = align_maxchar(maxchar_new); 9189 9190 if (maxchar_new == maxchar_old) 9191 return u; 9192 9193 /* In case the maximum character changed, we need to 9194 convert the string to the new category. 
*/ 9195 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9196 if (v == NULL) { 9197 Py_DECREF(u); 9198 return NULL; 9199 } 9200 if (maxchar_new > maxchar_old) { 9201 /* If the maxchar increased so that the kind changed, not all 9202 characters are representable anymore and we need to fix the 9203 string again. This only happens in very few cases. */ 9204 _PyUnicode_FastCopyCharacters(v, 0, 9205 self, 0, PyUnicode_GET_LENGTH(self)); 9206 maxchar_old = fixfct(v); 9207 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9208 } 9209 else { 9210 _PyUnicode_FastCopyCharacters(v, 0, 9211 u, 0, PyUnicode_GET_LENGTH(self)); 9212 } 9213 Py_DECREF(u); 9214 assert(_PyUnicode_CheckConsistency(v, 1)); 9215 return v; 9216} 9217 9218static PyObject * 9219ascii_upper_or_lower(PyObject *self, int lower) 9220{ 9221 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9222 char *resdata, *data = PyUnicode_DATA(self); 9223 PyObject *res; 9224 9225 res = PyUnicode_New(len, 127); 9226 if (res == NULL) 9227 return NULL; 9228 resdata = PyUnicode_DATA(res); 9229 if (lower) 9230 _Py_bytes_lower(resdata, data, len); 9231 else 9232 _Py_bytes_upper(resdata, data, len); 9233 return res; 9234} 9235 9236static Py_UCS4 9237handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9238{ 9239 Py_ssize_t j; 9240 int final_sigma; 9241 Py_UCS4 c; 9242 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9243 9244 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9245 9246 where ! is a negation and \p{xxx} is a character with property xxx. 
9247 */ 9248 for (j = i - 1; j >= 0; j--) { 9249 c = PyUnicode_READ(kind, data, j); 9250 if (!_PyUnicode_IsCaseIgnorable(c)) 9251 break; 9252 } 9253 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9254 if (final_sigma) { 9255 for (j = i + 1; j < length; j++) { 9256 c = PyUnicode_READ(kind, data, j); 9257 if (!_PyUnicode_IsCaseIgnorable(c)) 9258 break; 9259 } 9260 final_sigma = j == length || !_PyUnicode_IsCased(c); 9261 } 9262 return (final_sigma) ? 0x3C2 : 0x3C3; 9263} 9264 9265static int 9266lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9267 Py_UCS4 c, Py_UCS4 *mapped) 9268{ 9269 /* Obscure special case. */ 9270 if (c == 0x3A3) { 9271 mapped[0] = handle_capital_sigma(kind, data, length, i); 9272 return 1; 9273 } 9274 return _PyUnicode_ToLowerFull(c, mapped); 9275} 9276 9277static Py_ssize_t 9278do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9279{ 9280 Py_ssize_t i, k = 0; 9281 int n_res, j; 9282 Py_UCS4 c, mapped[3]; 9283 9284 c = PyUnicode_READ(kind, data, 0); 9285 n_res = _PyUnicode_ToUpperFull(c, mapped); 9286 for (j = 0; j < n_res; j++) { 9287 *maxchar = Py_MAX(*maxchar, mapped[j]); 9288 res[k++] = mapped[j]; 9289 } 9290 for (i = 1; i < length; i++) { 9291 c = PyUnicode_READ(kind, data, i); 9292 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9293 for (j = 0; j < n_res; j++) { 9294 *maxchar = Py_MAX(*maxchar, mapped[j]); 9295 res[k++] = mapped[j]; 9296 } 9297 } 9298 return k; 9299} 9300 9301static Py_ssize_t 9302do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9303 Py_ssize_t i, k = 0; 9304 9305 for (i = 0; i < length; i++) { 9306 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9307 int n_res, j; 9308 if (Py_UNICODE_ISUPPER(c)) { 9309 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9310 } 9311 else if (Py_UNICODE_ISLOWER(c)) { 9312 n_res = _PyUnicode_ToUpperFull(c, mapped); 9313 } 9314 else { 9315 n_res = 1; 9316 mapped[0] = c; 9317 } 9318 for (j 
= 0; j < n_res; j++) { 9319 *maxchar = Py_MAX(*maxchar, mapped[j]); 9320 res[k++] = mapped[j]; 9321 } 9322 } 9323 return k; 9324} 9325 9326static Py_ssize_t 9327do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9328 Py_UCS4 *maxchar, int lower) 9329{ 9330 Py_ssize_t i, k = 0; 9331 9332 for (i = 0; i < length; i++) { 9333 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9334 int n_res, j; 9335 if (lower) 9336 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9337 else 9338 n_res = _PyUnicode_ToUpperFull(c, mapped); 9339 for (j = 0; j < n_res; j++) { 9340 *maxchar = Py_MAX(*maxchar, mapped[j]); 9341 res[k++] = mapped[j]; 9342 } 9343 } 9344 return k; 9345} 9346 9347static Py_ssize_t 9348do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9349{ 9350 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9351} 9352 9353static Py_ssize_t 9354do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9355{ 9356 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9357} 9358 9359static Py_ssize_t 9360do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9361{ 9362 Py_ssize_t i, k = 0; 9363 9364 for (i = 0; i < length; i++) { 9365 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9366 Py_UCS4 mapped[3]; 9367 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9368 for (j = 0; j < n_res; j++) { 9369 *maxchar = Py_MAX(*maxchar, mapped[j]); 9370 res[k++] = mapped[j]; 9371 } 9372 } 9373 return k; 9374} 9375 9376static Py_ssize_t 9377do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9378{ 9379 Py_ssize_t i, k = 0; 9380 int previous_is_cased; 9381 9382 previous_is_cased = 0; 9383 for (i = 0; i < length; i++) { 9384 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9385 Py_UCS4 mapped[3]; 9386 int n_res, j; 9387 9388 if (previous_is_cased) 9389 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9390 else 9391 n_res = 
_PyUnicode_ToTitleFull(c, mapped); 9392 9393 for (j = 0; j < n_res; j++) { 9394 *maxchar = Py_MAX(*maxchar, mapped[j]); 9395 res[k++] = mapped[j]; 9396 } 9397 9398 previous_is_cased = _PyUnicode_IsCased(c); 9399 } 9400 return k; 9401} 9402 9403static PyObject * 9404case_operation(PyObject *self, 9405 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9406{ 9407 PyObject *res = NULL; 9408 Py_ssize_t length, newlength = 0; 9409 int kind, outkind; 9410 void *data, *outdata; 9411 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9412 9413 assert(PyUnicode_IS_READY(self)); 9414 9415 kind = PyUnicode_KIND(self); 9416 data = PyUnicode_DATA(self); 9417 length = PyUnicode_GET_LENGTH(self); 9418 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9419 if (tmp == NULL) 9420 return PyErr_NoMemory(); 9421 newlength = perform(kind, data, length, tmp, &maxchar); 9422 res = PyUnicode_New(newlength, maxchar); 9423 if (res == NULL) 9424 goto leave; 9425 tmpend = tmp + newlength; 9426 outdata = PyUnicode_DATA(res); 9427 outkind = PyUnicode_KIND(res); 9428 switch (outkind) { 9429 case PyUnicode_1BYTE_KIND: 9430 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9431 break; 9432 case PyUnicode_2BYTE_KIND: 9433 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9434 break; 9435 case PyUnicode_4BYTE_KIND: 9436 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9437 break; 9438 default: 9439 assert(0); 9440 break; 9441 } 9442 leave: 9443 PyMem_FREE(tmp); 9444 return res; 9445} 9446 9447PyObject * 9448PyUnicode_Join(PyObject *separator, PyObject *seq) 9449{ 9450 PyObject *sep = NULL; 9451 Py_ssize_t seplen; 9452 PyObject *res = NULL; /* the result */ 9453 PyObject *fseq; /* PySequence_Fast(seq) */ 9454 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9455 PyObject **items; 9456 PyObject *item; 9457 Py_ssize_t sz, i, res_offset; 9458 Py_UCS4 maxchar; 9459 Py_UCS4 item_maxchar; 9460 int use_memcpy; 9461 unsigned char *res_data = NULL, 
*sep_data = NULL; 9462 PyObject *last_obj; 9463 unsigned int kind = 0; 9464 9465 fseq = PySequence_Fast(seq, ""); 9466 if (fseq == NULL) { 9467 return NULL; 9468 } 9469 9470 /* NOTE: the following code can't call back into Python code, 9471 * so we are sure that fseq won't be mutated. 9472 */ 9473 9474 seqlen = PySequence_Fast_GET_SIZE(fseq); 9475 /* If empty sequence, return u"". */ 9476 if (seqlen == 0) { 9477 Py_DECREF(fseq); 9478 _Py_RETURN_UNICODE_EMPTY(); 9479 } 9480 9481 /* If singleton sequence with an exact Unicode, return that. */ 9482 last_obj = NULL; 9483 items = PySequence_Fast_ITEMS(fseq); 9484 if (seqlen == 1) { 9485 if (PyUnicode_CheckExact(items[0])) { 9486 res = items[0]; 9487 Py_INCREF(res); 9488 Py_DECREF(fseq); 9489 return res; 9490 } 9491 seplen = 0; 9492 maxchar = 0; 9493 } 9494 else { 9495 /* Set up sep and seplen */ 9496 if (separator == NULL) { 9497 /* fall back to a blank space separator */ 9498 sep = PyUnicode_FromOrdinal(' '); 9499 if (!sep) 9500 goto onError; 9501 seplen = 1; 9502 maxchar = 32; 9503 } 9504 else { 9505 if (!PyUnicode_Check(separator)) { 9506 PyErr_Format(PyExc_TypeError, 9507 "separator: expected str instance," 9508 " %.80s found", 9509 Py_TYPE(separator)->tp_name); 9510 goto onError; 9511 } 9512 if (PyUnicode_READY(separator)) 9513 goto onError; 9514 sep = separator; 9515 seplen = PyUnicode_GET_LENGTH(separator); 9516 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9517 /* inc refcount to keep this code path symmetric with the 9518 above case of a blank separator */ 9519 Py_INCREF(sep); 9520 } 9521 last_obj = sep; 9522 } 9523 9524 /* There are at least two things to join, or else we have a subclass 9525 * of str in the sequence. 9526 * Do a pre-pass to figure out the total amount of space we'll 9527 * need (sz), and see whether all argument are strings. 
9528 */ 9529 sz = 0; 9530#ifdef Py_DEBUG 9531 use_memcpy = 0; 9532#else 9533 use_memcpy = 1; 9534#endif 9535 for (i = 0; i < seqlen; i++) { 9536 const Py_ssize_t old_sz = sz; 9537 item = items[i]; 9538 if (!PyUnicode_Check(item)) { 9539 PyErr_Format(PyExc_TypeError, 9540 "sequence item %zd: expected str instance," 9541 " %.80s found", 9542 i, Py_TYPE(item)->tp_name); 9543 goto onError; 9544 } 9545 if (PyUnicode_READY(item) == -1) 9546 goto onError; 9547 sz += PyUnicode_GET_LENGTH(item); 9548 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9549 maxchar = Py_MAX(maxchar, item_maxchar); 9550 if (i != 0) 9551 sz += seplen; 9552 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9553 PyErr_SetString(PyExc_OverflowError, 9554 "join() result is too long for a Python string"); 9555 goto onError; 9556 } 9557 if (use_memcpy && last_obj != NULL) { 9558 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9559 use_memcpy = 0; 9560 } 9561 last_obj = item; 9562 } 9563 9564 res = PyUnicode_New(sz, maxchar); 9565 if (res == NULL) 9566 goto onError; 9567 9568 /* Catenate everything. */ 9569#ifdef Py_DEBUG 9570 use_memcpy = 0; 9571#else 9572 if (use_memcpy) { 9573 res_data = PyUnicode_1BYTE_DATA(res); 9574 kind = PyUnicode_KIND(res); 9575 if (seplen != 0) 9576 sep_data = PyUnicode_1BYTE_DATA(sep); 9577 } 9578#endif 9579 if (use_memcpy) { 9580 for (i = 0; i < seqlen; ++i) { 9581 Py_ssize_t itemlen; 9582 item = items[i]; 9583 9584 /* Copy item, and maybe the separator. 
*/ 9585 if (i && seplen != 0) { 9586 Py_MEMCPY(res_data, 9587 sep_data, 9588 kind * seplen); 9589 res_data += kind * seplen; 9590 } 9591 9592 itemlen = PyUnicode_GET_LENGTH(item); 9593 if (itemlen != 0) { 9594 Py_MEMCPY(res_data, 9595 PyUnicode_DATA(item), 9596 kind * itemlen); 9597 res_data += kind * itemlen; 9598 } 9599 } 9600 assert(res_data == PyUnicode_1BYTE_DATA(res) 9601 + kind * PyUnicode_GET_LENGTH(res)); 9602 } 9603 else { 9604 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9605 Py_ssize_t itemlen; 9606 item = items[i]; 9607 9608 /* Copy item, and maybe the separator. */ 9609 if (i && seplen != 0) { 9610 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9611 res_offset += seplen; 9612 } 9613 9614 itemlen = PyUnicode_GET_LENGTH(item); 9615 if (itemlen != 0) { 9616 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9617 res_offset += itemlen; 9618 } 9619 } 9620 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9621 } 9622 9623 Py_DECREF(fseq); 9624 Py_XDECREF(sep); 9625 assert(_PyUnicode_CheckConsistency(res, 1)); 9626 return res; 9627 9628 onError: 9629 Py_DECREF(fseq); 9630 Py_XDECREF(sep); 9631 Py_XDECREF(res); 9632 return NULL; 9633} 9634 9635#define FILL(kind, data, value, start, length) \ 9636 do { \ 9637 Py_ssize_t i_ = 0; \ 9638 assert(kind != PyUnicode_WCHAR_KIND); \ 9639 switch ((kind)) { \ 9640 case PyUnicode_1BYTE_KIND: { \ 9641 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9642 memset(to_, (unsigned char)value, (length)); \ 9643 break; \ 9644 } \ 9645 case PyUnicode_2BYTE_KIND: { \ 9646 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9647 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9648 break; \ 9649 } \ 9650 case PyUnicode_4BYTE_KIND: { \ 9651 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9652 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9653 break; \ 9654 default: assert(0); \ 9655 } \ 9656 } \ 9657 } while (0) 9658 9659void 9660_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t 
start, Py_ssize_t length, 9661 Py_UCS4 fill_char) 9662{ 9663 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9664 const void *data = PyUnicode_DATA(unicode); 9665 assert(PyUnicode_IS_READY(unicode)); 9666 assert(unicode_modifiable(unicode)); 9667 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9668 assert(start >= 0); 9669 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9670 FILL(kind, data, fill_char, start, length); 9671} 9672 9673Py_ssize_t 9674PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9675 Py_UCS4 fill_char) 9676{ 9677 Py_ssize_t maxlen; 9678 9679 if (!PyUnicode_Check(unicode)) { 9680 PyErr_BadInternalCall(); 9681 return -1; 9682 } 9683 if (PyUnicode_READY(unicode) == -1) 9684 return -1; 9685 if (unicode_check_modifiable(unicode)) 9686 return -1; 9687 9688 if (start < 0) { 9689 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9690 return -1; 9691 } 9692 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9693 PyErr_SetString(PyExc_ValueError, 9694 "fill character is bigger than " 9695 "the string maximum character"); 9696 return -1; 9697 } 9698 9699 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9700 length = Py_MIN(maxlen, length); 9701 if (length <= 0) 9702 return 0; 9703 9704 _PyUnicode_FastFill(unicode, start, length, fill_char); 9705 return length; 9706} 9707 9708static PyObject * 9709pad(PyObject *self, 9710 Py_ssize_t left, 9711 Py_ssize_t right, 9712 Py_UCS4 fill) 9713{ 9714 PyObject *u; 9715 Py_UCS4 maxchar; 9716 int kind; 9717 void *data; 9718 9719 if (left < 0) 9720 left = 0; 9721 if (right < 0) 9722 right = 0; 9723 9724 if (left == 0 && right == 0) 9725 return unicode_result_unchanged(self); 9726 9727 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9728 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9729 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9730 return NULL; 9731 } 9732 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9733 maxchar = 
Py_MAX(maxchar, fill); 9734 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9735 if (!u) 9736 return NULL; 9737 9738 kind = PyUnicode_KIND(u); 9739 data = PyUnicode_DATA(u); 9740 if (left) 9741 FILL(kind, data, fill, 0, left); 9742 if (right) 9743 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9744 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9745 assert(_PyUnicode_CheckConsistency(u, 1)); 9746 return u; 9747} 9748 9749PyObject * 9750PyUnicode_Splitlines(PyObject *string, int keepends) 9751{ 9752 PyObject *list; 9753 9754 string = PyUnicode_FromObject(string); 9755 if (string == NULL) 9756 return NULL; 9757 if (PyUnicode_READY(string) == -1) { 9758 Py_DECREF(string); 9759 return NULL; 9760 } 9761 9762 switch (PyUnicode_KIND(string)) { 9763 case PyUnicode_1BYTE_KIND: 9764 if (PyUnicode_IS_ASCII(string)) 9765 list = asciilib_splitlines( 9766 string, PyUnicode_1BYTE_DATA(string), 9767 PyUnicode_GET_LENGTH(string), keepends); 9768 else 9769 list = ucs1lib_splitlines( 9770 string, PyUnicode_1BYTE_DATA(string), 9771 PyUnicode_GET_LENGTH(string), keepends); 9772 break; 9773 case PyUnicode_2BYTE_KIND: 9774 list = ucs2lib_splitlines( 9775 string, PyUnicode_2BYTE_DATA(string), 9776 PyUnicode_GET_LENGTH(string), keepends); 9777 break; 9778 case PyUnicode_4BYTE_KIND: 9779 list = ucs4lib_splitlines( 9780 string, PyUnicode_4BYTE_DATA(string), 9781 PyUnicode_GET_LENGTH(string), keepends); 9782 break; 9783 default: 9784 assert(0); 9785 list = 0; 9786 } 9787 Py_DECREF(string); 9788 return list; 9789} 9790 9791static PyObject * 9792split(PyObject *self, 9793 PyObject *substring, 9794 Py_ssize_t maxcount) 9795{ 9796 int kind1, kind2, kind; 9797 void *buf1, *buf2; 9798 Py_ssize_t len1, len2; 9799 PyObject* out; 9800 9801 if (maxcount < 0) 9802 maxcount = PY_SSIZE_T_MAX; 9803 9804 if (PyUnicode_READY(self) == -1) 9805 return NULL; 9806 9807 if (substring == NULL) 9808 switch (PyUnicode_KIND(self)) { 9809 case 
PyUnicode_1BYTE_KIND: 9810 if (PyUnicode_IS_ASCII(self)) 9811 return asciilib_split_whitespace( 9812 self, PyUnicode_1BYTE_DATA(self), 9813 PyUnicode_GET_LENGTH(self), maxcount 9814 ); 9815 else 9816 return ucs1lib_split_whitespace( 9817 self, PyUnicode_1BYTE_DATA(self), 9818 PyUnicode_GET_LENGTH(self), maxcount 9819 ); 9820 case PyUnicode_2BYTE_KIND: 9821 return ucs2lib_split_whitespace( 9822 self, PyUnicode_2BYTE_DATA(self), 9823 PyUnicode_GET_LENGTH(self), maxcount 9824 ); 9825 case PyUnicode_4BYTE_KIND: 9826 return ucs4lib_split_whitespace( 9827 self, PyUnicode_4BYTE_DATA(self), 9828 PyUnicode_GET_LENGTH(self), maxcount 9829 ); 9830 default: 9831 assert(0); 9832 return NULL; 9833 } 9834 9835 if (PyUnicode_READY(substring) == -1) 9836 return NULL; 9837 9838 kind1 = PyUnicode_KIND(self); 9839 kind2 = PyUnicode_KIND(substring); 9840 kind = kind1 > kind2 ? kind1 : kind2; 9841 buf1 = PyUnicode_DATA(self); 9842 buf2 = PyUnicode_DATA(substring); 9843 if (kind1 != kind) 9844 buf1 = _PyUnicode_AsKind(self, kind); 9845 if (!buf1) 9846 return NULL; 9847 if (kind2 != kind) 9848 buf2 = _PyUnicode_AsKind(substring, kind); 9849 if (!buf2) { 9850 if (kind1 != kind) PyMem_Free(buf1); 9851 return NULL; 9852 } 9853 len1 = PyUnicode_GET_LENGTH(self); 9854 len2 = PyUnicode_GET_LENGTH(substring); 9855 9856 switch (kind) { 9857 case PyUnicode_1BYTE_KIND: 9858 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9859 out = asciilib_split( 9860 self, buf1, len1, buf2, len2, maxcount); 9861 else 9862 out = ucs1lib_split( 9863 self, buf1, len1, buf2, len2, maxcount); 9864 break; 9865 case PyUnicode_2BYTE_KIND: 9866 out = ucs2lib_split( 9867 self, buf1, len1, buf2, len2, maxcount); 9868 break; 9869 case PyUnicode_4BYTE_KIND: 9870 out = ucs4lib_split( 9871 self, buf1, len1, buf2, len2, maxcount); 9872 break; 9873 default: 9874 out = NULL; 9875 } 9876 if (kind1 != kind) 9877 PyMem_Free(buf1); 9878 if (kind2 != kind) 9879 PyMem_Free(buf2); 9880 return out; 9881} 9882 9883static 
PyObject * 9884rsplit(PyObject *self, 9885 PyObject *substring, 9886 Py_ssize_t maxcount) 9887{ 9888 int kind1, kind2, kind; 9889 void *buf1, *buf2; 9890 Py_ssize_t len1, len2; 9891 PyObject* out; 9892 9893 if (maxcount < 0) 9894 maxcount = PY_SSIZE_T_MAX; 9895 9896 if (PyUnicode_READY(self) == -1) 9897 return NULL; 9898 9899 if (substring == NULL) 9900 switch (PyUnicode_KIND(self)) { 9901 case PyUnicode_1BYTE_KIND: 9902 if (PyUnicode_IS_ASCII(self)) 9903 return asciilib_rsplit_whitespace( 9904 self, PyUnicode_1BYTE_DATA(self), 9905 PyUnicode_GET_LENGTH(self), maxcount 9906 ); 9907 else 9908 return ucs1lib_rsplit_whitespace( 9909 self, PyUnicode_1BYTE_DATA(self), 9910 PyUnicode_GET_LENGTH(self), maxcount 9911 ); 9912 case PyUnicode_2BYTE_KIND: 9913 return ucs2lib_rsplit_whitespace( 9914 self, PyUnicode_2BYTE_DATA(self), 9915 PyUnicode_GET_LENGTH(self), maxcount 9916 ); 9917 case PyUnicode_4BYTE_KIND: 9918 return ucs4lib_rsplit_whitespace( 9919 self, PyUnicode_4BYTE_DATA(self), 9920 PyUnicode_GET_LENGTH(self), maxcount 9921 ); 9922 default: 9923 assert(0); 9924 return NULL; 9925 } 9926 9927 if (PyUnicode_READY(substring) == -1) 9928 return NULL; 9929 9930 kind1 = PyUnicode_KIND(self); 9931 kind2 = PyUnicode_KIND(substring); 9932 kind = kind1 > kind2 ? 
kind1 : kind2; 9933 buf1 = PyUnicode_DATA(self); 9934 buf2 = PyUnicode_DATA(substring); 9935 if (kind1 != kind) 9936 buf1 = _PyUnicode_AsKind(self, kind); 9937 if (!buf1) 9938 return NULL; 9939 if (kind2 != kind) 9940 buf2 = _PyUnicode_AsKind(substring, kind); 9941 if (!buf2) { 9942 if (kind1 != kind) PyMem_Free(buf1); 9943 return NULL; 9944 } 9945 len1 = PyUnicode_GET_LENGTH(self); 9946 len2 = PyUnicode_GET_LENGTH(substring); 9947 9948 switch (kind) { 9949 case PyUnicode_1BYTE_KIND: 9950 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9951 out = asciilib_rsplit( 9952 self, buf1, len1, buf2, len2, maxcount); 9953 else 9954 out = ucs1lib_rsplit( 9955 self, buf1, len1, buf2, len2, maxcount); 9956 break; 9957 case PyUnicode_2BYTE_KIND: 9958 out = ucs2lib_rsplit( 9959 self, buf1, len1, buf2, len2, maxcount); 9960 break; 9961 case PyUnicode_4BYTE_KIND: 9962 out = ucs4lib_rsplit( 9963 self, buf1, len1, buf2, len2, maxcount); 9964 break; 9965 default: 9966 out = NULL; 9967 } 9968 if (kind1 != kind) 9969 PyMem_Free(buf1); 9970 if (kind2 != kind) 9971 PyMem_Free(buf2); 9972 return out; 9973} 9974 9975static Py_ssize_t 9976anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 9977 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9978{ 9979 switch (kind) { 9980 case PyUnicode_1BYTE_KIND: 9981 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 9982 return asciilib_find(buf1, len1, buf2, len2, offset); 9983 else 9984 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9985 case PyUnicode_2BYTE_KIND: 9986 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9987 case PyUnicode_4BYTE_KIND: 9988 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9989 } 9990 assert(0); 9991 return -1; 9992} 9993 9994static Py_ssize_t 9995anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 9996 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9997{ 9998 switch (kind) { 9999 case PyUnicode_1BYTE_KIND: 10000 if 
(PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10001 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10002 else 10003 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10004 case PyUnicode_2BYTE_KIND: 10005 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10006 case PyUnicode_4BYTE_KIND: 10007 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10008 } 10009 assert(0); 10010 return 0; 10011} 10012 10013static void 10014replace_1char_inplace(PyObject *u, Py_ssize_t pos, 10015 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) 10016{ 10017 int kind = PyUnicode_KIND(u); 10018 void *data = PyUnicode_DATA(u); 10019 Py_ssize_t len = PyUnicode_GET_LENGTH(u); 10020 if (kind == PyUnicode_1BYTE_KIND) { 10021 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos, 10022 (Py_UCS1 *)data + len, 10023 u1, u2, maxcount); 10024 } 10025 else if (kind == PyUnicode_2BYTE_KIND) { 10026 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos, 10027 (Py_UCS2 *)data + len, 10028 u1, u2, maxcount); 10029 } 10030 else { 10031 assert(kind == PyUnicode_4BYTE_KIND); 10032 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos, 10033 (Py_UCS4 *)data + len, 10034 u1, u2, maxcount); 10035 } 10036} 10037 10038static PyObject * 10039replace(PyObject *self, PyObject *str1, 10040 PyObject *str2, Py_ssize_t maxcount) 10041{ 10042 PyObject *u; 10043 char *sbuf = PyUnicode_DATA(self); 10044 char *buf1 = PyUnicode_DATA(str1); 10045 char *buf2 = PyUnicode_DATA(str2); 10046 int srelease = 0, release1 = 0, release2 = 0; 10047 int skind = PyUnicode_KIND(self); 10048 int kind1 = PyUnicode_KIND(str1); 10049 int kind2 = PyUnicode_KIND(str2); 10050 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10051 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10052 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10053 int mayshrink; 10054 Py_UCS4 maxchar, maxchar_str1, maxchar_str2; 10055 10056 if (maxcount < 0) 10057 maxcount = PY_SSIZE_T_MAX; 10058 else if (maxcount == 0 || slen == 0) 10059 goto nothing; 
10060 10061 if (str1 == str2) 10062 goto nothing; 10063 10064 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10065 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1); 10066 if (maxchar < maxchar_str1) 10067 /* substring too wide to be present */ 10068 goto nothing; 10069 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10070 /* Replacing str1 with str2 may cause a maxchar reduction in the 10071 result string. */ 10072 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1); 10073 maxchar = Py_MAX(maxchar, maxchar_str2); 10074 10075 if (len1 == len2) { 10076 /* same length */ 10077 if (len1 == 0) 10078 goto nothing; 10079 if (len1 == 1) { 10080 /* replace characters */ 10081 Py_UCS4 u1, u2; 10082 Py_ssize_t pos; 10083 10084 u1 = PyUnicode_READ(kind1, buf1, 0); 10085 pos = findchar(sbuf, skind, slen, u1, 1); 10086 if (pos < 0) 10087 goto nothing; 10088 u2 = PyUnicode_READ(kind2, buf2, 0); 10089 u = PyUnicode_New(slen, maxchar); 10090 if (!u) 10091 goto error; 10092 10093 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10094 replace_1char_inplace(u, pos, u1, u2, maxcount); 10095 } 10096 else { 10097 int rkind = skind; 10098 char *res; 10099 Py_ssize_t i; 10100 10101 if (kind1 < rkind) { 10102 /* widen substring */ 10103 buf1 = _PyUnicode_AsKind(str1, rkind); 10104 if (!buf1) goto error; 10105 release1 = 1; 10106 } 10107 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10108 if (i < 0) 10109 goto nothing; 10110 if (rkind > kind2) { 10111 /* widen replacement */ 10112 buf2 = _PyUnicode_AsKind(str2, rkind); 10113 if (!buf2) goto error; 10114 release2 = 1; 10115 } 10116 else if (rkind < kind2) { 10117 /* widen self and buf1 */ 10118 rkind = kind2; 10119 if (release1) PyMem_Free(buf1); 10120 release1 = 0; 10121 sbuf = _PyUnicode_AsKind(self, rkind); 10122 if (!sbuf) goto error; 10123 srelease = 1; 10124 buf1 = _PyUnicode_AsKind(str1, rkind); 10125 if (!buf1) goto error; 10126 release1 = 1; 10127 } 10128 u = PyUnicode_New(slen, maxchar); 10129 if (!u) 
10130 goto error; 10131 assert(PyUnicode_KIND(u) == rkind); 10132 res = PyUnicode_DATA(u); 10133 10134 memcpy(res, sbuf, rkind * slen); 10135 /* change everything in-place, starting with this one */ 10136 memcpy(res + rkind * i, 10137 buf2, 10138 rkind * len2); 10139 i += len1; 10140 10141 while ( --maxcount > 0) { 10142 i = anylib_find(rkind, self, 10143 sbuf+rkind*i, slen-i, 10144 str1, buf1, len1, i); 10145 if (i == -1) 10146 break; 10147 memcpy(res + rkind * i, 10148 buf2, 10149 rkind * len2); 10150 i += len1; 10151 } 10152 } 10153 } 10154 else { 10155 Py_ssize_t n, i, j, ires; 10156 Py_ssize_t new_size; 10157 int rkind = skind; 10158 char *res; 10159 10160 if (kind1 < rkind) { 10161 /* widen substring */ 10162 buf1 = _PyUnicode_AsKind(str1, rkind); 10163 if (!buf1) goto error; 10164 release1 = 1; 10165 } 10166 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10167 if (n == 0) 10168 goto nothing; 10169 if (kind2 < rkind) { 10170 /* widen replacement */ 10171 buf2 = _PyUnicode_AsKind(str2, rkind); 10172 if (!buf2) goto error; 10173 release2 = 1; 10174 } 10175 else if (kind2 > rkind) { 10176 /* widen self and buf1 */ 10177 rkind = kind2; 10178 sbuf = _PyUnicode_AsKind(self, rkind); 10179 if (!sbuf) goto error; 10180 srelease = 1; 10181 if (release1) PyMem_Free(buf1); 10182 release1 = 0; 10183 buf1 = _PyUnicode_AsKind(str1, rkind); 10184 if (!buf1) goto error; 10185 release1 = 1; 10186 } 10187 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10188 PyUnicode_GET_LENGTH(str1))); */ 10189 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10190 PyErr_SetString(PyExc_OverflowError, 10191 "replace string is too long"); 10192 goto error; 10193 } 10194 new_size = slen + n * (len2 - len1); 10195 if (new_size == 0) { 10196 _Py_INCREF_UNICODE_EMPTY(); 10197 if (!unicode_empty) 10198 goto error; 10199 u = unicode_empty; 10200 goto done; 10201 } 10202 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10203 
PyErr_SetString(PyExc_OverflowError, 10204 "replace string is too long"); 10205 goto error; 10206 } 10207 u = PyUnicode_New(new_size, maxchar); 10208 if (!u) 10209 goto error; 10210 assert(PyUnicode_KIND(u) == rkind); 10211 res = PyUnicode_DATA(u); 10212 ires = i = 0; 10213 if (len1 > 0) { 10214 while (n-- > 0) { 10215 /* look for next match */ 10216 j = anylib_find(rkind, self, 10217 sbuf + rkind * i, slen-i, 10218 str1, buf1, len1, i); 10219 if (j == -1) 10220 break; 10221 else if (j > i) { 10222 /* copy unchanged part [i:j] */ 10223 memcpy(res + rkind * ires, 10224 sbuf + rkind * i, 10225 rkind * (j-i)); 10226 ires += j - i; 10227 } 10228 /* copy substitution string */ 10229 if (len2 > 0) { 10230 memcpy(res + rkind * ires, 10231 buf2, 10232 rkind * len2); 10233 ires += len2; 10234 } 10235 i = j + len1; 10236 } 10237 if (i < slen) 10238 /* copy tail [i:] */ 10239 memcpy(res + rkind * ires, 10240 sbuf + rkind * i, 10241 rkind * (slen-i)); 10242 } 10243 else { 10244 /* interleave */ 10245 while (n > 0) { 10246 memcpy(res + rkind * ires, 10247 buf2, 10248 rkind * len2); 10249 ires += len2; 10250 if (--n <= 0) 10251 break; 10252 memcpy(res + rkind * ires, 10253 sbuf + rkind * i, 10254 rkind); 10255 ires++; 10256 i++; 10257 } 10258 memcpy(res + rkind * ires, 10259 sbuf + rkind * i, 10260 rkind * (slen-i)); 10261 } 10262 } 10263 10264 if (mayshrink) { 10265 unicode_adjust_maxchar(&u); 10266 if (u == NULL) 10267 goto error; 10268 } 10269 10270 done: 10271 if (srelease) 10272 PyMem_FREE(sbuf); 10273 if (release1) 10274 PyMem_FREE(buf1); 10275 if (release2) 10276 PyMem_FREE(buf2); 10277 assert(_PyUnicode_CheckConsistency(u, 1)); 10278 return u; 10279 10280 nothing: 10281 /* nothing to replace; return original string (when possible) */ 10282 if (srelease) 10283 PyMem_FREE(sbuf); 10284 if (release1) 10285 PyMem_FREE(buf1); 10286 if (release2) 10287 PyMem_FREE(buf2); 10288 return unicode_result_unchanged(self); 10289 10290 error: 10291 if (srelease && sbuf) 10292 
PyMem_FREE(sbuf); 10293 if (release1 && buf1) 10294 PyMem_FREE(buf1); 10295 if (release2 && buf2) 10296 PyMem_FREE(buf2); 10297 return NULL; 10298} 10299 10300/* --- Unicode Object Methods --------------------------------------------- */ 10301 10302PyDoc_STRVAR(title__doc__, 10303 "S.title() -> str\n\ 10304\n\ 10305Return a titlecased version of S, i.e. words start with title case\n\ 10306characters, all remaining cased characters have lower case."); 10307 10308static PyObject* 10309unicode_title(PyObject *self) 10310{ 10311 if (PyUnicode_READY(self) == -1) 10312 return NULL; 10313 return case_operation(self, do_title); 10314} 10315 10316PyDoc_STRVAR(capitalize__doc__, 10317 "S.capitalize() -> str\n\ 10318\n\ 10319Return a capitalized version of S, i.e. make the first character\n\ 10320have upper case and the rest lower case."); 10321 10322static PyObject* 10323unicode_capitalize(PyObject *self) 10324{ 10325 if (PyUnicode_READY(self) == -1) 10326 return NULL; 10327 if (PyUnicode_GET_LENGTH(self) == 0) 10328 return unicode_result_unchanged(self); 10329 return case_operation(self, do_capitalize); 10330} 10331 10332PyDoc_STRVAR(casefold__doc__, 10333 "S.casefold() -> str\n\ 10334\n\ 10335Return a version of S suitable for caseless comparisons."); 10336 10337static PyObject * 10338unicode_casefold(PyObject *self) 10339{ 10340 if (PyUnicode_READY(self) == -1) 10341 return NULL; 10342 if (PyUnicode_IS_ASCII(self)) 10343 return ascii_upper_or_lower(self, 1); 10344 return case_operation(self, do_casefold); 10345} 10346 10347 10348/* Argument converter. 
Coerces to a single unicode character */ 10349 10350static int 10351convert_uc(PyObject *obj, void *addr) 10352{ 10353 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10354 PyObject *uniobj; 10355 10356 uniobj = PyUnicode_FromObject(obj); 10357 if (uniobj == NULL) { 10358 PyErr_SetString(PyExc_TypeError, 10359 "The fill character cannot be converted to Unicode"); 10360 return 0; 10361 } 10362 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10363 PyErr_SetString(PyExc_TypeError, 10364 "The fill character must be exactly one character long"); 10365 Py_DECREF(uniobj); 10366 return 0; 10367 } 10368 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10369 Py_DECREF(uniobj); 10370 return 1; 10371} 10372 10373PyDoc_STRVAR(center__doc__, 10374 "S.center(width[, fillchar]) -> str\n\ 10375\n\ 10376Return S centered in a string of length width. Padding is\n\ 10377done using the specified fill character (default is a space)"); 10378 10379static PyObject * 10380unicode_center(PyObject *self, PyObject *args) 10381{ 10382 Py_ssize_t marg, left; 10383 Py_ssize_t width; 10384 Py_UCS4 fillchar = ' '; 10385 10386 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10387 return NULL; 10388 10389 if (PyUnicode_READY(self) == -1) 10390 return NULL; 10391 10392 if (PyUnicode_GET_LENGTH(self) >= width) 10393 return unicode_result_unchanged(self); 10394 10395 marg = width - PyUnicode_GET_LENGTH(self); 10396 left = marg / 2 + (marg & width & 1); 10397 10398 return pad(self, left, marg - left, fillchar); 10399} 10400 10401/* This function assumes that str1 and str2 are readied by the caller. */ 10402 10403static int 10404unicode_compare(PyObject *str1, PyObject *str2) 10405{ 10406#define COMPARE(TYPE1, TYPE2) \ 10407 do { \ 10408 TYPE1* p1 = (TYPE1 *)data1; \ 10409 TYPE2* p2 = (TYPE2 *)data2; \ 10410 TYPE1* end = p1 + len; \ 10411 Py_UCS4 c1, c2; \ 10412 for (; p1 != end; p1++, p2++) { \ 10413 c1 = *p1; \ 10414 c2 = *p2; \ 10415 if (c1 != c2) \ 10416 return (c1 < c2) ? 
-1 : 1; \ 10417 } \ 10418 } \ 10419 while (0) 10420 10421 int kind1, kind2; 10422 void *data1, *data2; 10423 Py_ssize_t len1, len2, len; 10424 10425 /* a string is equal to itself */ 10426 if (str1 == str2) 10427 return 0; 10428 10429 kind1 = PyUnicode_KIND(str1); 10430 kind2 = PyUnicode_KIND(str2); 10431 data1 = PyUnicode_DATA(str1); 10432 data2 = PyUnicode_DATA(str2); 10433 len1 = PyUnicode_GET_LENGTH(str1); 10434 len2 = PyUnicode_GET_LENGTH(str2); 10435 len = Py_MIN(len1, len2); 10436 10437 switch(kind1) { 10438 case PyUnicode_1BYTE_KIND: 10439 { 10440 switch(kind2) { 10441 case PyUnicode_1BYTE_KIND: 10442 { 10443 int cmp = memcmp(data1, data2, len); 10444 /* normalize result of memcmp() into the range [-1; 1] */ 10445 if (cmp < 0) 10446 return -1; 10447 if (cmp > 0) 10448 return 1; 10449 break; 10450 } 10451 case PyUnicode_2BYTE_KIND: 10452 COMPARE(Py_UCS1, Py_UCS2); 10453 break; 10454 case PyUnicode_4BYTE_KIND: 10455 COMPARE(Py_UCS1, Py_UCS4); 10456 break; 10457 default: 10458 assert(0); 10459 } 10460 break; 10461 } 10462 case PyUnicode_2BYTE_KIND: 10463 { 10464 switch(kind2) { 10465 case PyUnicode_1BYTE_KIND: 10466 COMPARE(Py_UCS2, Py_UCS1); 10467 break; 10468 case PyUnicode_2BYTE_KIND: 10469 { 10470 COMPARE(Py_UCS2, Py_UCS2); 10471 break; 10472 } 10473 case PyUnicode_4BYTE_KIND: 10474 COMPARE(Py_UCS2, Py_UCS4); 10475 break; 10476 default: 10477 assert(0); 10478 } 10479 break; 10480 } 10481 case PyUnicode_4BYTE_KIND: 10482 { 10483 switch(kind2) { 10484 case PyUnicode_1BYTE_KIND: 10485 COMPARE(Py_UCS4, Py_UCS1); 10486 break; 10487 case PyUnicode_2BYTE_KIND: 10488 COMPARE(Py_UCS4, Py_UCS2); 10489 break; 10490 case PyUnicode_4BYTE_KIND: 10491 { 10492#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4 10493 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len); 10494 /* normalize result of wmemcmp() into the range [-1; 1] */ 10495 if (cmp < 0) 10496 return -1; 10497 if (cmp > 0) 10498 return 1; 10499#else 10500 COMPARE(Py_UCS4, Py_UCS4); 10501#endif 10502 
break; 10503 } 10504 default: 10505 assert(0); 10506 } 10507 break; 10508 } 10509 default: 10510 assert(0); 10511 } 10512 10513 if (len1 == len2) 10514 return 0; 10515 if (len1 < len2) 10516 return -1; 10517 else 10518 return 1; 10519 10520#undef COMPARE 10521} 10522 10523static int 10524unicode_compare_eq(PyObject *str1, PyObject *str2) 10525{ 10526 int kind; 10527 void *data1, *data2; 10528 Py_ssize_t len; 10529 int cmp; 10530 10531 /* a string is equal to itself */ 10532 if (str1 == str2) 10533 return 1; 10534 10535 len = PyUnicode_GET_LENGTH(str1); 10536 if (PyUnicode_GET_LENGTH(str2) != len) 10537 return 0; 10538 kind = PyUnicode_KIND(str1); 10539 if (PyUnicode_KIND(str2) != kind) 10540 return 0; 10541 data1 = PyUnicode_DATA(str1); 10542 data2 = PyUnicode_DATA(str2); 10543 10544 cmp = memcmp(data1, data2, len * kind); 10545 return (cmp == 0); 10546} 10547 10548 10549int 10550PyUnicode_Compare(PyObject *left, PyObject *right) 10551{ 10552 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10553 if (PyUnicode_READY(left) == -1 || 10554 PyUnicode_READY(right) == -1) 10555 return -1; 10556 return unicode_compare(left, right); 10557 } 10558 PyErr_Format(PyExc_TypeError, 10559 "Can't compare %.100s and %.100s", 10560 left->ob_type->tp_name, 10561 right->ob_type->tp_name); 10562 return -1; 10563} 10564 10565int 10566PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10567{ 10568 Py_ssize_t i; 10569 int kind; 10570 void *data; 10571 Py_UCS4 chr; 10572 10573 assert(_PyUnicode_CHECK(uni)); 10574 if (PyUnicode_READY(uni) == -1) 10575 return -1; 10576 kind = PyUnicode_KIND(uni); 10577 data = PyUnicode_DATA(uni); 10578 /* Compare Unicode string and source character set string */ 10579 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10580 if (chr != str[i]) 10581 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10582 /* This check keeps Python strings that end in '\0' from comparing equal 10583 to C strings identical up to that point. 
*/ 10584 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10585 return 1; /* uni is longer */ 10586 if (str[i]) 10587 return -1; /* str is longer */ 10588 return 0; 10589} 10590 10591 10592#define TEST_COND(cond) \ 10593 ((cond) ? Py_True : Py_False) 10594 10595PyObject * 10596PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10597{ 10598 int result; 10599 PyObject *v; 10600 10601 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 10602 Py_RETURN_NOTIMPLEMENTED; 10603 10604 if (PyUnicode_READY(left) == -1 || 10605 PyUnicode_READY(right) == -1) 10606 return NULL; 10607 10608 if (op == Py_EQ || op == Py_NE) { 10609 result = unicode_compare_eq(left, right); 10610 if (op == Py_EQ) 10611 v = TEST_COND(result); 10612 else 10613 v = TEST_COND(!result); 10614 } 10615 else { 10616 result = unicode_compare(left, right); 10617 10618 /* Convert the return value to a Boolean */ 10619 switch (op) { 10620 case Py_LE: 10621 v = TEST_COND(result <= 0); 10622 break; 10623 case Py_GE: 10624 v = TEST_COND(result >= 0); 10625 break; 10626 case Py_LT: 10627 v = TEST_COND(result == -1); 10628 break; 10629 case Py_GT: 10630 v = TEST_COND(result == 1); 10631 break; 10632 default: 10633 PyErr_BadArgument(); 10634 return NULL; 10635 } 10636 } 10637 Py_INCREF(v); 10638 return v; 10639} 10640 10641int 10642PyUnicode_Contains(PyObject *container, PyObject *element) 10643{ 10644 PyObject *str, *sub; 10645 int kind1, kind2; 10646 void *buf1, *buf2; 10647 Py_ssize_t len1, len2; 10648 int result; 10649 10650 /* Coerce the two arguments */ 10651 sub = PyUnicode_FromObject(element); 10652 if (!sub) { 10653 PyErr_Format(PyExc_TypeError, 10654 "'in <string>' requires string as left operand, not %s", 10655 element->ob_type->tp_name); 10656 return -1; 10657 } 10658 10659 str = PyUnicode_FromObject(container); 10660 if (!str) { 10661 Py_DECREF(sub); 10662 return -1; 10663 } 10664 10665 kind1 = PyUnicode_KIND(str); 10666 kind2 = PyUnicode_KIND(sub); 10667 buf1 = PyUnicode_DATA(str); 10668 buf2 = 
PyUnicode_DATA(sub); 10669 if (kind2 != kind1) { 10670 if (kind2 > kind1) { 10671 Py_DECREF(sub); 10672 Py_DECREF(str); 10673 return 0; 10674 } 10675 buf2 = _PyUnicode_AsKind(sub, kind1); 10676 } 10677 if (!buf2) { 10678 Py_DECREF(sub); 10679 Py_DECREF(str); 10680 return -1; 10681 } 10682 len1 = PyUnicode_GET_LENGTH(str); 10683 len2 = PyUnicode_GET_LENGTH(sub); 10684 10685 switch (kind1) { 10686 case PyUnicode_1BYTE_KIND: 10687 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10688 break; 10689 case PyUnicode_2BYTE_KIND: 10690 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10691 break; 10692 case PyUnicode_4BYTE_KIND: 10693 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10694 break; 10695 default: 10696 result = -1; 10697 assert(0); 10698 } 10699 10700 Py_DECREF(str); 10701 Py_DECREF(sub); 10702 10703 if (kind2 != kind1) 10704 PyMem_Free(buf2); 10705 10706 return result; 10707} 10708 10709/* Concat to string or Unicode object giving a new Unicode object. */ 10710 10711PyObject * 10712PyUnicode_Concat(PyObject *left, PyObject *right) 10713{ 10714 PyObject *u = NULL, *v = NULL, *w; 10715 Py_UCS4 maxchar, maxchar2; 10716 Py_ssize_t u_len, v_len, new_len; 10717 10718 /* Coerce the two arguments */ 10719 u = PyUnicode_FromObject(left); 10720 if (u == NULL) 10721 goto onError; 10722 v = PyUnicode_FromObject(right); 10723 if (v == NULL) 10724 goto onError; 10725 10726 /* Shortcuts */ 10727 if (v == unicode_empty) { 10728 Py_DECREF(v); 10729 return u; 10730 } 10731 if (u == unicode_empty) { 10732 Py_DECREF(u); 10733 return v; 10734 } 10735 10736 u_len = PyUnicode_GET_LENGTH(u); 10737 v_len = PyUnicode_GET_LENGTH(v); 10738 if (u_len > PY_SSIZE_T_MAX - v_len) { 10739 PyErr_SetString(PyExc_OverflowError, 10740 "strings are too large to concat"); 10741 goto onError; 10742 } 10743 new_len = u_len + v_len; 10744 10745 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10746 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 10747 maxchar = Py_MAX(maxchar, maxchar2); 10748 
10749 /* Concat the two Unicode strings */ 10750 w = PyUnicode_New(new_len, maxchar); 10751 if (w == NULL) 10752 goto onError; 10753 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 10754 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 10755 Py_DECREF(u); 10756 Py_DECREF(v); 10757 assert(_PyUnicode_CheckConsistency(w, 1)); 10758 return w; 10759 10760 onError: 10761 Py_XDECREF(u); 10762 Py_XDECREF(v); 10763 return NULL; 10764} 10765 10766void 10767PyUnicode_Append(PyObject **p_left, PyObject *right) 10768{ 10769 PyObject *left, *res; 10770 Py_UCS4 maxchar, maxchar2; 10771 Py_ssize_t left_len, right_len, new_len; 10772 10773 if (p_left == NULL) { 10774 if (!PyErr_Occurred()) 10775 PyErr_BadInternalCall(); 10776 return; 10777 } 10778 left = *p_left; 10779 if (right == NULL || left == NULL 10780 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 10781 if (!PyErr_Occurred()) 10782 PyErr_BadInternalCall(); 10783 goto error; 10784 } 10785 10786 if (PyUnicode_READY(left) == -1) 10787 goto error; 10788 if (PyUnicode_READY(right) == -1) 10789 goto error; 10790 10791 /* Shortcuts */ 10792 if (left == unicode_empty) { 10793 Py_DECREF(left); 10794 Py_INCREF(right); 10795 *p_left = right; 10796 return; 10797 } 10798 if (right == unicode_empty) 10799 return; 10800 10801 left_len = PyUnicode_GET_LENGTH(left); 10802 right_len = PyUnicode_GET_LENGTH(right); 10803 if (left_len > PY_SSIZE_T_MAX - right_len) { 10804 PyErr_SetString(PyExc_OverflowError, 10805 "strings are too large to concat"); 10806 goto error; 10807 } 10808 new_len = left_len + right_len; 10809 10810 if (unicode_modifiable(left) 10811 && PyUnicode_CheckExact(right) 10812 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 10813 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10814 to change the structure size, but characters are stored just after 10815 the structure, and so it requires to move all characters which is 10816 not so different than duplicating the string. 
*/ 10817 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10818 { 10819 /* append inplace */ 10820 if (unicode_resize(p_left, new_len) != 0) 10821 goto error; 10822 10823 /* copy 'right' into the newly allocated area of 'left' */ 10824 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 10825 } 10826 else { 10827 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 10828 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 10829 maxchar = Py_MAX(maxchar, maxchar2); 10830 10831 /* Concat the two Unicode strings */ 10832 res = PyUnicode_New(new_len, maxchar); 10833 if (res == NULL) 10834 goto error; 10835 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 10836 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 10837 Py_DECREF(left); 10838 *p_left = res; 10839 } 10840 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 10841 return; 10842 10843error: 10844 Py_CLEAR(*p_left); 10845} 10846 10847void 10848PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10849{ 10850 PyUnicode_Append(pleft, right); 10851 Py_XDECREF(right); 10852} 10853 10854PyDoc_STRVAR(count__doc__, 10855 "S.count(sub[, start[, end]]) -> int\n\ 10856\n\ 10857Return the number of non-overlapping occurrences of substring sub in\n\ 10858string S[start:end]. 
Optional arguments start and end are\n\ 10859interpreted as in slice notation."); 10860 10861static PyObject * 10862unicode_count(PyObject *self, PyObject *args) 10863{ 10864 PyObject *substring; 10865 Py_ssize_t start = 0; 10866 Py_ssize_t end = PY_SSIZE_T_MAX; 10867 PyObject *result; 10868 int kind1, kind2, kind; 10869 void *buf1, *buf2; 10870 Py_ssize_t len1, len2, iresult; 10871 10872 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10873 &start, &end)) 10874 return NULL; 10875 10876 kind1 = PyUnicode_KIND(self); 10877 kind2 = PyUnicode_KIND(substring); 10878 if (kind2 > kind1) 10879 return PyLong_FromLong(0); 10880 kind = kind1; 10881 buf1 = PyUnicode_DATA(self); 10882 buf2 = PyUnicode_DATA(substring); 10883 if (kind2 != kind) 10884 buf2 = _PyUnicode_AsKind(substring, kind); 10885 if (!buf2) { 10886 Py_DECREF(substring); 10887 return NULL; 10888 } 10889 len1 = PyUnicode_GET_LENGTH(self); 10890 len2 = PyUnicode_GET_LENGTH(substring); 10891 10892 ADJUST_INDICES(start, end, len1); 10893 switch (kind) { 10894 case PyUnicode_1BYTE_KIND: 10895 iresult = ucs1lib_count( 10896 ((Py_UCS1*)buf1) + start, end - start, 10897 buf2, len2, PY_SSIZE_T_MAX 10898 ); 10899 break; 10900 case PyUnicode_2BYTE_KIND: 10901 iresult = ucs2lib_count( 10902 ((Py_UCS2*)buf1) + start, end - start, 10903 buf2, len2, PY_SSIZE_T_MAX 10904 ); 10905 break; 10906 case PyUnicode_4BYTE_KIND: 10907 iresult = ucs4lib_count( 10908 ((Py_UCS4*)buf1) + start, end - start, 10909 buf2, len2, PY_SSIZE_T_MAX 10910 ); 10911 break; 10912 default: 10913 assert(0); iresult = 0; 10914 } 10915 10916 result = PyLong_FromSsize_t(iresult); 10917 10918 if (kind2 != kind) 10919 PyMem_Free(buf2); 10920 10921 Py_DECREF(substring); 10922 10923 return result; 10924} 10925 10926PyDoc_STRVAR(encode__doc__, 10927 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10928\n\ 10929Encode S using the codec registered for encoding. Default encoding\n\ 10930is 'utf-8'. 
errors may be given to set a different error\n\ 10931handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10932a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10933'xmlcharrefreplace' as well as any other name registered with\n\ 10934codecs.register_error that can handle UnicodeEncodeErrors."); 10935 10936static PyObject * 10937unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 10938{ 10939 static char *kwlist[] = {"encoding", "errors", 0}; 10940 char *encoding = NULL; 10941 char *errors = NULL; 10942 10943 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10944 kwlist, &encoding, &errors)) 10945 return NULL; 10946 return PyUnicode_AsEncodedString(self, encoding, errors); 10947} 10948 10949PyDoc_STRVAR(expandtabs__doc__, 10950 "S.expandtabs([tabsize]) -> str\n\ 10951\n\ 10952Return a copy of S where all tab characters are expanded using spaces.\n\ 10953If tabsize is not given, a tab size of 8 characters is assumed."); 10954 10955static PyObject* 10956unicode_expandtabs(PyObject *self, PyObject *args) 10957{ 10958 Py_ssize_t i, j, line_pos, src_len, incr; 10959 Py_UCS4 ch; 10960 PyObject *u; 10961 void *src_data, *dest_data; 10962 int tabsize = 8; 10963 int kind; 10964 int found; 10965 10966 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10967 return NULL; 10968 10969 if (PyUnicode_READY(self) == -1) 10970 return NULL; 10971 10972 /* First pass: determine size of output string */ 10973 src_len = PyUnicode_GET_LENGTH(self); 10974 i = j = line_pos = 0; 10975 kind = PyUnicode_KIND(self); 10976 src_data = PyUnicode_DATA(self); 10977 found = 0; 10978 for (; i < src_len; i++) { 10979 ch = PyUnicode_READ(kind, src_data, i); 10980 if (ch == '\t') { 10981 found = 1; 10982 if (tabsize > 0) { 10983 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10984 if (j > PY_SSIZE_T_MAX - incr) 10985 goto overflow; 10986 line_pos += incr; 10987 j += incr; 10988 } 10989 } 10990 else { 10991 if (j 
> PY_SSIZE_T_MAX - 1) 10992 goto overflow; 10993 line_pos++; 10994 j++; 10995 if (ch == '\n' || ch == '\r') 10996 line_pos = 0; 10997 } 10998 } 10999 if (!found) 11000 return unicode_result_unchanged(self); 11001 11002 /* Second pass: create output string and fill it */ 11003 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11004 if (!u) 11005 return NULL; 11006 dest_data = PyUnicode_DATA(u); 11007 11008 i = j = line_pos = 0; 11009 11010 for (; i < src_len; i++) { 11011 ch = PyUnicode_READ(kind, src_data, i); 11012 if (ch == '\t') { 11013 if (tabsize > 0) { 11014 incr = tabsize - (line_pos % tabsize); 11015 line_pos += incr; 11016 FILL(kind, dest_data, ' ', j, incr); 11017 j += incr; 11018 } 11019 } 11020 else { 11021 line_pos++; 11022 PyUnicode_WRITE(kind, dest_data, j, ch); 11023 j++; 11024 if (ch == '\n' || ch == '\r') 11025 line_pos = 0; 11026 } 11027 } 11028 assert (j == PyUnicode_GET_LENGTH(u)); 11029 return unicode_result(u); 11030 11031 overflow: 11032 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11033 return NULL; 11034} 11035 11036PyDoc_STRVAR(find__doc__, 11037 "S.find(sub[, start[, end]]) -> int\n\ 11038\n\ 11039Return the lowest index in S where substring sub is found,\n\ 11040such that sub is contained within S[start:end]. 
Optional\n\ 11041arguments start and end are interpreted as in slice notation.\n\ 11042\n\ 11043Return -1 on failure."); 11044 11045static PyObject * 11046unicode_find(PyObject *self, PyObject *args) 11047{ 11048 PyObject *substring; 11049 Py_ssize_t start; 11050 Py_ssize_t end; 11051 Py_ssize_t result; 11052 11053 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 11054 &start, &end)) 11055 return NULL; 11056 11057 if (PyUnicode_READY(self) == -1) 11058 return NULL; 11059 if (PyUnicode_READY(substring) == -1) 11060 return NULL; 11061 11062 result = any_find_slice(1, self, substring, start, end); 11063 11064 Py_DECREF(substring); 11065 11066 if (result == -2) 11067 return NULL; 11068 11069 return PyLong_FromSsize_t(result); 11070} 11071 11072static PyObject * 11073unicode_getitem(PyObject *self, Py_ssize_t index) 11074{ 11075 void *data; 11076 enum PyUnicode_Kind kind; 11077 Py_UCS4 ch; 11078 PyObject *res; 11079 11080 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 11081 PyErr_BadArgument(); 11082 return NULL; 11083 } 11084 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11085 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11086 return NULL; 11087 } 11088 kind = PyUnicode_KIND(self); 11089 data = PyUnicode_DATA(self); 11090 ch = PyUnicode_READ(kind, data, index); 11091 if (ch < 256) 11092 return get_latin1_char(ch); 11093 11094 res = PyUnicode_New(1, ch); 11095 if (res == NULL) 11096 return NULL; 11097 kind = PyUnicode_KIND(res); 11098 data = PyUnicode_DATA(res); 11099 PyUnicode_WRITE(kind, data, 0, ch); 11100 assert(_PyUnicode_CheckConsistency(res, 1)); 11101 return res; 11102} 11103 11104/* Believe it or not, this produces the same value for ASCII strings 11105 as bytes_hash(). */ 11106static Py_hash_t 11107unicode_hash(PyObject *self) 11108{ 11109 Py_ssize_t len; 11110 Py_uhash_t x; /* Unsigned for defined overflow behavior. 
*/ 11111 11112#ifdef Py_DEBUG 11113 assert(_Py_HashSecret_Initialized); 11114#endif 11115 if (_PyUnicode_HASH(self) != -1) 11116 return _PyUnicode_HASH(self); 11117 if (PyUnicode_READY(self) == -1) 11118 return -1; 11119 len = PyUnicode_GET_LENGTH(self); 11120 /* 11121 We make the hash of the empty string be 0, rather than using 11122 (prefix ^ suffix), since this slightly obfuscates the hash secret 11123 */ 11124 if (len == 0) { 11125 _PyUnicode_HASH(self) = 0; 11126 return 0; 11127 } 11128 11129 /* The hash function as a macro, gets expanded three times below. */ 11130#define HASH(P) \ 11131 x ^= (Py_uhash_t) *P << 7; \ 11132 while (--len >= 0) \ 11133 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \ 11134 11135 x = (Py_uhash_t) _Py_HashSecret.prefix; 11136 switch (PyUnicode_KIND(self)) { 11137 case PyUnicode_1BYTE_KIND: { 11138 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 11139 HASH(c); 11140 break; 11141 } 11142 case PyUnicode_2BYTE_KIND: { 11143 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 11144 HASH(s); 11145 break; 11146 } 11147 default: { 11148 Py_UCS4 *l; 11149 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 11150 "Impossible switch case in unicode_hash"); 11151 l = PyUnicode_4BYTE_DATA(self); 11152 HASH(l); 11153 break; 11154 } 11155 } 11156 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self); 11157 x ^= (Py_uhash_t) _Py_HashSecret.suffix; 11158 11159 if (x == -1) 11160 x = -2; 11161 _PyUnicode_HASH(self) = x; 11162 return x; 11163} 11164#undef HASH 11165 11166PyDoc_STRVAR(index__doc__, 11167 "S.index(sub[, start[, end]]) -> int\n\ 11168\n\ 11169Like S.find() but raise ValueError when the substring is not found."); 11170 11171static PyObject * 11172unicode_index(PyObject *self, PyObject *args) 11173{ 11174 Py_ssize_t result; 11175 PyObject *substring; 11176 Py_ssize_t start; 11177 Py_ssize_t end; 11178 11179 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11180 &start, &end)) 11181 return NULL; 11182 11183 if 
(PyUnicode_READY(self) == -1) 11184 return NULL; 11185 if (PyUnicode_READY(substring) == -1) 11186 return NULL; 11187 11188 result = any_find_slice(1, self, substring, start, end); 11189 11190 Py_DECREF(substring); 11191 11192 if (result == -2) 11193 return NULL; 11194 11195 if (result < 0) { 11196 PyErr_SetString(PyExc_ValueError, "substring not found"); 11197 return NULL; 11198 } 11199 11200 return PyLong_FromSsize_t(result); 11201} 11202 11203PyDoc_STRVAR(islower__doc__, 11204 "S.islower() -> bool\n\ 11205\n\ 11206Return True if all cased characters in S are lowercase and there is\n\ 11207at least one cased character in S, False otherwise."); 11208 11209static PyObject* 11210unicode_islower(PyObject *self) 11211{ 11212 Py_ssize_t i, length; 11213 int kind; 11214 void *data; 11215 int cased; 11216 11217 if (PyUnicode_READY(self) == -1) 11218 return NULL; 11219 length = PyUnicode_GET_LENGTH(self); 11220 kind = PyUnicode_KIND(self); 11221 data = PyUnicode_DATA(self); 11222 11223 /* Shortcut for single character strings */ 11224 if (length == 1) 11225 return PyBool_FromLong( 11226 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11227 11228 /* Special case for empty strings */ 11229 if (length == 0) 11230 return PyBool_FromLong(0); 11231 11232 cased = 0; 11233 for (i = 0; i < length; i++) { 11234 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11235 11236 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11237 return PyBool_FromLong(0); 11238 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11239 cased = 1; 11240 } 11241 return PyBool_FromLong(cased); 11242} 11243 11244PyDoc_STRVAR(isupper__doc__, 11245 "S.isupper() -> bool\n\ 11246\n\ 11247Return True if all cased characters in S are uppercase and there is\n\ 11248at least one cased character in S, False otherwise."); 11249 11250static PyObject* 11251unicode_isupper(PyObject *self) 11252{ 11253 Py_ssize_t i, length; 11254 int kind; 11255 void *data; 11256 int cased; 11257 11258 if (PyUnicode_READY(self) == -1) 
11259 return NULL; 11260 length = PyUnicode_GET_LENGTH(self); 11261 kind = PyUnicode_KIND(self); 11262 data = PyUnicode_DATA(self); 11263 11264 /* Shortcut for single character strings */ 11265 if (length == 1) 11266 return PyBool_FromLong( 11267 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11268 11269 /* Special case for empty strings */ 11270 if (length == 0) 11271 return PyBool_FromLong(0); 11272 11273 cased = 0; 11274 for (i = 0; i < length; i++) { 11275 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11276 11277 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11278 return PyBool_FromLong(0); 11279 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11280 cased = 1; 11281 } 11282 return PyBool_FromLong(cased); 11283} 11284 11285PyDoc_STRVAR(istitle__doc__, 11286 "S.istitle() -> bool\n\ 11287\n\ 11288Return True if S is a titlecased string and there is at least one\n\ 11289character in S, i.e. upper- and titlecase characters may only\n\ 11290follow uncased characters and lowercase characters only cased ones.\n\ 11291Return False otherwise."); 11292 11293static PyObject* 11294unicode_istitle(PyObject *self) 11295{ 11296 Py_ssize_t i, length; 11297 int kind; 11298 void *data; 11299 int cased, previous_is_cased; 11300 11301 if (PyUnicode_READY(self) == -1) 11302 return NULL; 11303 length = PyUnicode_GET_LENGTH(self); 11304 kind = PyUnicode_KIND(self); 11305 data = PyUnicode_DATA(self); 11306 11307 /* Shortcut for single character strings */ 11308 if (length == 1) { 11309 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11310 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11311 (Py_UNICODE_ISUPPER(ch) != 0)); 11312 } 11313 11314 /* Special case for empty strings */ 11315 if (length == 0) 11316 return PyBool_FromLong(0); 11317 11318 cased = 0; 11319 previous_is_cased = 0; 11320 for (i = 0; i < length; i++) { 11321 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11322 11323 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11324 if 
(previous_is_cased) 11325 return PyBool_FromLong(0); 11326 previous_is_cased = 1; 11327 cased = 1; 11328 } 11329 else if (Py_UNICODE_ISLOWER(ch)) { 11330 if (!previous_is_cased) 11331 return PyBool_FromLong(0); 11332 previous_is_cased = 1; 11333 cased = 1; 11334 } 11335 else 11336 previous_is_cased = 0; 11337 } 11338 return PyBool_FromLong(cased); 11339} 11340 11341PyDoc_STRVAR(isspace__doc__, 11342 "S.isspace() -> bool\n\ 11343\n\ 11344Return True if all characters in S are whitespace\n\ 11345and there is at least one character in S, False otherwise."); 11346 11347static PyObject* 11348unicode_isspace(PyObject *self) 11349{ 11350 Py_ssize_t i, length; 11351 int kind; 11352 void *data; 11353 11354 if (PyUnicode_READY(self) == -1) 11355 return NULL; 11356 length = PyUnicode_GET_LENGTH(self); 11357 kind = PyUnicode_KIND(self); 11358 data = PyUnicode_DATA(self); 11359 11360 /* Shortcut for single character strings */ 11361 if (length == 1) 11362 return PyBool_FromLong( 11363 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11364 11365 /* Special case for empty strings */ 11366 if (length == 0) 11367 return PyBool_FromLong(0); 11368 11369 for (i = 0; i < length; i++) { 11370 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11371 if (!Py_UNICODE_ISSPACE(ch)) 11372 return PyBool_FromLong(0); 11373 } 11374 return PyBool_FromLong(1); 11375} 11376 11377PyDoc_STRVAR(isalpha__doc__, 11378 "S.isalpha() -> bool\n\ 11379\n\ 11380Return True if all characters in S are alphabetic\n\ 11381and there is at least one character in S, False otherwise."); 11382 11383static PyObject* 11384unicode_isalpha(PyObject *self) 11385{ 11386 Py_ssize_t i, length; 11387 int kind; 11388 void *data; 11389 11390 if (PyUnicode_READY(self) == -1) 11391 return NULL; 11392 length = PyUnicode_GET_LENGTH(self); 11393 kind = PyUnicode_KIND(self); 11394 data = PyUnicode_DATA(self); 11395 11396 /* Shortcut for single character strings */ 11397 if (length == 1) 11398 return PyBool_FromLong( 11399 
Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11400 11401 /* Special case for empty strings */ 11402 if (length == 0) 11403 return PyBool_FromLong(0); 11404 11405 for (i = 0; i < length; i++) { 11406 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11407 return PyBool_FromLong(0); 11408 } 11409 return PyBool_FromLong(1); 11410} 11411 11412PyDoc_STRVAR(isalnum__doc__, 11413 "S.isalnum() -> bool\n\ 11414\n\ 11415Return True if all characters in S are alphanumeric\n\ 11416and there is at least one character in S, False otherwise."); 11417 11418static PyObject* 11419unicode_isalnum(PyObject *self) 11420{ 11421 int kind; 11422 void *data; 11423 Py_ssize_t len, i; 11424 11425 if (PyUnicode_READY(self) == -1) 11426 return NULL; 11427 11428 kind = PyUnicode_KIND(self); 11429 data = PyUnicode_DATA(self); 11430 len = PyUnicode_GET_LENGTH(self); 11431 11432 /* Shortcut for single character strings */ 11433 if (len == 1) { 11434 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11435 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11436 } 11437 11438 /* Special case for empty strings */ 11439 if (len == 0) 11440 return PyBool_FromLong(0); 11441 11442 for (i = 0; i < len; i++) { 11443 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11444 if (!Py_UNICODE_ISALNUM(ch)) 11445 return PyBool_FromLong(0); 11446 } 11447 return PyBool_FromLong(1); 11448} 11449 11450PyDoc_STRVAR(isdecimal__doc__, 11451 "S.isdecimal() -> bool\n\ 11452\n\ 11453Return True if there are only decimal characters in S,\n\ 11454False otherwise."); 11455 11456static PyObject* 11457unicode_isdecimal(PyObject *self) 11458{ 11459 Py_ssize_t i, length; 11460 int kind; 11461 void *data; 11462 11463 if (PyUnicode_READY(self) == -1) 11464 return NULL; 11465 length = PyUnicode_GET_LENGTH(self); 11466 kind = PyUnicode_KIND(self); 11467 data = PyUnicode_DATA(self); 11468 11469 /* Shortcut for single character strings */ 11470 if (length == 1) 11471 return PyBool_FromLong( 11472 
Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11473 11474 /* Special case for empty strings */ 11475 if (length == 0) 11476 return PyBool_FromLong(0); 11477 11478 for (i = 0; i < length; i++) { 11479 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11480 return PyBool_FromLong(0); 11481 } 11482 return PyBool_FromLong(1); 11483} 11484 11485PyDoc_STRVAR(isdigit__doc__, 11486 "S.isdigit() -> bool\n\ 11487\n\ 11488Return True if all characters in S are digits\n\ 11489and there is at least one character in S, False otherwise."); 11490 11491static PyObject* 11492unicode_isdigit(PyObject *self) 11493{ 11494 Py_ssize_t i, length; 11495 int kind; 11496 void *data; 11497 11498 if (PyUnicode_READY(self) == -1) 11499 return NULL; 11500 length = PyUnicode_GET_LENGTH(self); 11501 kind = PyUnicode_KIND(self); 11502 data = PyUnicode_DATA(self); 11503 11504 /* Shortcut for single character strings */ 11505 if (length == 1) { 11506 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11507 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11508 } 11509 11510 /* Special case for empty strings */ 11511 if (length == 0) 11512 return PyBool_FromLong(0); 11513 11514 for (i = 0; i < length; i++) { 11515 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11516 return PyBool_FromLong(0); 11517 } 11518 return PyBool_FromLong(1); 11519} 11520 11521PyDoc_STRVAR(isnumeric__doc__, 11522 "S.isnumeric() -> bool\n\ 11523\n\ 11524Return True if there are only numeric characters in S,\n\ 11525False otherwise."); 11526 11527static PyObject* 11528unicode_isnumeric(PyObject *self) 11529{ 11530 Py_ssize_t i, length; 11531 int kind; 11532 void *data; 11533 11534 if (PyUnicode_READY(self) == -1) 11535 return NULL; 11536 length = PyUnicode_GET_LENGTH(self); 11537 kind = PyUnicode_KIND(self); 11538 data = PyUnicode_DATA(self); 11539 11540 /* Shortcut for single character strings */ 11541 if (length == 1) 11542 return PyBool_FromLong( 11543 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 
0))); 11544 11545 /* Special case for empty strings */ 11546 if (length == 0) 11547 return PyBool_FromLong(0); 11548 11549 for (i = 0; i < length; i++) { 11550 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11551 return PyBool_FromLong(0); 11552 } 11553 return PyBool_FromLong(1); 11554} 11555 11556int 11557PyUnicode_IsIdentifier(PyObject *self) 11558{ 11559 int kind; 11560 void *data; 11561 Py_ssize_t i; 11562 Py_UCS4 first; 11563 11564 if (PyUnicode_READY(self) == -1) { 11565 Py_FatalError("identifier not ready"); 11566 return 0; 11567 } 11568 11569 /* Special case for empty strings */ 11570 if (PyUnicode_GET_LENGTH(self) == 0) 11571 return 0; 11572 kind = PyUnicode_KIND(self); 11573 data = PyUnicode_DATA(self); 11574 11575 /* PEP 3131 says that the first character must be in 11576 XID_Start and subsequent characters in XID_Continue, 11577 and for the ASCII range, the 2.x rules apply (i.e 11578 start with letters and underscore, continue with 11579 letters, digits, underscore). However, given the current 11580 definition of XID_Start and XID_Continue, it is sufficient 11581 to check just for these, except that _ must be allowed 11582 as starting an identifier. 
*/ 11583 first = PyUnicode_READ(kind, data, 0); 11584 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11585 return 0; 11586 11587 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11588 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11589 return 0; 11590 return 1; 11591} 11592 11593PyDoc_STRVAR(isidentifier__doc__, 11594 "S.isidentifier() -> bool\n\ 11595\n\ 11596Return True if S is a valid identifier according\n\ 11597to the language definition.\n\ 11598\n\ 11599Use keyword.iskeyword() to test for reserved identifiers\n\ 11600such as \"def\" and \"class\".\n"); 11601 11602static PyObject* 11603unicode_isidentifier(PyObject *self) 11604{ 11605 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11606} 11607 11608PyDoc_STRVAR(isprintable__doc__, 11609 "S.isprintable() -> bool\n\ 11610\n\ 11611Return True if all characters in S are considered\n\ 11612printable in repr() or S is empty, False otherwise."); 11613 11614static PyObject* 11615unicode_isprintable(PyObject *self) 11616{ 11617 Py_ssize_t i, length; 11618 int kind; 11619 void *data; 11620 11621 if (PyUnicode_READY(self) == -1) 11622 return NULL; 11623 length = PyUnicode_GET_LENGTH(self); 11624 kind = PyUnicode_KIND(self); 11625 data = PyUnicode_DATA(self); 11626 11627 /* Shortcut for single character strings */ 11628 if (length == 1) 11629 return PyBool_FromLong( 11630 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11631 11632 for (i = 0; i < length; i++) { 11633 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11634 Py_RETURN_FALSE; 11635 } 11636 } 11637 Py_RETURN_TRUE; 11638} 11639 11640PyDoc_STRVAR(join__doc__, 11641 "S.join(iterable) -> str\n\ 11642\n\ 11643Return a string which is the concatenation of the strings in the\n\ 11644iterable. 
The separator between elements is S."); 11645 11646static PyObject* 11647unicode_join(PyObject *self, PyObject *data) 11648{ 11649 return PyUnicode_Join(self, data); 11650} 11651 11652static Py_ssize_t 11653unicode_length(PyObject *self) 11654{ 11655 if (PyUnicode_READY(self) == -1) 11656 return -1; 11657 return PyUnicode_GET_LENGTH(self); 11658} 11659 11660PyDoc_STRVAR(ljust__doc__, 11661 "S.ljust(width[, fillchar]) -> str\n\ 11662\n\ 11663Return S left-justified in a Unicode string of length width. Padding is\n\ 11664done using the specified fill character (default is a space)."); 11665 11666static PyObject * 11667unicode_ljust(PyObject *self, PyObject *args) 11668{ 11669 Py_ssize_t width; 11670 Py_UCS4 fillchar = ' '; 11671 11672 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11673 return NULL; 11674 11675 if (PyUnicode_READY(self) == -1) 11676 return NULL; 11677 11678 if (PyUnicode_GET_LENGTH(self) >= width) 11679 return unicode_result_unchanged(self); 11680 11681 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11682} 11683 11684PyDoc_STRVAR(lower__doc__, 11685 "S.lower() -> str\n\ 11686\n\ 11687Return a copy of the string S converted to lowercase."); 11688 11689static PyObject* 11690unicode_lower(PyObject *self) 11691{ 11692 if (PyUnicode_READY(self) == -1) 11693 return NULL; 11694 if (PyUnicode_IS_ASCII(self)) 11695 return ascii_upper_or_lower(self, 1); 11696 return case_operation(self, do_lower); 11697} 11698 11699#define LEFTSTRIP 0 11700#define RIGHTSTRIP 1 11701#define BOTHSTRIP 2 11702 11703/* Arrays indexed by above */ 11704static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11705 11706#define STRIPNAME(i) (stripformat[i]+3) 11707 11708/* externally visible for str.strip(unicode) */ 11709PyObject * 11710_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11711{ 11712 void *data; 11713 int kind; 11714 Py_ssize_t i, j, len; 11715 BLOOM_MASK sepmask; 11716 Py_ssize_t 
seplen; 11717 11718 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11719 return NULL; 11720 11721 kind = PyUnicode_KIND(self); 11722 data = PyUnicode_DATA(self); 11723 len = PyUnicode_GET_LENGTH(self); 11724 seplen = PyUnicode_GET_LENGTH(sepobj); 11725 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11726 PyUnicode_DATA(sepobj), 11727 seplen); 11728 11729 i = 0; 11730 if (striptype != RIGHTSTRIP) { 11731 while (i < len) { 11732 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11733 if (!BLOOM(sepmask, ch)) 11734 break; 11735 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 11736 break; 11737 i++; 11738 } 11739 } 11740 11741 j = len; 11742 if (striptype != LEFTSTRIP) { 11743 j--; 11744 while (j >= i) { 11745 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 11746 if (!BLOOM(sepmask, ch)) 11747 break; 11748 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 11749 break; 11750 j--; 11751 } 11752 11753 j++; 11754 } 11755 11756 return PyUnicode_Substring(self, i, j); 11757} 11758 11759PyObject* 11760PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11761{ 11762 unsigned char *data; 11763 int kind; 11764 Py_ssize_t length; 11765 11766 if (PyUnicode_READY(self) == -1) 11767 return NULL; 11768 11769 length = PyUnicode_GET_LENGTH(self); 11770 end = Py_MIN(end, length); 11771 11772 if (start == 0 && end == length) 11773 return unicode_result_unchanged(self); 11774 11775 if (start < 0 || end < 0) { 11776 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11777 return NULL; 11778 } 11779 if (start >= length || end < start) 11780 _Py_RETURN_UNICODE_EMPTY(); 11781 11782 length = end - start; 11783 if (PyUnicode_IS_ASCII(self)) { 11784 data = PyUnicode_1BYTE_DATA(self); 11785 return _PyUnicode_FromASCII((char*)(data + start), length); 11786 } 11787 else { 11788 kind = PyUnicode_KIND(self); 11789 data = PyUnicode_1BYTE_DATA(self); 11790 return PyUnicode_FromKindAndData(kind, 11791 data + kind * start, 11792 length); 11793 } 11794} 
11795 11796static PyObject * 11797do_strip(PyObject *self, int striptype) 11798{ 11799 Py_ssize_t len, i, j; 11800 11801 if (PyUnicode_READY(self) == -1) 11802 return NULL; 11803 11804 len = PyUnicode_GET_LENGTH(self); 11805 11806 if (PyUnicode_IS_ASCII(self)) { 11807 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 11808 11809 i = 0; 11810 if (striptype != RIGHTSTRIP) { 11811 while (i < len) { 11812 Py_UCS1 ch = data[i]; 11813 if (!_Py_ascii_whitespace[ch]) 11814 break; 11815 i++; 11816 } 11817 } 11818 11819 j = len; 11820 if (striptype != LEFTSTRIP) { 11821 j--; 11822 while (j >= i) { 11823 Py_UCS1 ch = data[j]; 11824 if (!_Py_ascii_whitespace[ch]) 11825 break; 11826 j--; 11827 } 11828 j++; 11829 } 11830 } 11831 else { 11832 int kind = PyUnicode_KIND(self); 11833 void *data = PyUnicode_DATA(self); 11834 11835 i = 0; 11836 if (striptype != RIGHTSTRIP) { 11837 while (i < len) { 11838 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11839 if (!Py_UNICODE_ISSPACE(ch)) 11840 break; 11841 i++; 11842 } 11843 } 11844 11845 j = len; 11846 if (striptype != LEFTSTRIP) { 11847 j--; 11848 while (j >= i) { 11849 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 11850 if (!Py_UNICODE_ISSPACE(ch)) 11851 break; 11852 j--; 11853 } 11854 j++; 11855 } 11856 } 11857 11858 return PyUnicode_Substring(self, i, j); 11859} 11860 11861 11862static PyObject * 11863do_argstrip(PyObject *self, int striptype, PyObject *args) 11864{ 11865 PyObject *sep = NULL; 11866 11867 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11868 return NULL; 11869 11870 if (sep != NULL && sep != Py_None) { 11871 if (PyUnicode_Check(sep)) 11872 return _PyUnicode_XStrip(self, striptype, sep); 11873 else { 11874 PyErr_Format(PyExc_TypeError, 11875 "%s arg must be None or str", 11876 STRIPNAME(striptype)); 11877 return NULL; 11878 } 11879 } 11880 11881 return do_strip(self, striptype); 11882} 11883 11884 11885PyDoc_STRVAR(strip__doc__, 11886 "S.strip([chars]) -> str\n\ 11887\n\ 11888Return a copy of the string S 
with leading and trailing\n\ 11889whitespace removed.\n\ 11890If chars is given and not None, remove characters in chars instead."); 11891 11892static PyObject * 11893unicode_strip(PyObject *self, PyObject *args) 11894{ 11895 if (PyTuple_GET_SIZE(args) == 0) 11896 return do_strip(self, BOTHSTRIP); /* Common case */ 11897 else 11898 return do_argstrip(self, BOTHSTRIP, args); 11899} 11900 11901 11902PyDoc_STRVAR(lstrip__doc__, 11903 "S.lstrip([chars]) -> str\n\ 11904\n\ 11905Return a copy of the string S with leading whitespace removed.\n\ 11906If chars is given and not None, remove characters in chars instead."); 11907 11908static PyObject * 11909unicode_lstrip(PyObject *self, PyObject *args) 11910{ 11911 if (PyTuple_GET_SIZE(args) == 0) 11912 return do_strip(self, LEFTSTRIP); /* Common case */ 11913 else 11914 return do_argstrip(self, LEFTSTRIP, args); 11915} 11916 11917 11918PyDoc_STRVAR(rstrip__doc__, 11919 "S.rstrip([chars]) -> str\n\ 11920\n\ 11921Return a copy of the string S with trailing whitespace removed.\n\ 11922If chars is given and not None, remove characters in chars instead."); 11923 11924static PyObject * 11925unicode_rstrip(PyObject *self, PyObject *args) 11926{ 11927 if (PyTuple_GET_SIZE(args) == 0) 11928 return do_strip(self, RIGHTSTRIP); /* Common case */ 11929 else 11930 return do_argstrip(self, RIGHTSTRIP, args); 11931} 11932 11933 11934static PyObject* 11935unicode_repeat(PyObject *str, Py_ssize_t len) 11936{ 11937 PyObject *u; 11938 Py_ssize_t nchars, n; 11939 11940 if (len < 1) 11941 _Py_RETURN_UNICODE_EMPTY(); 11942 11943 /* no repeat, return original string */ 11944 if (len == 1) 11945 return unicode_result_unchanged(str); 11946 11947 if (PyUnicode_READY(str) == -1) 11948 return NULL; 11949 11950 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11951 PyErr_SetString(PyExc_OverflowError, 11952 "repeated string is too long"); 11953 return NULL; 11954 } 11955 nchars = len * PyUnicode_GET_LENGTH(str); 11956 11957 u = 
PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11958 if (!u) 11959 return NULL; 11960 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11961 11962 if (PyUnicode_GET_LENGTH(str) == 1) { 11963 const int kind = PyUnicode_KIND(str); 11964 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11965 if (kind == PyUnicode_1BYTE_KIND) { 11966 void *to = PyUnicode_DATA(u); 11967 memset(to, (unsigned char)fill_char, len); 11968 } 11969 else if (kind == PyUnicode_2BYTE_KIND) { 11970 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 11971 for (n = 0; n < len; ++n) 11972 ucs2[n] = fill_char; 11973 } else { 11974 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 11975 assert(kind == PyUnicode_4BYTE_KIND); 11976 for (n = 0; n < len; ++n) 11977 ucs4[n] = fill_char; 11978 } 11979 } 11980 else { 11981 /* number of characters copied this far */ 11982 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11983 const Py_ssize_t char_size = PyUnicode_KIND(str); 11984 char *to = (char *) PyUnicode_DATA(u); 11985 Py_MEMCPY(to, PyUnicode_DATA(str), 11986 PyUnicode_GET_LENGTH(str) * char_size); 11987 while (done < nchars) { 11988 n = (done <= nchars-done) ? 
done : nchars-done; 11989 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11990 done += n; 11991 } 11992 } 11993 11994 assert(_PyUnicode_CheckConsistency(u, 1)); 11995 return u; 11996} 11997 11998PyObject * 11999PyUnicode_Replace(PyObject *obj, 12000 PyObject *subobj, 12001 PyObject *replobj, 12002 Py_ssize_t maxcount) 12003{ 12004 PyObject *self; 12005 PyObject *str1; 12006 PyObject *str2; 12007 PyObject *result; 12008 12009 self = PyUnicode_FromObject(obj); 12010 if (self == NULL) 12011 return NULL; 12012 str1 = PyUnicode_FromObject(subobj); 12013 if (str1 == NULL) { 12014 Py_DECREF(self); 12015 return NULL; 12016 } 12017 str2 = PyUnicode_FromObject(replobj); 12018 if (str2 == NULL) { 12019 Py_DECREF(self); 12020 Py_DECREF(str1); 12021 return NULL; 12022 } 12023 if (PyUnicode_READY(self) == -1 || 12024 PyUnicode_READY(str1) == -1 || 12025 PyUnicode_READY(str2) == -1) 12026 result = NULL; 12027 else 12028 result = replace(self, str1, str2, maxcount); 12029 Py_DECREF(self); 12030 Py_DECREF(str1); 12031 Py_DECREF(str2); 12032 return result; 12033} 12034 12035PyDoc_STRVAR(replace__doc__, 12036 "S.replace(old, new[, count]) -> str\n\ 12037\n\ 12038Return a copy of S with all occurrences of substring\n\ 12039old replaced by new. 
If the optional argument count is\n\ 12040given, only the first count occurrences are replaced."); 12041 12042static PyObject* 12043unicode_replace(PyObject *self, PyObject *args) 12044{ 12045 PyObject *str1; 12046 PyObject *str2; 12047 Py_ssize_t maxcount = -1; 12048 PyObject *result; 12049 12050 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 12051 return NULL; 12052 if (PyUnicode_READY(self) == -1) 12053 return NULL; 12054 str1 = PyUnicode_FromObject(str1); 12055 if (str1 == NULL) 12056 return NULL; 12057 str2 = PyUnicode_FromObject(str2); 12058 if (str2 == NULL) { 12059 Py_DECREF(str1); 12060 return NULL; 12061 } 12062 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) 12063 result = NULL; 12064 else 12065 result = replace(self, str1, str2, maxcount); 12066 12067 Py_DECREF(str1); 12068 Py_DECREF(str2); 12069 return result; 12070} 12071 12072static PyObject * 12073unicode_repr(PyObject *unicode) 12074{ 12075 PyObject *repr; 12076 Py_ssize_t isize; 12077 Py_ssize_t osize, squote, dquote, i, o; 12078 Py_UCS4 max, quote; 12079 int ikind, okind, unchanged; 12080 void *idata, *odata; 12081 12082 if (PyUnicode_READY(unicode) == -1) 12083 return NULL; 12084 12085 isize = PyUnicode_GET_LENGTH(unicode); 12086 idata = PyUnicode_DATA(unicode); 12087 12088 /* Compute length of output, quote characters, and 12089 maximum character */ 12090 osize = 0; 12091 max = 127; 12092 squote = dquote = 0; 12093 ikind = PyUnicode_KIND(unicode); 12094 for (i = 0; i < isize; i++) { 12095 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12096 switch (ch) { 12097 case '\'': squote++; osize++; break; 12098 case '"': dquote++; osize++; break; 12099 case '\\': case '\t': case '\r': case '\n': 12100 osize += 2; break; 12101 default: 12102 /* Fast-path ASCII */ 12103 if (ch < ' ' || ch == 0x7f) 12104 osize += 4; /* \xHH */ 12105 else if (ch < 0x7f) 12106 osize++; 12107 else if (Py_UNICODE_ISPRINTABLE(ch)) { 12108 osize++; 12109 max = ch > max ? 
ch : max; 12110 } 12111 else if (ch < 0x100) 12112 osize += 4; /* \xHH */ 12113 else if (ch < 0x10000) 12114 osize += 6; /* \uHHHH */ 12115 else 12116 osize += 10; /* \uHHHHHHHH */ 12117 } 12118 } 12119 12120 quote = '\''; 12121 unchanged = (osize == isize); 12122 if (squote) { 12123 unchanged = 0; 12124 if (dquote) 12125 /* Both squote and dquote present. Use squote, 12126 and escape them */ 12127 osize += squote; 12128 else 12129 quote = '"'; 12130 } 12131 osize += 2; /* quotes */ 12132 12133 repr = PyUnicode_New(osize, max); 12134 if (repr == NULL) 12135 return NULL; 12136 okind = PyUnicode_KIND(repr); 12137 odata = PyUnicode_DATA(repr); 12138 12139 PyUnicode_WRITE(okind, odata, 0, quote); 12140 PyUnicode_WRITE(okind, odata, osize-1, quote); 12141 if (unchanged) { 12142 _PyUnicode_FastCopyCharacters(repr, 1, 12143 unicode, 0, 12144 isize); 12145 } 12146 else { 12147 for (i = 0, o = 1; i < isize; i++) { 12148 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12149 12150 /* Escape quotes and backslashes */ 12151 if ((ch == quote) || (ch == '\\')) { 12152 PyUnicode_WRITE(okind, odata, o++, '\\'); 12153 PyUnicode_WRITE(okind, odata, o++, ch); 12154 continue; 12155 } 12156 12157 /* Map special whitespace to '\t', \n', '\r' */ 12158 if (ch == '\t') { 12159 PyUnicode_WRITE(okind, odata, o++, '\\'); 12160 PyUnicode_WRITE(okind, odata, o++, 't'); 12161 } 12162 else if (ch == '\n') { 12163 PyUnicode_WRITE(okind, odata, o++, '\\'); 12164 PyUnicode_WRITE(okind, odata, o++, 'n'); 12165 } 12166 else if (ch == '\r') { 12167 PyUnicode_WRITE(okind, odata, o++, '\\'); 12168 PyUnicode_WRITE(okind, odata, o++, 'r'); 12169 } 12170 12171 /* Map non-printable US ASCII to '\xhh' */ 12172 else if (ch < ' ' || ch == 0x7F) { 12173 PyUnicode_WRITE(okind, odata, o++, '\\'); 12174 PyUnicode_WRITE(okind, odata, o++, 'x'); 12175 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12176 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12177 } 12178 12179 /* Copy 
ASCII characters as-is */ 12180 else if (ch < 0x7F) { 12181 PyUnicode_WRITE(okind, odata, o++, ch); 12182 } 12183 12184 /* Non-ASCII characters */ 12185 else { 12186 /* Map Unicode whitespace and control characters 12187 (categories Z* and C* except ASCII space) 12188 */ 12189 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12190 PyUnicode_WRITE(okind, odata, o++, '\\'); 12191 /* Map 8-bit characters to '\xhh' */ 12192 if (ch <= 0xff) { 12193 PyUnicode_WRITE(okind, odata, o++, 'x'); 12194 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12195 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12196 } 12197 /* Map 16-bit characters to '\uxxxx' */ 12198 else if (ch <= 0xffff) { 12199 PyUnicode_WRITE(okind, odata, o++, 'u'); 12200 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12201 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12202 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12203 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12204 } 12205 /* Map 21-bit characters to '\U00xxxxxx' */ 12206 else { 12207 PyUnicode_WRITE(okind, odata, o++, 'U'); 12208 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12209 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12210 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12211 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12212 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12213 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12214 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12215 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12216 } 12217 } 12218 /* Copy characters as-is */ 12219 else { 12220 PyUnicode_WRITE(okind, odata, o++, ch); 12221 } 12222 } 12223 } 12224 } 12225 /* Closing quote already added at the beginning */ 12226 assert(_PyUnicode_CheckConsistency(repr, 1)); 12227 
return repr; 12228} 12229 12230PyDoc_STRVAR(rfind__doc__, 12231 "S.rfind(sub[, start[, end]]) -> int\n\ 12232\n\ 12233Return the highest index in S where substring sub is found,\n\ 12234such that sub is contained within S[start:end]. Optional\n\ 12235arguments start and end are interpreted as in slice notation.\n\ 12236\n\ 12237Return -1 on failure."); 12238 12239static PyObject * 12240unicode_rfind(PyObject *self, PyObject *args) 12241{ 12242 PyObject *substring; 12243 Py_ssize_t start; 12244 Py_ssize_t end; 12245 Py_ssize_t result; 12246 12247 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12248 &start, &end)) 12249 return NULL; 12250 12251 if (PyUnicode_READY(self) == -1) { 12252 Py_DECREF(substring); 12253 return NULL; 12254 } 12255 if (PyUnicode_READY(substring) == -1) { 12256 Py_DECREF(substring); 12257 return NULL; 12258 } 12259 12260 result = any_find_slice(-1, self, substring, start, end); 12261 12262 Py_DECREF(substring); 12263 12264 if (result == -2) 12265 return NULL; 12266 12267 return PyLong_FromSsize_t(result); 12268} 12269 12270PyDoc_STRVAR(rindex__doc__, 12271 "S.rindex(sub[, start[, end]]) -> int\n\ 12272\n\ 12273Like S.rfind() but raise ValueError when the substring is not found."); 12274 12275static PyObject * 12276unicode_rindex(PyObject *self, PyObject *args) 12277{ 12278 PyObject *substring; 12279 Py_ssize_t start; 12280 Py_ssize_t end; 12281 Py_ssize_t result; 12282 12283 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12284 &start, &end)) 12285 return NULL; 12286 12287 if (PyUnicode_READY(self) == -1) { 12288 Py_DECREF(substring); 12289 return NULL; 12290 } 12291 if (PyUnicode_READY(substring) == -1) { 12292 Py_DECREF(substring); 12293 return NULL; 12294 } 12295 12296 result = any_find_slice(-1, self, substring, start, end); 12297 12298 Py_DECREF(substring); 12299 12300 if (result == -2) 12301 return NULL; 12302 12303 if (result < 0) { 12304 PyErr_SetString(PyExc_ValueError, "substring not found"); 
12305 return NULL; 12306 } 12307 12308 return PyLong_FromSsize_t(result); 12309} 12310 12311PyDoc_STRVAR(rjust__doc__, 12312 "S.rjust(width[, fillchar]) -> str\n\ 12313\n\ 12314Return S right-justified in a string of length width. Padding is\n\ 12315done using the specified fill character (default is a space)."); 12316 12317static PyObject * 12318unicode_rjust(PyObject *self, PyObject *args) 12319{ 12320 Py_ssize_t width; 12321 Py_UCS4 fillchar = ' '; 12322 12323 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12324 return NULL; 12325 12326 if (PyUnicode_READY(self) == -1) 12327 return NULL; 12328 12329 if (PyUnicode_GET_LENGTH(self) >= width) 12330 return unicode_result_unchanged(self); 12331 12332 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12333} 12334 12335PyObject * 12336PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12337{ 12338 PyObject *result; 12339 12340 s = PyUnicode_FromObject(s); 12341 if (s == NULL) 12342 return NULL; 12343 if (sep != NULL) { 12344 sep = PyUnicode_FromObject(sep); 12345 if (sep == NULL) { 12346 Py_DECREF(s); 12347 return NULL; 12348 } 12349 } 12350 12351 result = split(s, sep, maxsplit); 12352 12353 Py_DECREF(s); 12354 Py_XDECREF(sep); 12355 return result; 12356} 12357 12358PyDoc_STRVAR(split__doc__, 12359 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12360\n\ 12361Return a list of the words in S, using sep as the\n\ 12362delimiter string. If maxsplit is given, at most maxsplit\n\ 12363splits are done. 
If sep is not specified or is None, any\n\ 12364whitespace string is a separator and empty strings are\n\ 12365removed from the result."); 12366 12367static PyObject* 12368unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12369{ 12370 static char *kwlist[] = {"sep", "maxsplit", 0}; 12371 PyObject *substring = Py_None; 12372 Py_ssize_t maxcount = -1; 12373 12374 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12375 kwlist, &substring, &maxcount)) 12376 return NULL; 12377 12378 if (substring == Py_None) 12379 return split(self, NULL, maxcount); 12380 else if (PyUnicode_Check(substring)) 12381 return split(self, substring, maxcount); 12382 else 12383 return PyUnicode_Split(self, substring, maxcount); 12384} 12385 12386PyObject * 12387PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12388{ 12389 PyObject* str_obj; 12390 PyObject* sep_obj; 12391 PyObject* out; 12392 int kind1, kind2, kind; 12393 void *buf1 = NULL, *buf2 = NULL; 12394 Py_ssize_t len1, len2; 12395 12396 str_obj = PyUnicode_FromObject(str_in); 12397 if (!str_obj) 12398 return NULL; 12399 sep_obj = PyUnicode_FromObject(sep_in); 12400 if (!sep_obj) { 12401 Py_DECREF(str_obj); 12402 return NULL; 12403 } 12404 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12405 Py_DECREF(sep_obj); 12406 Py_DECREF(str_obj); 12407 return NULL; 12408 } 12409 12410 kind1 = PyUnicode_KIND(str_obj); 12411 kind2 = PyUnicode_KIND(sep_obj); 12412 kind = Py_MAX(kind1, kind2); 12413 buf1 = PyUnicode_DATA(str_obj); 12414 if (kind1 != kind) 12415 buf1 = _PyUnicode_AsKind(str_obj, kind); 12416 if (!buf1) 12417 goto onError; 12418 buf2 = PyUnicode_DATA(sep_obj); 12419 if (kind2 != kind) 12420 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12421 if (!buf2) 12422 goto onError; 12423 len1 = PyUnicode_GET_LENGTH(str_obj); 12424 len2 = PyUnicode_GET_LENGTH(sep_obj); 12425 12426 switch (PyUnicode_KIND(str_obj)) { 12427 case PyUnicode_1BYTE_KIND: 12428 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12429 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12430 else 12431 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12432 break; 12433 case PyUnicode_2BYTE_KIND: 12434 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12435 break; 12436 case PyUnicode_4BYTE_KIND: 12437 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12438 break; 12439 default: 12440 assert(0); 12441 out = 0; 12442 } 12443 12444 Py_DECREF(sep_obj); 12445 Py_DECREF(str_obj); 12446 if (kind1 != kind) 12447 PyMem_Free(buf1); 12448 if (kind2 != kind) 12449 PyMem_Free(buf2); 12450 12451 return out; 12452 onError: 12453 Py_DECREF(sep_obj); 12454 Py_DECREF(str_obj); 12455 if (kind1 != kind && buf1) 12456 PyMem_Free(buf1); 12457 if (kind2 != kind && buf2) 12458 PyMem_Free(buf2); 12459 return NULL; 12460} 12461 12462 12463PyObject * 12464PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12465{ 12466 PyObject* str_obj; 12467 PyObject* sep_obj; 12468 PyObject* out; 12469 int kind1, kind2, kind; 12470 void *buf1 = NULL, *buf2 = NULL; 12471 Py_ssize_t len1, len2; 12472 12473 str_obj = PyUnicode_FromObject(str_in); 12474 if (!str_obj) 12475 return NULL; 12476 sep_obj = PyUnicode_FromObject(sep_in); 12477 if (!sep_obj) { 12478 Py_DECREF(str_obj); 12479 return NULL; 12480 } 12481 12482 kind1 = PyUnicode_KIND(str_in); 12483 kind2 = PyUnicode_KIND(sep_obj); 12484 kind = Py_MAX(kind1, kind2); 12485 buf1 = PyUnicode_DATA(str_in); 12486 if (kind1 != kind) 12487 buf1 = _PyUnicode_AsKind(str_in, kind); 12488 if (!buf1) 12489 goto onError; 12490 buf2 = PyUnicode_DATA(sep_obj); 12491 if (kind2 != kind) 12492 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12493 if (!buf2) 12494 goto onError; 12495 len1 = PyUnicode_GET_LENGTH(str_obj); 12496 len2 = PyUnicode_GET_LENGTH(sep_obj); 12497 12498 switch (PyUnicode_KIND(str_in)) { 12499 case PyUnicode_1BYTE_KIND: 12500 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12501 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12502 else 12503 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12504 break; 12505 case PyUnicode_2BYTE_KIND: 12506 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12507 break; 12508 case PyUnicode_4BYTE_KIND: 12509 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12510 break; 12511 default: 12512 assert(0); 12513 out = 0; 12514 } 12515 12516 Py_DECREF(sep_obj); 12517 Py_DECREF(str_obj); 12518 if (kind1 != kind) 12519 PyMem_Free(buf1); 12520 if (kind2 != kind) 12521 PyMem_Free(buf2); 12522 12523 return out; 12524 onError: 12525 Py_DECREF(sep_obj); 12526 Py_DECREF(str_obj); 12527 if (kind1 != kind && buf1) 12528 PyMem_Free(buf1); 12529 if (kind2 != kind && buf2) 12530 PyMem_Free(buf2); 12531 return NULL; 12532} 12533 12534PyDoc_STRVAR(partition__doc__, 12535 "S.partition(sep) -> (head, sep, tail)\n\ 12536\n\ 12537Search for the separator sep in S, and return the part before it,\n\ 12538the separator itself, and the part after it. If the separator is not\n\ 12539found, return S and two empty strings."); 12540 12541static PyObject* 12542unicode_partition(PyObject *self, PyObject *separator) 12543{ 12544 return PyUnicode_Partition(self, separator); 12545} 12546 12547PyDoc_STRVAR(rpartition__doc__, 12548 "S.rpartition(sep) -> (head, sep, tail)\n\ 12549\n\ 12550Search for the separator sep in S, starting at the end of S, and return\n\ 12551the part before it, the separator itself, and the part after it. 
If the\n\ 12552separator is not found, return two empty strings and S."); 12553 12554static PyObject* 12555unicode_rpartition(PyObject *self, PyObject *separator) 12556{ 12557 return PyUnicode_RPartition(self, separator); 12558} 12559 12560PyObject * 12561PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12562{ 12563 PyObject *result; 12564 12565 s = PyUnicode_FromObject(s); 12566 if (s == NULL) 12567 return NULL; 12568 if (sep != NULL) { 12569 sep = PyUnicode_FromObject(sep); 12570 if (sep == NULL) { 12571 Py_DECREF(s); 12572 return NULL; 12573 } 12574 } 12575 12576 result = rsplit(s, sep, maxsplit); 12577 12578 Py_DECREF(s); 12579 Py_XDECREF(sep); 12580 return result; 12581} 12582 12583PyDoc_STRVAR(rsplit__doc__, 12584 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12585\n\ 12586Return a list of the words in S, using sep as the\n\ 12587delimiter string, starting at the end of the string and\n\ 12588working to the front. If maxsplit is given, at most maxsplit\n\ 12589splits are done. 
If sep is not specified, any whitespace string\n\ 12590is a separator."); 12591 12592static PyObject* 12593unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12594{ 12595 static char *kwlist[] = {"sep", "maxsplit", 0}; 12596 PyObject *substring = Py_None; 12597 Py_ssize_t maxcount = -1; 12598 12599 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12600 kwlist, &substring, &maxcount)) 12601 return NULL; 12602 12603 if (substring == Py_None) 12604 return rsplit(self, NULL, maxcount); 12605 else if (PyUnicode_Check(substring)) 12606 return rsplit(self, substring, maxcount); 12607 else 12608 return PyUnicode_RSplit(self, substring, maxcount); 12609} 12610 12611PyDoc_STRVAR(splitlines__doc__, 12612 "S.splitlines([keepends]) -> list of strings\n\ 12613\n\ 12614Return a list of the lines in S, breaking at line boundaries.\n\ 12615Line breaks are not included in the resulting list unless keepends\n\ 12616is given and true."); 12617 12618static PyObject* 12619unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12620{ 12621 static char *kwlist[] = {"keepends", 0}; 12622 int keepends = 0; 12623 12624 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12625 kwlist, &keepends)) 12626 return NULL; 12627 12628 return PyUnicode_Splitlines(self, keepends); 12629} 12630 12631static 12632PyObject *unicode_str(PyObject *self) 12633{ 12634 return unicode_result_unchanged(self); 12635} 12636 12637PyDoc_STRVAR(swapcase__doc__, 12638 "S.swapcase() -> str\n\ 12639\n\ 12640Return a copy of S with uppercase characters converted to lowercase\n\ 12641and vice versa."); 12642 12643static PyObject* 12644unicode_swapcase(PyObject *self) 12645{ 12646 if (PyUnicode_READY(self) == -1) 12647 return NULL; 12648 return case_operation(self, do_swapcase); 12649} 12650 12651PyDoc_STRVAR(maketrans__doc__, 12652 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12653\n\ 12654Return a translation table usable for str.translate().\n\ 12655If there is only 
one argument, it must be a dictionary mapping Unicode\n\ 12656ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12657Character keys will be then converted to ordinals.\n\ 12658If there are two arguments, they must be strings of equal length, and\n\ 12659in the resulting dictionary, each character in x will be mapped to the\n\ 12660character at the same position in y. If there is a third argument, it\n\ 12661must be a string, whose characters will be mapped to None in the result."); 12662 12663static PyObject* 12664unicode_maketrans(PyObject *null, PyObject *args) 12665{ 12666 PyObject *x, *y = NULL, *z = NULL; 12667 PyObject *new = NULL, *key, *value; 12668 Py_ssize_t i = 0; 12669 int res; 12670 12671 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12672 return NULL; 12673 new = PyDict_New(); 12674 if (!new) 12675 return NULL; 12676 if (y != NULL) { 12677 int x_kind, y_kind, z_kind; 12678 void *x_data, *y_data, *z_data; 12679 12680 /* x must be a string too, of equal length */ 12681 if (!PyUnicode_Check(x)) { 12682 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12683 "be a string if there is a second argument"); 12684 goto err; 12685 } 12686 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12687 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12688 "arguments must have equal length"); 12689 goto err; 12690 } 12691 /* create entries for translating chars in x to those in y */ 12692 x_kind = PyUnicode_KIND(x); 12693 y_kind = PyUnicode_KIND(y); 12694 x_data = PyUnicode_DATA(x); 12695 y_data = PyUnicode_DATA(y); 12696 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12697 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12698 if (!key) 12699 goto err; 12700 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12701 if (!value) { 12702 Py_DECREF(key); 12703 goto err; 12704 } 12705 res = PyDict_SetItem(new, key, value); 12706 Py_DECREF(key); 12707 Py_DECREF(value); 12708 if (res < 
0) 12709 goto err; 12710 } 12711 /* create entries for deleting chars in z */ 12712 if (z != NULL) { 12713 z_kind = PyUnicode_KIND(z); 12714 z_data = PyUnicode_DATA(z); 12715 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 12716 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12717 if (!key) 12718 goto err; 12719 res = PyDict_SetItem(new, key, Py_None); 12720 Py_DECREF(key); 12721 if (res < 0) 12722 goto err; 12723 } 12724 } 12725 } else { 12726 int kind; 12727 void *data; 12728 12729 /* x must be a dict */ 12730 if (!PyDict_CheckExact(x)) { 12731 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12732 "to maketrans it must be a dict"); 12733 goto err; 12734 } 12735 /* copy entries into the new dict, converting string keys to int keys */ 12736 while (PyDict_Next(x, &i, &key, &value)) { 12737 if (PyUnicode_Check(key)) { 12738 /* convert string keys to integer keys */ 12739 PyObject *newkey; 12740 if (PyUnicode_GET_LENGTH(key) != 1) { 12741 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12742 "table must be of length 1"); 12743 goto err; 12744 } 12745 kind = PyUnicode_KIND(key); 12746 data = PyUnicode_DATA(key); 12747 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12748 if (!newkey) 12749 goto err; 12750 res = PyDict_SetItem(new, newkey, value); 12751 Py_DECREF(newkey); 12752 if (res < 0) 12753 goto err; 12754 } else if (PyLong_Check(key)) { 12755 /* just keep integer keys */ 12756 if (PyDict_SetItem(new, key, value) < 0) 12757 goto err; 12758 } else { 12759 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12760 "be strings or integers"); 12761 goto err; 12762 } 12763 } 12764 } 12765 return new; 12766 err: 12767 Py_DECREF(new); 12768 return NULL; 12769} 12770 12771PyDoc_STRVAR(translate__doc__, 12772 "S.translate(table) -> str\n\ 12773\n\ 12774Return a copy of the string S, where all characters have been mapped\n\ 12775through the given translation table, which must be a mapping of\n\ 12776Unicode 
ordinals to Unicode ordinals, strings, or None.\n\ 12777Unmapped characters are left untouched. Characters mapped to None\n\ 12778are deleted."); 12779 12780static PyObject* 12781unicode_translate(PyObject *self, PyObject *table) 12782{ 12783 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12784} 12785 12786PyDoc_STRVAR(upper__doc__, 12787 "S.upper() -> str\n\ 12788\n\ 12789Return a copy of S converted to uppercase."); 12790 12791static PyObject* 12792unicode_upper(PyObject *self) 12793{ 12794 if (PyUnicode_READY(self) == -1) 12795 return NULL; 12796 if (PyUnicode_IS_ASCII(self)) 12797 return ascii_upper_or_lower(self, 0); 12798 return case_operation(self, do_upper); 12799} 12800 12801PyDoc_STRVAR(zfill__doc__, 12802 "S.zfill(width) -> str\n\ 12803\n\ 12804Pad a numeric string S with zeros on the left, to fill a field\n\ 12805of the specified width. The string S is never truncated."); 12806 12807static PyObject * 12808unicode_zfill(PyObject *self, PyObject *args) 12809{ 12810 Py_ssize_t fill; 12811 PyObject *u; 12812 Py_ssize_t width; 12813 int kind; 12814 void *data; 12815 Py_UCS4 chr; 12816 12817 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12818 return NULL; 12819 12820 if (PyUnicode_READY(self) == -1) 12821 return NULL; 12822 12823 if (PyUnicode_GET_LENGTH(self) >= width) 12824 return unicode_result_unchanged(self); 12825 12826 fill = width - PyUnicode_GET_LENGTH(self); 12827 12828 u = pad(self, fill, 0, '0'); 12829 12830 if (u == NULL) 12831 return NULL; 12832 12833 kind = PyUnicode_KIND(u); 12834 data = PyUnicode_DATA(u); 12835 chr = PyUnicode_READ(kind, data, fill); 12836 12837 if (chr == '+' || chr == '-') { 12838 /* move sign to beginning of string */ 12839 PyUnicode_WRITE(kind, data, 0, chr); 12840 PyUnicode_WRITE(kind, data, fill, '0'); 12841 } 12842 12843 assert(_PyUnicode_CheckConsistency(u, 1)); 12844 return u; 12845} 12846 12847#if 0 12848static PyObject * 12849unicode__decimal2ascii(PyObject *self) 12850{ 12851 return 
PyUnicode_TransformDecimalAndSpaceToASCII(self); 12852} 12853#endif 12854 12855PyDoc_STRVAR(startswith__doc__, 12856 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12857\n\ 12858Return True if S starts with the specified prefix, False otherwise.\n\ 12859With optional start, test S beginning at that position.\n\ 12860With optional end, stop comparing S at that position.\n\ 12861prefix can also be a tuple of strings to try."); 12862 12863static PyObject * 12864unicode_startswith(PyObject *self, 12865 PyObject *args) 12866{ 12867 PyObject *subobj; 12868 PyObject *substring; 12869 Py_ssize_t start = 0; 12870 Py_ssize_t end = PY_SSIZE_T_MAX; 12871 int result; 12872 12873 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12874 return NULL; 12875 if (PyTuple_Check(subobj)) { 12876 Py_ssize_t i; 12877 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12878 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 12879 if (substring == NULL) 12880 return NULL; 12881 result = tailmatch(self, substring, start, end, -1); 12882 Py_DECREF(substring); 12883 if (result == -1) 12884 return NULL; 12885 if (result) { 12886 Py_RETURN_TRUE; 12887 } 12888 } 12889 /* nothing matched */ 12890 Py_RETURN_FALSE; 12891 } 12892 substring = PyUnicode_FromObject(subobj); 12893 if (substring == NULL) { 12894 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12895 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12896 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12897 return NULL; 12898 } 12899 result = tailmatch(self, substring, start, end, -1); 12900 Py_DECREF(substring); 12901 if (result == -1) 12902 return NULL; 12903 return PyBool_FromLong(result); 12904} 12905 12906 12907PyDoc_STRVAR(endswith__doc__, 12908 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12909\n\ 12910Return True if S ends with the specified suffix, False otherwise.\n\ 12911With optional start, test S beginning at that position.\n\ 12912With optional end, stop comparing S 
at that position.\n\ 12913suffix can also be a tuple of strings to try."); 12914 12915static PyObject * 12916unicode_endswith(PyObject *self, 12917 PyObject *args) 12918{ 12919 PyObject *subobj; 12920 PyObject *substring; 12921 Py_ssize_t start = 0; 12922 Py_ssize_t end = PY_SSIZE_T_MAX; 12923 int result; 12924 12925 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12926 return NULL; 12927 if (PyTuple_Check(subobj)) { 12928 Py_ssize_t i; 12929 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12930 substring = PyUnicode_FromObject( 12931 PyTuple_GET_ITEM(subobj, i)); 12932 if (substring == NULL) 12933 return NULL; 12934 result = tailmatch(self, substring, start, end, +1); 12935 Py_DECREF(substring); 12936 if (result == -1) 12937 return NULL; 12938 if (result) { 12939 Py_RETURN_TRUE; 12940 } 12941 } 12942 Py_RETURN_FALSE; 12943 } 12944 substring = PyUnicode_FromObject(subobj); 12945 if (substring == NULL) { 12946 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12947 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12948 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12949 return NULL; 12950 } 12951 result = tailmatch(self, substring, start, end, +1); 12952 Py_DECREF(substring); 12953 if (result == -1) 12954 return NULL; 12955 return PyBool_FromLong(result); 12956} 12957 12958Py_LOCAL_INLINE(void) 12959_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 12960{ 12961 if (!writer->readonly) 12962 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 12963 else { 12964 /* Copy-on-write mode: set buffer size to 0 so 12965 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on 12966 * next write. 
*/ 12967 writer->size = 0; 12968 } 12969 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 12970 writer->data = PyUnicode_DATA(writer->buffer); 12971 writer->kind = PyUnicode_KIND(writer->buffer); 12972} 12973 12974void 12975_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) 12976{ 12977 memset(writer, 0, sizeof(*writer)); 12978#ifdef Py_DEBUG 12979 writer->kind = 5; /* invalid kind */ 12980#endif 12981 writer->min_char = 127; 12982} 12983 12984int 12985_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 12986 Py_ssize_t length, Py_UCS4 maxchar) 12987{ 12988 Py_ssize_t newlen; 12989 PyObject *newbuffer; 12990 12991 assert(length > 0); 12992 12993 if (length > PY_SSIZE_T_MAX - writer->pos) { 12994 PyErr_NoMemory(); 12995 return -1; 12996 } 12997 newlen = writer->pos + length; 12998 12999 maxchar = Py_MAX(maxchar, writer->min_char); 13000 13001 if (writer->buffer == NULL) { 13002 assert(!writer->readonly); 13003 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) { 13004 /* overallocate 25% to limit the number of resize */ 13005 newlen += newlen / 4; 13006 } 13007 if (newlen < writer->min_length) 13008 newlen = writer->min_length; 13009 13010 writer->buffer = PyUnicode_New(newlen, maxchar); 13011 if (writer->buffer == NULL) 13012 return -1; 13013 } 13014 else if (newlen > writer->size) { 13015 if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) { 13016 /* overallocate 25% to limit the number of resize */ 13017 newlen += newlen / 4; 13018 } 13019 if (newlen < writer->min_length) 13020 newlen = writer->min_length; 13021 13022 if (maxchar > writer->maxchar || writer->readonly) { 13023 /* resize + widen */ 13024 newbuffer = PyUnicode_New(newlen, maxchar); 13025 if (newbuffer == NULL) 13026 return -1; 13027 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13028 writer->buffer, 0, writer->pos); 13029 Py_DECREF(writer->buffer); 13030 writer->readonly = 0; 13031 } 13032 else { 13033 newbuffer = resize_compact(writer->buffer, 
newlen); 13034 if (newbuffer == NULL) 13035 return -1; 13036 } 13037 writer->buffer = newbuffer; 13038 } 13039 else if (maxchar > writer->maxchar) { 13040 assert(!writer->readonly); 13041 newbuffer = PyUnicode_New(writer->size, maxchar); 13042 if (newbuffer == NULL) 13043 return -1; 13044 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13045 writer->buffer, 0, writer->pos); 13046 Py_DECREF(writer->buffer); 13047 writer->buffer = newbuffer; 13048 } 13049 _PyUnicodeWriter_Update(writer); 13050 return 0; 13051} 13052 13053Py_LOCAL_INLINE(int) 13054_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) 13055{ 13056 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) 13057 return -1; 13058 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13059 writer->pos++; 13060 return 0; 13061} 13062 13063int 13064_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) 13065{ 13066 return _PyUnicodeWriter_WriteCharInline(writer, ch); 13067} 13068 13069int 13070_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 13071{ 13072 Py_UCS4 maxchar; 13073 Py_ssize_t len; 13074 13075 if (PyUnicode_READY(str) == -1) 13076 return -1; 13077 len = PyUnicode_GET_LENGTH(str); 13078 if (len == 0) 13079 return 0; 13080 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 13081 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 13082 if (writer->buffer == NULL && !writer->overallocate) { 13083 writer->readonly = 1; 13084 Py_INCREF(str); 13085 writer->buffer = str; 13086 _PyUnicodeWriter_Update(writer); 13087 writer->pos += len; 13088 return 0; 13089 } 13090 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 13091 return -1; 13092 } 13093 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13094 str, 0, len); 13095 writer->pos += len; 13096 return 0; 13097} 13098 13099int 13100_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, 13101 Py_ssize_t start, Py_ssize_t end) 13102{ 13103 Py_UCS4 maxchar; 13104 
Py_ssize_t len; 13105 13106 if (PyUnicode_READY(str) == -1) 13107 return -1; 13108 13109 assert(0 <= start); 13110 assert(end <= PyUnicode_GET_LENGTH(str)); 13111 assert(start <= end); 13112 13113 if (end == 0) 13114 return 0; 13115 13116 if (start == 0 && end == PyUnicode_GET_LENGTH(str)) 13117 return _PyUnicodeWriter_WriteStr(writer, str); 13118 13119 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 13120 maxchar = _PyUnicode_FindMaxChar(str, start, end); 13121 else 13122 maxchar = writer->maxchar; 13123 len = end - start; 13124 13125 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) 13126 return -1; 13127 13128 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13129 str, start, len); 13130 writer->pos += len; 13131 return 0; 13132} 13133 13134int 13135_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len) 13136{ 13137 Py_UCS4 maxchar; 13138 13139 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 13140 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 13141 return -1; 13142 unicode_write_cstr(writer->buffer, writer->pos, str, len); 13143 writer->pos += len; 13144 return 0; 13145} 13146 13147PyObject * 13148_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 13149{ 13150 if (writer->pos == 0) { 13151 Py_XDECREF(writer->buffer); 13152 _Py_RETURN_UNICODE_EMPTY(); 13153 } 13154 if (writer->readonly) { 13155 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos); 13156 return writer->buffer; 13157 } 13158 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 13159 PyObject *newbuffer; 13160 newbuffer = resize_compact(writer->buffer, writer->pos); 13161 if (newbuffer == NULL) { 13162 Py_DECREF(writer->buffer); 13163 return NULL; 13164 } 13165 writer->buffer = newbuffer; 13166 } 13167 assert(_PyUnicode_CheckConsistency(writer->buffer, 1)); 13168 return unicode_result_ready(writer->buffer); 13169} 13170 13171void 13172_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 13173{ 13174 
Py_CLEAR(writer->buffer); 13175} 13176 13177#include "stringlib/unicode_format.h" 13178 13179PyDoc_STRVAR(format__doc__, 13180 "S.format(*args, **kwargs) -> str\n\ 13181\n\ 13182Return a formatted version of S, using substitutions from args and kwargs.\n\ 13183The substitutions are identified by braces ('{' and '}')."); 13184 13185PyDoc_STRVAR(format_map__doc__, 13186 "S.format_map(mapping) -> str\n\ 13187\n\ 13188Return a formatted version of S, using substitutions from mapping.\n\ 13189The substitutions are identified by braces ('{' and '}')."); 13190 13191static PyObject * 13192unicode__format__(PyObject* self, PyObject* args) 13193{ 13194 PyObject *format_spec; 13195 _PyUnicodeWriter writer; 13196 int ret; 13197 13198 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 13199 return NULL; 13200 13201 if (PyUnicode_READY(self) == -1) 13202 return NULL; 13203 _PyUnicodeWriter_Init(&writer); 13204 ret = _PyUnicode_FormatAdvancedWriter(&writer, 13205 self, format_spec, 0, 13206 PyUnicode_GET_LENGTH(format_spec)); 13207 if (ret == -1) { 13208 _PyUnicodeWriter_Dealloc(&writer); 13209 return NULL; 13210 } 13211 return _PyUnicodeWriter_Finish(&writer); 13212} 13213 13214PyDoc_STRVAR(p_format__doc__, 13215 "S.__format__(format_spec) -> str\n\ 13216\n\ 13217Return a formatted version of S as described by format_spec."); 13218 13219static PyObject * 13220unicode__sizeof__(PyObject *v) 13221{ 13222 Py_ssize_t size; 13223 13224 /* If it's a compact object, account for base structure + 13225 character data. */ 13226 if (PyUnicode_IS_COMPACT_ASCII(v)) 13227 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 13228 else if (PyUnicode_IS_COMPACT(v)) 13229 size = sizeof(PyCompactUnicodeObject) + 13230 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 13231 else { 13232 /* If it is a two-block object, account for base object, and 13233 for character block if present. 
*/ 13234 size = sizeof(PyUnicodeObject); 13235 if (_PyUnicode_DATA_ANY(v)) 13236 size += (PyUnicode_GET_LENGTH(v) + 1) * 13237 PyUnicode_KIND(v); 13238 } 13239 /* If the wstr pointer is present, account for it unless it is shared 13240 with the data pointer. Check if the data is not shared. */ 13241 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 13242 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 13243 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 13244 size += PyUnicode_UTF8_LENGTH(v) + 1; 13245 13246 return PyLong_FromSsize_t(size); 13247} 13248 13249PyDoc_STRVAR(sizeof__doc__, 13250 "S.__sizeof__() -> size of S in memory, in bytes"); 13251 13252static PyObject * 13253unicode_getnewargs(PyObject *v) 13254{ 13255 PyObject *copy = _PyUnicode_Copy(v); 13256 if (!copy) 13257 return NULL; 13258 return Py_BuildValue("(N)", copy); 13259} 13260 13261static PyMethodDef unicode_methods[] = { 13262 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 13263 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 13264 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 13265 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 13266 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 13267 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 13268 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 13269 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 13270 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 13271 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13272 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 13273 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13274 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 13275 {"index", (PyCFunction) unicode_index, METH_VARARGS, 
index__doc__}, 13276 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 13277 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 13278 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 13279 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 13280 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13281 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 13282 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 13283 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 13284 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 13285 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 13286 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 13287 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 13288 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 13289 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13290 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13291 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 13292 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 13293 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 13294 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 13295 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 13296 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 13297 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 13298 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 13299 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13300 {"isidentifier", (PyCFunction) unicode_isidentifier, 
METH_NOARGS, isidentifier__doc__}, 13301 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13302 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13303 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 13304 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13305 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13306 {"maketrans", (PyCFunction) unicode_maketrans, 13307 METH_VARARGS | METH_STATIC, maketrans__doc__}, 13308 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13309#if 0 13310 /* These methods are just used for debugging the implementation. */ 13311 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13312#endif 13313 13314 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13315 {NULL, NULL} 13316}; 13317 13318static PyObject * 13319unicode_mod(PyObject *v, PyObject *w) 13320{ 13321 if (!PyUnicode_Check(v)) 13322 Py_RETURN_NOTIMPLEMENTED; 13323 return PyUnicode_Format(v, w); 13324} 13325 13326static PyNumberMethods unicode_as_number = { 13327 0, /*nb_add*/ 13328 0, /*nb_subtract*/ 13329 0, /*nb_multiply*/ 13330 unicode_mod, /*nb_remainder*/ 13331}; 13332 13333static PySequenceMethods unicode_as_sequence = { 13334 (lenfunc) unicode_length, /* sq_length */ 13335 PyUnicode_Concat, /* sq_concat */ 13336 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13337 (ssizeargfunc) unicode_getitem, /* sq_item */ 13338 0, /* sq_slice */ 13339 0, /* sq_ass_item */ 13340 0, /* sq_ass_slice */ 13341 PyUnicode_Contains, /* sq_contains */ 13342}; 13343 13344static PyObject* 13345unicode_subscript(PyObject* self, PyObject* item) 13346{ 13347 if (PyUnicode_READY(self) == -1) 13348 return NULL; 13349 13350 if (PyIndex_Check(item)) { 13351 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13352 if (i == -1 && PyErr_Occurred()) 13353 return NULL; 13354 if (i < 
0) 13355 i += PyUnicode_GET_LENGTH(self); 13356 return unicode_getitem(self, i); 13357 } else if (PySlice_Check(item)) { 13358 Py_ssize_t start, stop, step, slicelength, cur, i; 13359 PyObject *result; 13360 void *src_data, *dest_data; 13361 int src_kind, dest_kind; 13362 Py_UCS4 ch, max_char, kind_limit; 13363 13364 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13365 &start, &stop, &step, &slicelength) < 0) { 13366 return NULL; 13367 } 13368 13369 if (slicelength <= 0) { 13370 _Py_RETURN_UNICODE_EMPTY(); 13371 } else if (start == 0 && step == 1 && 13372 slicelength == PyUnicode_GET_LENGTH(self)) { 13373 return unicode_result_unchanged(self); 13374 } else if (step == 1) { 13375 return PyUnicode_Substring(self, 13376 start, start + slicelength); 13377 } 13378 /* General case */ 13379 src_kind = PyUnicode_KIND(self); 13380 src_data = PyUnicode_DATA(self); 13381 if (!PyUnicode_IS_ASCII(self)) { 13382 kind_limit = kind_maxchar_limit(src_kind); 13383 max_char = 0; 13384 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13385 ch = PyUnicode_READ(src_kind, src_data, cur); 13386 if (ch > max_char) { 13387 max_char = ch; 13388 if (max_char >= kind_limit) 13389 break; 13390 } 13391 } 13392 } 13393 else 13394 max_char = 127; 13395 result = PyUnicode_New(slicelength, max_char); 13396 if (result == NULL) 13397 return NULL; 13398 dest_kind = PyUnicode_KIND(result); 13399 dest_data = PyUnicode_DATA(result); 13400 13401 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13402 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13403 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13404 } 13405 assert(_PyUnicode_CheckConsistency(result, 1)); 13406 return result; 13407 } else { 13408 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13409 return NULL; 13410 } 13411} 13412 13413static PyMappingMethods unicode_as_mapping = { 13414 (lenfunc)unicode_length, /* mp_length */ 13415 (binaryfunc)unicode_subscript, /* mp_subscript */ 
13416 (objobjargproc)0, /* mp_ass_subscript */ 13417}; 13418 13419 13420/* Helpers for PyUnicode_Format() */ 13421 13422struct unicode_formatter_t { 13423 PyObject *args; 13424 int args_owned; 13425 Py_ssize_t arglen, argidx; 13426 PyObject *dict; 13427 13428 enum PyUnicode_Kind fmtkind; 13429 Py_ssize_t fmtcnt, fmtpos; 13430 void *fmtdata; 13431 PyObject *fmtstr; 13432 13433 _PyUnicodeWriter writer; 13434}; 13435 13436struct unicode_format_arg_t { 13437 Py_UCS4 ch; 13438 int flags; 13439 Py_ssize_t width; 13440 int prec; 13441 int sign; 13442}; 13443 13444static PyObject * 13445unicode_format_getnextarg(struct unicode_formatter_t *ctx) 13446{ 13447 Py_ssize_t argidx = ctx->argidx; 13448 13449 if (argidx < ctx->arglen) { 13450 ctx->argidx++; 13451 if (ctx->arglen < 0) 13452 return ctx->args; 13453 else 13454 return PyTuple_GetItem(ctx->args, argidx); 13455 } 13456 PyErr_SetString(PyExc_TypeError, 13457 "not enough arguments for format string"); 13458 return NULL; 13459} 13460 13461/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13462 13463/* Format a float into the writer if the writer is not NULL, or into *p_output 13464 otherwise. 13465 13466 Return 0 on success, raise an exception and return -1 on error. 
*/ 13467static int 13468formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 13469 PyObject **p_output, 13470 _PyUnicodeWriter *writer) 13471{ 13472 char *p; 13473 double x; 13474 Py_ssize_t len; 13475 int prec; 13476 int dtoa_flags; 13477 13478 x = PyFloat_AsDouble(v); 13479 if (x == -1.0 && PyErr_Occurred()) 13480 return -1; 13481 13482 prec = arg->prec; 13483 if (prec < 0) 13484 prec = 6; 13485 13486 if (arg->flags & F_ALT) 13487 dtoa_flags = Py_DTSF_ALT; 13488 else 13489 dtoa_flags = 0; 13490 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 13491 if (p == NULL) 13492 return -1; 13493 len = strlen(p); 13494 if (writer) { 13495 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) { 13496 PyMem_Free(p); 13497 return -1; 13498 } 13499 unicode_write_cstr(writer->buffer, writer->pos, p, len); 13500 writer->pos += len; 13501 } 13502 else 13503 *p_output = _PyUnicode_FromASCII(p, len); 13504 PyMem_Free(p); 13505 return 0; 13506} 13507 13508/* formatlong() emulates the format codes d, u, o, x and X, and 13509 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13510 * Python's regular ints. 13511 * Return value: a new PyUnicodeObject*, or NULL if error. 13512 * The output string is of the form 13513 * "-"? ("0x" | "0X")? digit+ 13514 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13515 * set in flags. The case of hex digits will be correct, 13516 * There will be at least prec digits, zero-filled on the left if 13517 * necessary to get that many. 13518 * val object to be converted 13519 * flags bitmask of format flags; only F_ALT is looked at 13520 * prec minimum number of digits; 0-fill on left if needed 13521 * type a character in [duoxX]; u acts the same as d 13522 * 13523 * CAUTION: o, x and X conversions on regular ints can never 13524 * produce a '-' sign, but can for Python's unbounded ints. 
13525 */ 13526static PyObject* 13527formatlong(PyObject *val, struct unicode_format_arg_t *arg) 13528{ 13529 PyObject *result = NULL; 13530 char *buf; 13531 Py_ssize_t i; 13532 int sign; /* 1 if '-', else 0 */ 13533 int len; /* number of characters */ 13534 Py_ssize_t llen; 13535 int numdigits; /* len == numnondigits + numdigits */ 13536 int numnondigits = 0; 13537 int prec = arg->prec; 13538 int type = arg->ch; 13539 13540 /* Avoid exceeding SSIZE_T_MAX */ 13541 if (prec > INT_MAX-3) { 13542 PyErr_SetString(PyExc_OverflowError, 13543 "precision too large"); 13544 return NULL; 13545 } 13546 13547 assert(PyLong_Check(val)); 13548 13549 switch (type) { 13550 default: 13551 assert(!"'type' not in [diuoxX]"); 13552 case 'd': 13553 case 'i': 13554 case 'u': 13555 /* Special-case boolean: we want 0/1 */ 13556 if (PyBool_Check(val)) 13557 result = PyNumber_ToBase(val, 10); 13558 else 13559 result = Py_TYPE(val)->tp_str(val); 13560 break; 13561 case 'o': 13562 numnondigits = 2; 13563 result = PyNumber_ToBase(val, 8); 13564 break; 13565 case 'x': 13566 case 'X': 13567 numnondigits = 2; 13568 result = PyNumber_ToBase(val, 16); 13569 break; 13570 } 13571 if (!result) 13572 return NULL; 13573 13574 assert(unicode_modifiable(result)); 13575 assert(PyUnicode_IS_READY(result)); 13576 assert(PyUnicode_IS_ASCII(result)); 13577 13578 /* To modify the string in-place, there can only be one reference. 
*/ 13579 if (Py_REFCNT(result) != 1) { 13580 PyErr_BadInternalCall(); 13581 return NULL; 13582 } 13583 buf = PyUnicode_DATA(result); 13584 llen = PyUnicode_GET_LENGTH(result); 13585 if (llen > INT_MAX) { 13586 PyErr_SetString(PyExc_ValueError, 13587 "string too large in _PyBytes_FormatLong"); 13588 return NULL; 13589 } 13590 len = (int)llen; 13591 sign = buf[0] == '-'; 13592 numnondigits += sign; 13593 numdigits = len - numnondigits; 13594 assert(numdigits > 0); 13595 13596 /* Get rid of base marker unless F_ALT */ 13597 if (((arg->flags & F_ALT) == 0 && 13598 (type == 'o' || type == 'x' || type == 'X'))) { 13599 assert(buf[sign] == '0'); 13600 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 13601 buf[sign+1] == 'o'); 13602 numnondigits -= 2; 13603 buf += 2; 13604 len -= 2; 13605 if (sign) 13606 buf[0] = '-'; 13607 assert(len == numnondigits + numdigits); 13608 assert(numdigits > 0); 13609 } 13610 13611 /* Fill with leading zeroes to meet minimum width. */ 13612 if (prec > numdigits) { 13613 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 13614 numnondigits + prec); 13615 char *b1; 13616 if (!r1) { 13617 Py_DECREF(result); 13618 return NULL; 13619 } 13620 b1 = PyBytes_AS_STRING(r1); 13621 for (i = 0; i < numnondigits; ++i) 13622 *b1++ = *buf++; 13623 for (i = 0; i < prec - numdigits; i++) 13624 *b1++ = '0'; 13625 for (i = 0; i < numdigits; i++) 13626 *b1++ = *buf++; 13627 *b1 = '\0'; 13628 Py_DECREF(result); 13629 result = r1; 13630 buf = PyBytes_AS_STRING(result); 13631 len = numnondigits + prec; 13632 } 13633 13634 /* Fix up case for hex conversions. */ 13635 if (type == 'X') { 13636 /* Need to convert all lower case letters to upper case. 13637 and need to convert 0x to 0X (and -0x to -0X). 
*/ 13638 for (i = 0; i < len; i++) 13639 if (buf[i] >= 'a' && buf[i] <= 'x') 13640 buf[i] -= 'a'-'A'; 13641 } 13642 if (!PyUnicode_Check(result) 13643 || buf != PyUnicode_DATA(result)) { 13644 PyObject *unicode; 13645 unicode = _PyUnicode_FromASCII(buf, len); 13646 Py_DECREF(result); 13647 result = unicode; 13648 } 13649 else if (len != PyUnicode_GET_LENGTH(result)) { 13650 if (PyUnicode_Resize(&result, len) < 0) 13651 Py_CLEAR(result); 13652 } 13653 return result; 13654} 13655 13656/* Format an integer. 13657 * Return 1 if the number has been formatted into the writer, 13658 * 0 if the number has been formatted into *p_output 13659 * -1 and raise an exception on error */ 13660static int 13661mainformatlong(PyObject *v, 13662 struct unicode_format_arg_t *arg, 13663 PyObject **p_output, 13664 _PyUnicodeWriter *writer) 13665{ 13666 PyObject *iobj, *res; 13667 char type = (char)arg->ch; 13668 13669 if (!PyNumber_Check(v)) 13670 goto wrongtype; 13671 13672 if (!PyLong_Check(v)) { 13673 iobj = PyNumber_Long(v); 13674 if (iobj == NULL) { 13675 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13676 goto wrongtype; 13677 return -1; 13678 } 13679 assert(PyLong_Check(iobj)); 13680 } 13681 else { 13682 iobj = v; 13683 Py_INCREF(iobj); 13684 } 13685 13686 if (PyLong_CheckExact(v) 13687 && arg->width == -1 && arg->prec == -1 13688 && !(arg->flags & (F_SIGN | F_BLANK)) 13689 && type != 'X') 13690 { 13691 /* Fast path */ 13692 int alternate = arg->flags & F_ALT; 13693 int base; 13694 13695 switch(type) 13696 { 13697 default: 13698 assert(0 && "'type' not in [diuoxX]"); 13699 case 'd': 13700 case 'i': 13701 case 'u': 13702 base = 10; 13703 break; 13704 case 'o': 13705 base = 8; 13706 break; 13707 case 'x': 13708 case 'X': 13709 base = 16; 13710 break; 13711 } 13712 13713 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 13714 Py_DECREF(iobj); 13715 return -1; 13716 } 13717 Py_DECREF(iobj); 13718 return 1; 13719 } 13720 13721 res = formatlong(iobj, arg); 13722 
Py_DECREF(iobj); 13723 if (res == NULL) 13724 return -1; 13725 *p_output = res; 13726 return 0; 13727 13728wrongtype: 13729 PyErr_Format(PyExc_TypeError, 13730 "%%%c format: a number is required, " 13731 "not %.200s", 13732 type, Py_TYPE(v)->tp_name); 13733 return -1; 13734} 13735 13736static Py_UCS4 13737formatchar(PyObject *v) 13738{ 13739 /* presume that the buffer is at least 3 characters long */ 13740 if (PyUnicode_Check(v)) { 13741 if (PyUnicode_GET_LENGTH(v) == 1) { 13742 return PyUnicode_READ_CHAR(v, 0); 13743 } 13744 goto onError; 13745 } 13746 else { 13747 /* Integer input truncated to a character */ 13748 long x; 13749 x = PyLong_AsLong(v); 13750 if (x == -1 && PyErr_Occurred()) 13751 goto onError; 13752 13753 if (x < 0 || x > MAX_UNICODE) { 13754 PyErr_SetString(PyExc_OverflowError, 13755 "%c arg not in range(0x110000)"); 13756 return (Py_UCS4) -1; 13757 } 13758 13759 return (Py_UCS4) x; 13760 } 13761 13762 onError: 13763 PyErr_SetString(PyExc_TypeError, 13764 "%c requires int or char"); 13765 return (Py_UCS4) -1; 13766} 13767 13768/* Parse options of an argument: flags, width, precision. 13769 Handle also "%(name)" syntax. 13770 13771 Return 0 if the argument has been formatted into arg->str. 13772 Return 1 if the argument has been written into ctx->writer, 13773 Raise an exception and return -1 on error. */ 13774static int 13775unicode_format_arg_parse(struct unicode_formatter_t *ctx, 13776 struct unicode_format_arg_t *arg) 13777{ 13778#define FORMAT_READ(ctx) \ 13779 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 13780 13781 PyObject *v; 13782 13783 if (arg->ch == '(') { 13784 /* Get argument value from a dictionary. Example: "%(name)s". 
*/ 13785 Py_ssize_t keystart; 13786 Py_ssize_t keylen; 13787 PyObject *key; 13788 int pcount = 1; 13789 13790 if (ctx->dict == NULL) { 13791 PyErr_SetString(PyExc_TypeError, 13792 "format requires a mapping"); 13793 return -1; 13794 } 13795 ++ctx->fmtpos; 13796 --ctx->fmtcnt; 13797 keystart = ctx->fmtpos; 13798 /* Skip over balanced parentheses */ 13799 while (pcount > 0 && --ctx->fmtcnt >= 0) { 13800 arg->ch = FORMAT_READ(ctx); 13801 if (arg->ch == ')') 13802 --pcount; 13803 else if (arg->ch == '(') 13804 ++pcount; 13805 ctx->fmtpos++; 13806 } 13807 keylen = ctx->fmtpos - keystart - 1; 13808 if (ctx->fmtcnt < 0 || pcount > 0) { 13809 PyErr_SetString(PyExc_ValueError, 13810 "incomplete format key"); 13811 return -1; 13812 } 13813 key = PyUnicode_Substring(ctx->fmtstr, 13814 keystart, keystart + keylen); 13815 if (key == NULL) 13816 return -1; 13817 if (ctx->args_owned) { 13818 Py_DECREF(ctx->args); 13819 ctx->args_owned = 0; 13820 } 13821 ctx->args = PyObject_GetItem(ctx->dict, key); 13822 Py_DECREF(key); 13823 if (ctx->args == NULL) 13824 return -1; 13825 ctx->args_owned = 1; 13826 ctx->arglen = -1; 13827 ctx->argidx = -2; 13828 } 13829 13830 /* Parse flags. Example: "%+i" => flags=F_SIGN. */ 13831 while (--ctx->fmtcnt >= 0) { 13832 arg->ch = FORMAT_READ(ctx); 13833 ctx->fmtpos++; 13834 switch (arg->ch) { 13835 case '-': arg->flags |= F_LJUST; continue; 13836 case '+': arg->flags |= F_SIGN; continue; 13837 case ' ': arg->flags |= F_BLANK; continue; 13838 case '#': arg->flags |= F_ALT; continue; 13839 case '0': arg->flags |= F_ZERO; continue; 13840 } 13841 break; 13842 } 13843 13844 /* Parse width. 
Example: "%10s" => width=10 */ 13845 if (arg->ch == '*') { 13846 v = unicode_format_getnextarg(ctx); 13847 if (v == NULL) 13848 return -1; 13849 if (!PyLong_Check(v)) { 13850 PyErr_SetString(PyExc_TypeError, 13851 "* wants int"); 13852 return -1; 13853 } 13854 arg->width = PyLong_AsSsize_t(v); 13855 if (arg->width == -1 && PyErr_Occurred()) 13856 return -1; 13857 if (arg->width < 0) { 13858 arg->flags |= F_LJUST; 13859 arg->width = -arg->width; 13860 } 13861 if (--ctx->fmtcnt >= 0) { 13862 arg->ch = FORMAT_READ(ctx); 13863 ctx->fmtpos++; 13864 } 13865 } 13866 else if (arg->ch >= '0' && arg->ch <= '9') { 13867 arg->width = arg->ch - '0'; 13868 while (--ctx->fmtcnt >= 0) { 13869 arg->ch = FORMAT_READ(ctx); 13870 ctx->fmtpos++; 13871 if (arg->ch < '0' || arg->ch > '9') 13872 break; 13873 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 13874 mixing signed and unsigned comparison. Since arg->ch is between 13875 '0' and '9', casting to int is safe. */ 13876 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 13877 PyErr_SetString(PyExc_ValueError, 13878 "width too big"); 13879 return -1; 13880 } 13881 arg->width = arg->width*10 + (arg->ch - '0'); 13882 } 13883 } 13884 13885 /* Parse precision. 
Example: "%.3f" => prec=3 */ 13886 if (arg->ch == '.') { 13887 arg->prec = 0; 13888 if (--ctx->fmtcnt >= 0) { 13889 arg->ch = FORMAT_READ(ctx); 13890 ctx->fmtpos++; 13891 } 13892 if (arg->ch == '*') { 13893 v = unicode_format_getnextarg(ctx); 13894 if (v == NULL) 13895 return -1; 13896 if (!PyLong_Check(v)) { 13897 PyErr_SetString(PyExc_TypeError, 13898 "* wants int"); 13899 return -1; 13900 } 13901 arg->prec = _PyLong_AsInt(v); 13902 if (arg->prec == -1 && PyErr_Occurred()) 13903 return -1; 13904 if (arg->prec < 0) 13905 arg->prec = 0; 13906 if (--ctx->fmtcnt >= 0) { 13907 arg->ch = FORMAT_READ(ctx); 13908 ctx->fmtpos++; 13909 } 13910 } 13911 else if (arg->ch >= '0' && arg->ch <= '9') { 13912 arg->prec = arg->ch - '0'; 13913 while (--ctx->fmtcnt >= 0) { 13914 arg->ch = FORMAT_READ(ctx); 13915 ctx->fmtpos++; 13916 if (arg->ch < '0' || arg->ch > '9') 13917 break; 13918 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 13919 PyErr_SetString(PyExc_ValueError, 13920 "precision too big"); 13921 return -1; 13922 } 13923 arg->prec = arg->prec*10 + (arg->ch - '0'); 13924 } 13925 } 13926 } 13927 13928 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 13929 if (ctx->fmtcnt >= 0) { 13930 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 13931 if (--ctx->fmtcnt >= 0) { 13932 arg->ch = FORMAT_READ(ctx); 13933 ctx->fmtpos++; 13934 } 13935 } 13936 } 13937 if (ctx->fmtcnt < 0) { 13938 PyErr_SetString(PyExc_ValueError, 13939 "incomplete format"); 13940 return -1; 13941 } 13942 return 0; 13943 13944#undef FORMAT_READ 13945} 13946 13947/* Format one argument. Supported conversion specifiers: 13948 13949 - "s", "r", "a": any type 13950 - "i", "d", "u", "o", "x", "X": int 13951 - "e", "E", "f", "F", "g", "G": float 13952 - "c": int or str (1 character) 13953 13954 When possible, the output is written directly into the Unicode writer 13955 (ctx->writer). A string is created when padding is required. 
13956 13957 Return 0 if the argument has been formatted into *p_str, 13958 1 if the argument has been written into ctx->writer, 13959 -1 on error. */ 13960static int 13961unicode_format_arg_format(struct unicode_formatter_t *ctx, 13962 struct unicode_format_arg_t *arg, 13963 PyObject **p_str) 13964{ 13965 PyObject *v; 13966 _PyUnicodeWriter *writer = &ctx->writer; 13967 13968 if (ctx->fmtcnt == 0) 13969 ctx->writer.overallocate = 0; 13970 13971 if (arg->ch == '%') { 13972 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 13973 return -1; 13974 return 1; 13975 } 13976 13977 v = unicode_format_getnextarg(ctx); 13978 if (v == NULL) 13979 return -1; 13980 13981 13982 switch (arg->ch) { 13983 case 's': 13984 case 'r': 13985 case 'a': 13986 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { 13987 /* Fast path */ 13988 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) 13989 return -1; 13990 return 1; 13991 } 13992 13993 if (PyUnicode_CheckExact(v) && arg->ch == 's') { 13994 *p_str = v; 13995 Py_INCREF(*p_str); 13996 } 13997 else { 13998 if (arg->ch == 's') 13999 *p_str = PyObject_Str(v); 14000 else if (arg->ch == 'r') 14001 *p_str = PyObject_Repr(v); 14002 else 14003 *p_str = PyObject_ASCII(v); 14004 } 14005 break; 14006 14007 case 'i': 14008 case 'd': 14009 case 'u': 14010 case 'o': 14011 case 'x': 14012 case 'X': 14013 { 14014 int ret = mainformatlong(v, arg, p_str, writer); 14015 if (ret != 0) 14016 return ret; 14017 arg->sign = 1; 14018 break; 14019 } 14020 14021 case 'e': 14022 case 'E': 14023 case 'f': 14024 case 'F': 14025 case 'g': 14026 case 'G': 14027 if (arg->width == -1 && arg->prec == -1 14028 && !(arg->flags & (F_SIGN | F_BLANK))) 14029 { 14030 /* Fast path */ 14031 if (formatfloat(v, arg, NULL, writer) == -1) 14032 return -1; 14033 return 1; 14034 } 14035 14036 arg->sign = 1; 14037 if (formatfloat(v, arg, p_str, NULL) == -1) 14038 return -1; 14039 break; 14040 14041 case 'c': 14042 { 14043 Py_UCS4 ch = formatchar(v); 
14044 if (ch == (Py_UCS4) -1) 14045 return -1; 14046 if (arg->width == -1 && arg->prec == -1) { 14047 /* Fast path */ 14048 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) 14049 return -1; 14050 return 1; 14051 } 14052 *p_str = PyUnicode_FromOrdinal(ch); 14053 break; 14054 } 14055 14056 default: 14057 PyErr_Format(PyExc_ValueError, 14058 "unsupported format character '%c' (0x%x) " 14059 "at index %zd", 14060 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', 14061 (int)arg->ch, 14062 ctx->fmtpos - 1); 14063 return -1; 14064 } 14065 if (*p_str == NULL) 14066 return -1; 14067 assert (PyUnicode_Check(*p_str)); 14068 return 0; 14069} 14070 14071static int 14072unicode_format_arg_output(struct unicode_formatter_t *ctx, 14073 struct unicode_format_arg_t *arg, 14074 PyObject *str) 14075{ 14076 Py_ssize_t len; 14077 enum PyUnicode_Kind kind; 14078 void *pbuf; 14079 Py_ssize_t pindex; 14080 Py_UCS4 signchar; 14081 Py_ssize_t buflen; 14082 Py_UCS4 maxchar; 14083 Py_ssize_t sublen; 14084 _PyUnicodeWriter *writer = &ctx->writer; 14085 Py_UCS4 fill; 14086 14087 fill = ' '; 14088 if (arg->sign && arg->flags & F_ZERO) 14089 fill = '0'; 14090 14091 if (PyUnicode_READY(str) == -1) 14092 return -1; 14093 14094 len = PyUnicode_GET_LENGTH(str); 14095 if ((arg->width == -1 || arg->width <= len) 14096 && (arg->prec == -1 || arg->prec >= len) 14097 && !(arg->flags & (F_SIGN | F_BLANK))) 14098 { 14099 /* Fast path */ 14100 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) 14101 return -1; 14102 return 0; 14103 } 14104 14105 /* Truncate the string for "s", "r" and "a" formats 14106 if the precision is set */ 14107 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { 14108 if (arg->prec >= 0 && len > arg->prec) 14109 len = arg->prec; 14110 } 14111 14112 /* Adjust sign and width */ 14113 kind = PyUnicode_KIND(str); 14114 pbuf = PyUnicode_DATA(str); 14115 pindex = 0; 14116 signchar = '\0'; 14117 if (arg->sign) { 14118 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 14119 if (ch 
== '-' || ch == '+') { 14120 signchar = ch; 14121 len--; 14122 pindex++; 14123 } 14124 else if (arg->flags & F_SIGN) 14125 signchar = '+'; 14126 else if (arg->flags & F_BLANK) 14127 signchar = ' '; 14128 else 14129 arg->sign = 0; 14130 } 14131 if (arg->width < len) 14132 arg->width = len; 14133 14134 /* Prepare the writer */ 14135 maxchar = writer->maxchar; 14136 if (!(arg->flags & F_LJUST)) { 14137 if (arg->sign) { 14138 if ((arg->width-1) > len) 14139 maxchar = Py_MAX(maxchar, fill); 14140 } 14141 else { 14142 if (arg->width > len) 14143 maxchar = Py_MAX(maxchar, fill); 14144 } 14145 } 14146 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { 14147 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); 14148 maxchar = Py_MAX(maxchar, strmaxchar); 14149 } 14150 14151 buflen = arg->width; 14152 if (arg->sign && len == arg->width) 14153 buflen++; 14154 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) 14155 return -1; 14156 14157 /* Write the sign if needed */ 14158 if (arg->sign) { 14159 if (fill != ' ') { 14160 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14161 writer->pos += 1; 14162 } 14163 if (arg->width > len) 14164 arg->width--; 14165 } 14166 14167 /* Write the numeric prefix for "x", "X" and "o" formats 14168 if the alternate form is used. 14169 For example, write "0x" for the "%#x" format. 
*/ 14170 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14171 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14172 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); 14173 if (fill != ' ') { 14174 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14175 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14176 writer->pos += 2; 14177 pindex += 2; 14178 } 14179 arg->width -= 2; 14180 if (arg->width < 0) 14181 arg->width = 0; 14182 len -= 2; 14183 } 14184 14185 /* Pad left with the fill character if needed */ 14186 if (arg->width > len && !(arg->flags & F_LJUST)) { 14187 sublen = arg->width - len; 14188 FILL(writer->kind, writer->data, fill, writer->pos, sublen); 14189 writer->pos += sublen; 14190 arg->width = len; 14191 } 14192 14193 /* If padding with spaces: write sign if needed and/or numeric prefix if 14194 the alternate form is used */ 14195 if (fill == ' ') { 14196 if (arg->sign) { 14197 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14198 writer->pos += 1; 14199 } 14200 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14201 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14202 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); 14203 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14204 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14205 writer->pos += 2; 14206 pindex += 2; 14207 } 14208 } 14209 14210 /* Write characters */ 14211 if (len) { 14212 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 14213 str, pindex, len); 14214 writer->pos += len; 14215 } 14216 14217 /* Pad right with the fill character if needed */ 14218 if (arg->width > len) { 14219 sublen = arg->width - len; 14220 FILL(writer->kind, writer->data, ' ', writer->pos, sublen); 14221 writer->pos += sublen; 14222 } 14223 return 0; 14224} 14225 14226/* Helper of PyUnicode_Format(): format one arg. 
14227 Return 0 on success, raise an exception and return -1 on error. */ 14228static int 14229unicode_format_arg(struct unicode_formatter_t *ctx) 14230{ 14231 struct unicode_format_arg_t arg; 14232 PyObject *str; 14233 int ret; 14234 14235 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); 14236 arg.flags = 0; 14237 arg.width = -1; 14238 arg.prec = -1; 14239 arg.sign = 0; 14240 str = NULL; 14241 14242 ret = unicode_format_arg_parse(ctx, &arg); 14243 if (ret == -1) 14244 return -1; 14245 14246 ret = unicode_format_arg_format(ctx, &arg, &str); 14247 if (ret == -1) 14248 return -1; 14249 14250 if (ret != 1) { 14251 ret = unicode_format_arg_output(ctx, &arg, str); 14252 Py_DECREF(str); 14253 if (ret == -1) 14254 return -1; 14255 } 14256 14257 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') { 14258 PyErr_SetString(PyExc_TypeError, 14259 "not all arguments converted during string formatting"); 14260 return -1; 14261 } 14262 return 0; 14263} 14264 14265PyObject * 14266PyUnicode_Format(PyObject *format, PyObject *args) 14267{ 14268 struct unicode_formatter_t ctx; 14269 14270 if (format == NULL || args == NULL) { 14271 PyErr_BadInternalCall(); 14272 return NULL; 14273 } 14274 14275 ctx.fmtstr = PyUnicode_FromObject(format); 14276 if (ctx.fmtstr == NULL) 14277 return NULL; 14278 if (PyUnicode_READY(ctx.fmtstr) == -1) { 14279 Py_DECREF(ctx.fmtstr); 14280 return NULL; 14281 } 14282 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 14283 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 14284 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 14285 ctx.fmtpos = 0; 14286 14287 _PyUnicodeWriter_Init(&ctx.writer); 14288 ctx.writer.min_length = ctx.fmtcnt + 100; 14289 ctx.writer.overallocate = 1; 14290 14291 if (PyTuple_Check(args)) { 14292 ctx.arglen = PyTuple_Size(args); 14293 ctx.argidx = 0; 14294 } 14295 else { 14296 ctx.arglen = -1; 14297 ctx.argidx = -2; 14298 } 14299 ctx.args_owned = 0; 14300 if (PyMapping_Check(args) && !PyTuple_Check(args) && 
!PyUnicode_Check(args)) 14301 ctx.dict = args; 14302 else 14303 ctx.dict = NULL; 14304 ctx.args = args; 14305 14306 while (--ctx.fmtcnt >= 0) { 14307 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14308 Py_ssize_t nonfmtpos; 14309 14310 nonfmtpos = ctx.fmtpos++; 14311 while (ctx.fmtcnt >= 0 && 14312 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14313 ctx.fmtpos++; 14314 ctx.fmtcnt--; 14315 } 14316 if (ctx.fmtcnt < 0) { 14317 ctx.fmtpos--; 14318 ctx.writer.overallocate = 0; 14319 } 14320 14321 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, 14322 nonfmtpos, ctx.fmtpos) < 0) 14323 goto onError; 14324 } 14325 else { 14326 ctx.fmtpos++; 14327 if (unicode_format_arg(&ctx) == -1) 14328 goto onError; 14329 } 14330 } 14331 14332 if (ctx.argidx < ctx.arglen && !ctx.dict) { 14333 PyErr_SetString(PyExc_TypeError, 14334 "not all arguments converted during string formatting"); 14335 goto onError; 14336 } 14337 14338 if (ctx.args_owned) { 14339 Py_DECREF(ctx.args); 14340 } 14341 Py_DECREF(ctx.fmtstr); 14342 return _PyUnicodeWriter_Finish(&ctx.writer); 14343 14344 onError: 14345 Py_DECREF(ctx.fmtstr); 14346 _PyUnicodeWriter_Dealloc(&ctx.writer); 14347 if (ctx.args_owned) { 14348 Py_DECREF(ctx.args); 14349 } 14350 return NULL; 14351} 14352 14353static PyObject * 14354unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 14355 14356static PyObject * 14357unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14358{ 14359 PyObject *x = NULL; 14360 static char *kwlist[] = {"object", "encoding", "errors", 0}; 14361 char *encoding = NULL; 14362 char *errors = NULL; 14363 14364 if (type != &PyUnicode_Type) 14365 return unicode_subtype_new(type, args, kwds); 14366 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 14367 kwlist, &x, &encoding, &errors)) 14368 return NULL; 14369 if (x == NULL) 14370 _Py_RETURN_UNICODE_EMPTY(); 14371 if (encoding == NULL && errors == NULL) 14372 return PyObject_Str(x); 14373 
else 14374 return PyUnicode_FromEncodedObject(x, encoding, errors); 14375} 14376 14377static PyObject * 14378unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14379{ 14380 PyObject *unicode, *self; 14381 Py_ssize_t length, char_size; 14382 int share_wstr, share_utf8; 14383 unsigned int kind; 14384 void *data; 14385 14386 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 14387 14388 unicode = unicode_new(&PyUnicode_Type, args, kwds); 14389 if (unicode == NULL) 14390 return NULL; 14391 assert(_PyUnicode_CHECK(unicode)); 14392 if (PyUnicode_READY(unicode) == -1) { 14393 Py_DECREF(unicode); 14394 return NULL; 14395 } 14396 14397 self = type->tp_alloc(type, 0); 14398 if (self == NULL) { 14399 Py_DECREF(unicode); 14400 return NULL; 14401 } 14402 kind = PyUnicode_KIND(unicode); 14403 length = PyUnicode_GET_LENGTH(unicode); 14404 14405 _PyUnicode_LENGTH(self) = length; 14406#ifdef Py_DEBUG 14407 _PyUnicode_HASH(self) = -1; 14408#else 14409 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14410#endif 14411 _PyUnicode_STATE(self).interned = 0; 14412 _PyUnicode_STATE(self).kind = kind; 14413 _PyUnicode_STATE(self).compact = 0; 14414 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14415 _PyUnicode_STATE(self).ready = 1; 14416 _PyUnicode_WSTR(self) = NULL; 14417 _PyUnicode_UTF8_LENGTH(self) = 0; 14418 _PyUnicode_UTF8(self) = NULL; 14419 _PyUnicode_WSTR_LENGTH(self) = 0; 14420 _PyUnicode_DATA_ANY(self) = NULL; 14421 14422 share_utf8 = 0; 14423 share_wstr = 0; 14424 if (kind == PyUnicode_1BYTE_KIND) { 14425 char_size = 1; 14426 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14427 share_utf8 = 1; 14428 } 14429 else if (kind == PyUnicode_2BYTE_KIND) { 14430 char_size = 2; 14431 if (sizeof(wchar_t) == 2) 14432 share_wstr = 1; 14433 } 14434 else { 14435 assert(kind == PyUnicode_4BYTE_KIND); 14436 char_size = 4; 14437 if (sizeof(wchar_t) == 4) 14438 share_wstr = 1; 14439 } 14440 14441 /* Ensure we won't overflow the length. 
*/ 14442 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14443 PyErr_NoMemory(); 14444 goto onError; 14445 } 14446 data = PyObject_MALLOC((length + 1) * char_size); 14447 if (data == NULL) { 14448 PyErr_NoMemory(); 14449 goto onError; 14450 } 14451 14452 _PyUnicode_DATA_ANY(self) = data; 14453 if (share_utf8) { 14454 _PyUnicode_UTF8_LENGTH(self) = length; 14455 _PyUnicode_UTF8(self) = data; 14456 } 14457 if (share_wstr) { 14458 _PyUnicode_WSTR_LENGTH(self) = length; 14459 _PyUnicode_WSTR(self) = (wchar_t *)data; 14460 } 14461 14462 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14463 kind * (length + 1)); 14464 assert(_PyUnicode_CheckConsistency(self, 1)); 14465#ifdef Py_DEBUG 14466 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14467#endif 14468 Py_DECREF(unicode); 14469 return self; 14470 14471onError: 14472 Py_DECREF(unicode); 14473 Py_DECREF(self); 14474 return NULL; 14475} 14476 14477PyDoc_STRVAR(unicode_doc, 14478"str(object='') -> str\n\ 14479str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 14480\n\ 14481Create a new string object from the given object. 
If encoding or\n\ 14482errors is specified, then the object must expose a data buffer\n\ 14483that will be decoded using the given encoding and error handler.\n\ 14484Otherwise, returns the result of object.__str__() (if defined)\n\ 14485or repr(object).\n\ 14486encoding defaults to sys.getdefaultencoding().\n\ 14487errors defaults to 'strict'."); 14488 14489static PyObject *unicode_iter(PyObject *seq); 14490 14491PyTypeObject PyUnicode_Type = { 14492 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14493 "str", /* tp_name */ 14494 sizeof(PyUnicodeObject), /* tp_size */ 14495 0, /* tp_itemsize */ 14496 /* Slots */ 14497 (destructor)unicode_dealloc, /* tp_dealloc */ 14498 0, /* tp_print */ 14499 0, /* tp_getattr */ 14500 0, /* tp_setattr */ 14501 0, /* tp_reserved */ 14502 unicode_repr, /* tp_repr */ 14503 &unicode_as_number, /* tp_as_number */ 14504 &unicode_as_sequence, /* tp_as_sequence */ 14505 &unicode_as_mapping, /* tp_as_mapping */ 14506 (hashfunc) unicode_hash, /* tp_hash*/ 14507 0, /* tp_call*/ 14508 (reprfunc) unicode_str, /* tp_str */ 14509 PyObject_GenericGetAttr, /* tp_getattro */ 14510 0, /* tp_setattro */ 14511 0, /* tp_as_buffer */ 14512 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14513 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14514 unicode_doc, /* tp_doc */ 14515 0, /* tp_traverse */ 14516 0, /* tp_clear */ 14517 PyUnicode_RichCompare, /* tp_richcompare */ 14518 0, /* tp_weaklistoffset */ 14519 unicode_iter, /* tp_iter */ 14520 0, /* tp_iternext */ 14521 unicode_methods, /* tp_methods */ 14522 0, /* tp_members */ 14523 0, /* tp_getset */ 14524 &PyBaseObject_Type, /* tp_base */ 14525 0, /* tp_dict */ 14526 0, /* tp_descr_get */ 14527 0, /* tp_descr_set */ 14528 0, /* tp_dictoffset */ 14529 0, /* tp_init */ 14530 0, /* tp_alloc */ 14531 unicode_new, /* tp_new */ 14532 PyObject_Del, /* tp_free */ 14533}; 14534 14535/* Initialize the Unicode implementation */ 14536 14537int _PyUnicode_Init(void) 14538{ 14539 /* XXX - move this array to unicodectype.c ? 
*/ 14540 Py_UCS2 linebreak[] = { 14541 0x000A, /* LINE FEED */ 14542 0x000D, /* CARRIAGE RETURN */ 14543 0x001C, /* FILE SEPARATOR */ 14544 0x001D, /* GROUP SEPARATOR */ 14545 0x001E, /* RECORD SEPARATOR */ 14546 0x0085, /* NEXT LINE */ 14547 0x2028, /* LINE SEPARATOR */ 14548 0x2029, /* PARAGRAPH SEPARATOR */ 14549 }; 14550 14551 /* Init the implementation */ 14552 _Py_INCREF_UNICODE_EMPTY(); 14553 if (!unicode_empty) 14554 Py_FatalError("Can't create empty string"); 14555 Py_DECREF(unicode_empty); 14556 14557 if (PyType_Ready(&PyUnicode_Type) < 0) 14558 Py_FatalError("Can't initialize 'unicode'"); 14559 14560 /* initialize the linebreak bloom filter */ 14561 bloom_linebreak = make_bloom_mask( 14562 PyUnicode_2BYTE_KIND, linebreak, 14563 Py_ARRAY_LENGTH(linebreak)); 14564 14565 PyType_Ready(&EncodingMapType); 14566 14567 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 14568 Py_FatalError("Can't initialize field name iterator type"); 14569 14570 if (PyType_Ready(&PyFormatterIter_Type) < 0) 14571 Py_FatalError("Can't initialize formatter iter type"); 14572 14573#ifdef HAVE_MBCS 14574 winver.dwOSVersionInfoSize = sizeof(winver); 14575 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 14576 PyErr_SetFromWindowsErr(0); 14577 return -1; 14578 } 14579#endif 14580 return 0; 14581} 14582 14583/* Finalize the Unicode implementation */ 14584 14585int 14586PyUnicode_ClearFreeList(void) 14587{ 14588 return 0; 14589} 14590 14591void 14592_PyUnicode_Fini(void) 14593{ 14594 int i; 14595 14596 Py_CLEAR(unicode_empty); 14597 14598 for (i = 0; i < 256; i++) 14599 Py_CLEAR(unicode_latin1[i]); 14600 _PyUnicode_ClearStaticStrings(); 14601 (void)PyUnicode_ClearFreeList(); 14602} 14603 14604void 14605PyUnicode_InternInPlace(PyObject **p) 14606{ 14607 register PyObject *s = *p; 14608 PyObject *t; 14609#ifdef Py_DEBUG 14610 assert(s != NULL); 14611 assert(_PyUnicode_CHECK(s)); 14612#else 14613 if (s == NULL || !PyUnicode_Check(s)) 14614 return; 14615#endif 14616 /* If it's a subclass, we don't 
really know what putting 14617 it in the interned dict might do. */ 14618 if (!PyUnicode_CheckExact(s)) 14619 return; 14620 if (PyUnicode_CHECK_INTERNED(s)) 14621 return; 14622 if (interned == NULL) { 14623 interned = PyDict_New(); 14624 if (interned == NULL) { 14625 PyErr_Clear(); /* Don't leave an exception */ 14626 return; 14627 } 14628 } 14629 /* It might be that the GetItem call fails even 14630 though the key is present in the dictionary, 14631 namely when this happens during a stack overflow. */ 14632 Py_ALLOW_RECURSION 14633 t = PyDict_GetItem(interned, s); 14634 Py_END_ALLOW_RECURSION 14635 14636 if (t) { 14637 Py_INCREF(t); 14638 Py_DECREF(*p); 14639 *p = t; 14640 return; 14641 } 14642 14643 PyThreadState_GET()->recursion_critical = 1; 14644 if (PyDict_SetItem(interned, s, s) < 0) { 14645 PyErr_Clear(); 14646 PyThreadState_GET()->recursion_critical = 0; 14647 return; 14648 } 14649 PyThreadState_GET()->recursion_critical = 0; 14650 /* The two references in interned are not counted by refcnt. 
14651 The deallocator will take care of this */ 14652 Py_REFCNT(s) -= 2; 14653 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 14654} 14655 14656void 14657PyUnicode_InternImmortal(PyObject **p) 14658{ 14659 PyUnicode_InternInPlace(p); 14660 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 14661 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 14662 Py_INCREF(*p); 14663 } 14664} 14665 14666PyObject * 14667PyUnicode_InternFromString(const char *cp) 14668{ 14669 PyObject *s = PyUnicode_FromString(cp); 14670 if (s == NULL) 14671 return NULL; 14672 PyUnicode_InternInPlace(&s); 14673 return s; 14674} 14675 14676void 14677_Py_ReleaseInternedUnicodeStrings(void) 14678{ 14679 PyObject *keys; 14680 PyObject *s; 14681 Py_ssize_t i, n; 14682 Py_ssize_t immortal_size = 0, mortal_size = 0; 14683 14684 if (interned == NULL || !PyDict_Check(interned)) 14685 return; 14686 keys = PyDict_Keys(interned); 14687 if (keys == NULL || !PyList_Check(keys)) { 14688 PyErr_Clear(); 14689 return; 14690 } 14691 14692 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 14693 detector, interned unicode strings are not forcibly deallocated; 14694 rather, we give them their stolen references back, and then clear 14695 and DECREF the interned dict. 
*/ 14696 14697 n = PyList_GET_SIZE(keys); 14698 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 14699 n); 14700 for (i = 0; i < n; i++) { 14701 s = PyList_GET_ITEM(keys, i); 14702 if (PyUnicode_READY(s) == -1) { 14703 assert(0 && "could not ready string"); 14704 fprintf(stderr, "could not ready string\n"); 14705 } 14706 switch (PyUnicode_CHECK_INTERNED(s)) { 14707 case SSTATE_NOT_INTERNED: 14708 /* XXX Shouldn't happen */ 14709 break; 14710 case SSTATE_INTERNED_IMMORTAL: 14711 Py_REFCNT(s) += 1; 14712 immortal_size += PyUnicode_GET_LENGTH(s); 14713 break; 14714 case SSTATE_INTERNED_MORTAL: 14715 Py_REFCNT(s) += 2; 14716 mortal_size += PyUnicode_GET_LENGTH(s); 14717 break; 14718 default: 14719 Py_FatalError("Inconsistent interned string state."); 14720 } 14721 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 14722 } 14723 fprintf(stderr, "total size of all interned strings: " 14724 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 14725 "mortal/immortal\n", mortal_size, immortal_size); 14726 Py_DECREF(keys); 14727 PyDict_Clear(interned); 14728 Py_CLEAR(interned); 14729} 14730 14731 14732/********************* Unicode Iterator **************************/ 14733 14734typedef struct { 14735 PyObject_HEAD 14736 Py_ssize_t it_index; 14737 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 14738} unicodeiterobject; 14739 14740static void 14741unicodeiter_dealloc(unicodeiterobject *it) 14742{ 14743 _PyObject_GC_UNTRACK(it); 14744 Py_XDECREF(it->it_seq); 14745 PyObject_GC_Del(it); 14746} 14747 14748static int 14749unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 14750{ 14751 Py_VISIT(it->it_seq); 14752 return 0; 14753} 14754 14755static PyObject * 14756unicodeiter_next(unicodeiterobject *it) 14757{ 14758 PyObject *seq, *item; 14759 14760 assert(it != NULL); 14761 seq = it->it_seq; 14762 if (seq == NULL) 14763 return NULL; 14764 assert(_PyUnicode_CHECK(seq)); 14765 14766 if (it->it_index < PyUnicode_GET_LENGTH(seq)) 
{ 14767 int kind = PyUnicode_KIND(seq); 14768 void *data = PyUnicode_DATA(seq); 14769 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 14770 item = PyUnicode_FromOrdinal(chr); 14771 if (item != NULL) 14772 ++it->it_index; 14773 return item; 14774 } 14775 14776 Py_DECREF(seq); 14777 it->it_seq = NULL; 14778 return NULL; 14779} 14780 14781static PyObject * 14782unicodeiter_len(unicodeiterobject *it) 14783{ 14784 Py_ssize_t len = 0; 14785 if (it->it_seq) 14786 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 14787 return PyLong_FromSsize_t(len); 14788} 14789 14790PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 14791 14792static PyObject * 14793unicodeiter_reduce(unicodeiterobject *it) 14794{ 14795 if (it->it_seq != NULL) { 14796 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 14797 it->it_seq, it->it_index); 14798 } else { 14799 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 14800 if (u == NULL) 14801 return NULL; 14802 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 14803 } 14804} 14805 14806PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 14807 14808static PyObject * 14809unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 14810{ 14811 Py_ssize_t index = PyLong_AsSsize_t(state); 14812 if (index == -1 && PyErr_Occurred()) 14813 return NULL; 14814 if (index < 0) 14815 index = 0; 14816 it->it_index = index; 14817 Py_RETURN_NONE; 14818} 14819 14820PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 14821 14822static PyMethodDef unicodeiter_methods[] = { 14823 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 14824 length_hint_doc}, 14825 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 14826 reduce_doc}, 14827 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 14828 setstate_doc}, 14829 {NULL, NULL} /* sentinel */ 14830}; 14831 14832PyTypeObject PyUnicodeIter_Type = { 14833 PyVarObject_HEAD_INIT(&PyType_Type, 0) 
14834 "str_iterator", /* tp_name */ 14835 sizeof(unicodeiterobject), /* tp_basicsize */ 14836 0, /* tp_itemsize */ 14837 /* methods */ 14838 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 14839 0, /* tp_print */ 14840 0, /* tp_getattr */ 14841 0, /* tp_setattr */ 14842 0, /* tp_reserved */ 14843 0, /* tp_repr */ 14844 0, /* tp_as_number */ 14845 0, /* tp_as_sequence */ 14846 0, /* tp_as_mapping */ 14847 0, /* tp_hash */ 14848 0, /* tp_call */ 14849 0, /* tp_str */ 14850 PyObject_GenericGetAttr, /* tp_getattro */ 14851 0, /* tp_setattro */ 14852 0, /* tp_as_buffer */ 14853 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 14854 0, /* tp_doc */ 14855 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 14856 0, /* tp_clear */ 14857 0, /* tp_richcompare */ 14858 0, /* tp_weaklistoffset */ 14859 PyObject_SelfIter, /* tp_iter */ 14860 (iternextfunc)unicodeiter_next, /* tp_iternext */ 14861 unicodeiter_methods, /* tp_methods */ 14862 0, 14863}; 14864 14865static PyObject * 14866unicode_iter(PyObject *seq) 14867{ 14868 unicodeiterobject *it; 14869 14870 if (!PyUnicode_Check(seq)) { 14871 PyErr_BadInternalCall(); 14872 return NULL; 14873 } 14874 if (PyUnicode_READY(seq) == -1) 14875 return NULL; 14876 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 14877 if (it == NULL) 14878 return NULL; 14879 it->it_index = 0; 14880 Py_INCREF(seq); 14881 it->it_seq = seq; 14882 _PyObject_GC_TRACK(it); 14883 return (PyObject *)it; 14884} 14885 14886 14887size_t 14888Py_UNICODE_strlen(const Py_UNICODE *u) 14889{ 14890 int res = 0; 14891 while(*u++) 14892 res++; 14893 return res; 14894} 14895 14896Py_UNICODE* 14897Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 14898{ 14899 Py_UNICODE *u = s1; 14900 while ((*u++ = *s2++)); 14901 return s1; 14902} 14903 14904Py_UNICODE* 14905Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14906{ 14907 Py_UNICODE *u = s1; 14908 while ((*u++ = *s2++)) 14909 if (n-- == 0) 14910 break; 14911 return s1; 14912} 
14913 14914Py_UNICODE* 14915Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 14916{ 14917 Py_UNICODE *u1 = s1; 14918 u1 += Py_UNICODE_strlen(u1); 14919 Py_UNICODE_strcpy(u1, s2); 14920 return s1; 14921} 14922 14923int 14924Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 14925{ 14926 while (*s1 && *s2 && *s1 == *s2) 14927 s1++, s2++; 14928 if (*s1 && *s2) 14929 return (*s1 < *s2) ? -1 : +1; 14930 if (*s1) 14931 return 1; 14932 if (*s2) 14933 return -1; 14934 return 0; 14935} 14936 14937int 14938Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14939{ 14940 register Py_UNICODE u1, u2; 14941 for (; n != 0; n--) { 14942 u1 = *s1; 14943 u2 = *s2; 14944 if (u1 != u2) 14945 return (u1 < u2) ? -1 : +1; 14946 if (u1 == '\0') 14947 return 0; 14948 s1++; 14949 s2++; 14950 } 14951 return 0; 14952} 14953 14954Py_UNICODE* 14955Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 14956{ 14957 const Py_UNICODE *p; 14958 for (p = s; *p; p++) 14959 if (*p == c) 14960 return (Py_UNICODE*)p; 14961 return NULL; 14962} 14963 14964Py_UNICODE* 14965Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 14966{ 14967 const Py_UNICODE *p; 14968 p = s + Py_UNICODE_strlen(s); 14969 while (p != s) { 14970 p--; 14971 if (*p == c) 14972 return (Py_UNICODE*)p; 14973 } 14974 return NULL; 14975} 14976 14977Py_UNICODE* 14978PyUnicode_AsUnicodeCopy(PyObject *unicode) 14979{ 14980 Py_UNICODE *u, *copy; 14981 Py_ssize_t len, size; 14982 14983 if (!PyUnicode_Check(unicode)) { 14984 PyErr_BadArgument(); 14985 return NULL; 14986 } 14987 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 14988 if (u == NULL) 14989 return NULL; 14990 /* Ensure we won't overflow the size. 
*/ 14991 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 14992 PyErr_NoMemory(); 14993 return NULL; 14994 } 14995 size = len + 1; /* copy the null character */ 14996 size *= sizeof(Py_UNICODE); 14997 copy = PyMem_Malloc(size); 14998 if (copy == NULL) { 14999 PyErr_NoMemory(); 15000 return NULL; 15001 } 15002 memcpy(copy, u, size); 15003 return copy; 15004} 15005 15006/* A _string module, to export formatter_parser and formatter_field_name_split 15007 to the string.Formatter class implemented in Python. */ 15008 15009static PyMethodDef _string_methods[] = { 15010 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 15011 METH_O, PyDoc_STR("split the argument as a field name")}, 15012 {"formatter_parser", (PyCFunction) formatter_parser, 15013 METH_O, PyDoc_STR("parse the argument as a format string")}, 15014 {NULL, NULL} 15015}; 15016 15017static struct PyModuleDef _string_module = { 15018 PyModuleDef_HEAD_INIT, 15019 "_string", 15020 PyDoc_STR("string helper module"), 15021 0, 15022 _string_methods, 15023 NULL, 15024 NULL, 15025 NULL, 15026 NULL 15027}; 15028 15029PyMODINIT_FUNC 15030PyInit__string(void) 15031{ 15032 return PyModule_Create(&_string_module); 15033} 15034 15035 15036#ifdef __cplusplus 15037} 15038#endif 15039