unicodeobject.c revision 0d4df752acbaf14164f1e8b2b95ebe3fe288bb82
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/*[clinic input] 51class str "PyUnicodeObject *" "&PyUnicode_Type" 52[clinic start generated code]*/ 53/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/ 54 55/* --- Globals ------------------------------------------------------------ 56 57NOTE: In the interpreter's initialization phase, some globals are currently 58 initialized dynamically as needed. In the process Unicode objects may 59 be created before the Unicode type is ready. 60 61*/ 62 63 64#ifdef __cplusplus 65extern "C" { 66#endif 67 68/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 69#define MAX_UNICODE 0x10ffff 70 71#ifdef Py_DEBUG 72# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 73#else 74# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 75#endif 76 77#define _PyUnicode_UTF8(op) \ 78 (((PyCompactUnicodeObject*)(op))->utf8) 79#define PyUnicode_UTF8(op) \ 80 (assert(_PyUnicode_CHECK(op)), \ 81 assert(PyUnicode_IS_READY(op)), \ 82 PyUnicode_IS_COMPACT_ASCII(op) ? \ 83 ((char*)((PyASCIIObject*)(op) + 1)) : \ 84 _PyUnicode_UTF8(op)) 85#define _PyUnicode_UTF8_LENGTH(op) \ 86 (((PyCompactUnicodeObject*)(op))->utf8_length) 87#define PyUnicode_UTF8_LENGTH(op) \ 88 (assert(_PyUnicode_CHECK(op)), \ 89 assert(PyUnicode_IS_READY(op)), \ 90 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 91 ((PyASCIIObject*)(op))->length : \ 92 _PyUnicode_UTF8_LENGTH(op)) 93#define _PyUnicode_WSTR(op) \ 94 (((PyASCIIObject*)(op))->wstr) 95#define _PyUnicode_WSTR_LENGTH(op) \ 96 (((PyCompactUnicodeObject*)(op))->wstr_length) 97#define _PyUnicode_LENGTH(op) \ 98 (((PyASCIIObject *)(op))->length) 99#define _PyUnicode_STATE(op) \ 100 (((PyASCIIObject *)(op))->state) 101#define _PyUnicode_HASH(op) \ 102 (((PyASCIIObject *)(op))->hash) 103#define _PyUnicode_KIND(op) \ 104 (assert(_PyUnicode_CHECK(op)), \ 105 ((PyASCIIObject *)(op))->state.kind) 106#define _PyUnicode_GET_LENGTH(op) \ 107 (assert(_PyUnicode_CHECK(op)), \ 108 ((PyASCIIObject *)(op))->length) 109#define _PyUnicode_DATA_ANY(op) \ 110 (((PyUnicodeObject*)(op))->data.any) 111 112#undef PyUnicode_READY 113#define PyUnicode_READY(op) \ 114 (assert(_PyUnicode_CHECK(op)), \ 115 (PyUnicode_IS_READY(op) ? \ 116 0 : \ 117 _PyUnicode_Ready(op))) 118 119#define _PyUnicode_SHARE_UTF8(op) \ 120 (assert(_PyUnicode_CHECK(op)), \ 121 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 122 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 123#define _PyUnicode_SHARE_WSTR(op) \ 124 (assert(_PyUnicode_CHECK(op)), \ 125 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 126 127/* true if the Unicode object has an allocated UTF-8 memory block 128 (not shared with other data) */ 129#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 130 ((!PyUnicode_IS_COMPACT_ASCII(op) \ 131 && _PyUnicode_UTF8(op) \ 132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 133 134/* true if the Unicode object has an allocated wstr memory block 135 (not shared with other data) */ 136#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 137 ((_PyUnicode_WSTR(op) && \ 138 (!PyUnicode_IS_READY(op) || \ 139 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 140 141/* Generic helper macro to convert characters of different types. 142 from_type and to_type have to be valid type names, begin and end 143 are pointers to the source characters which should be of type 144 "from_type *". 
   to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each character is converted with a plain cast to to_type.  The first
   loop handles four characters per iteration; the second loop copies
   whatever is left over. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty = NULL;

/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails, unicode_empty
   is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned value is NULL when the empty string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)

/* Forward declaration */
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings.
*/ 202static _Py_Identifier *static_strings = NULL; 203 204/* Single character Unicode strings in the Latin-1 range are being 205 shared as well. */ 206static PyObject *unicode_latin1[256] = {NULL}; 207 208/* Fast detection of the most frequent whitespace characters */ 209const unsigned char _Py_ascii_whitespace[] = { 210 0, 0, 0, 0, 0, 0, 0, 0, 211/* case 0x0009: * CHARACTER TABULATION */ 212/* case 0x000A: * LINE FEED */ 213/* case 0x000B: * LINE TABULATION */ 214/* case 0x000C: * FORM FEED */ 215/* case 0x000D: * CARRIAGE RETURN */ 216 0, 1, 1, 1, 1, 1, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218/* case 0x001C: * FILE SEPARATOR */ 219/* case 0x001D: * GROUP SEPARATOR */ 220/* case 0x001E: * RECORD SEPARATOR */ 221/* case 0x001F: * UNIT SEPARATOR */ 222 0, 0, 0, 0, 1, 1, 1, 1, 223/* case 0x0020: * SPACE */ 224 1, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 229 0, 0, 0, 0, 0, 0, 0, 0, 230 0, 0, 0, 0, 0, 0, 0, 0, 231 0, 0, 0, 0, 0, 0, 0, 0, 232 0, 0, 0, 0, 0, 0, 0, 0, 233 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 236 0, 0, 0, 0, 0, 0, 0, 0 237}; 238 239/* forward */ 240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 241static PyObject* get_latin1_char(unsigned char ch); 242static int unicode_modifiable(PyObject *unicode); 243 244 245static PyObject * 246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size); 247static PyObject * 248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 249static PyObject * 250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 251 252static PyObject * 253unicode_encode_call_errorhandler(const char *errors, 254 PyObject **errorHandler,const char *encoding, const char *reason, 255 PyObject *unicode, PyObject **exceptionObject, 256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 257 258static void 259raise_encode_exception(PyObject **exceptionObject, 260 const char *encoding, 261 PyObject *unicode, 262 
Py_ssize_t startpos, Py_ssize_t endpos, 263 const char *reason); 264 265/* Same for linebreaks */ 266static unsigned char ascii_linebreak[] = { 267 0, 0, 0, 0, 0, 0, 0, 0, 268/* 0x000A, * LINE FEED */ 269/* 0x000B, * LINE TABULATION */ 270/* 0x000C, * FORM FEED */ 271/* 0x000D, * CARRIAGE RETURN */ 272 0, 0, 1, 1, 1, 1, 0, 0, 273 0, 0, 0, 0, 0, 0, 0, 0, 274/* 0x001C, * FILE SEPARATOR */ 275/* 0x001D, * GROUP SEPARATOR */ 276/* 0x001E, * RECORD SEPARATOR */ 277 0, 0, 0, 0, 1, 1, 1, 0, 278 0, 0, 0, 0, 0, 0, 0, 0, 279 0, 0, 0, 0, 0, 0, 0, 0, 280 0, 0, 0, 0, 0, 0, 0, 0, 281 0, 0, 0, 0, 0, 0, 0, 0, 282 283 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0, 285 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 287 0, 0, 0, 0, 0, 0, 0, 0, 288 0, 0, 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0, 0, 0, 290 0, 0, 0, 0, 0, 0, 0, 0 291}; 292 293#include "clinic/unicodeobject.c.h" 294 295/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 296 This function is kept for backward compatibility with the old API. */ 297Py_UNICODE 298PyUnicode_GetMax(void) 299{ 300#ifdef Py_UNICODE_WIDE 301 return 0x10FFFF; 302#else 303 /* This is actually an illegal character, so it should 304 not be passed to unichr. 
*/ 305 return 0xFFFF; 306#endif 307} 308 309#ifdef Py_DEBUG 310int 311_PyUnicode_CheckConsistency(PyObject *op, int check_content) 312{ 313 PyASCIIObject *ascii; 314 unsigned int kind; 315 316 assert(PyUnicode_Check(op)); 317 318 ascii = (PyASCIIObject *)op; 319 kind = ascii->state.kind; 320 321 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 322 assert(kind == PyUnicode_1BYTE_KIND); 323 assert(ascii->state.ready == 1); 324 } 325 else { 326 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 327 void *data; 328 329 if (ascii->state.compact == 1) { 330 data = compact + 1; 331 assert(kind == PyUnicode_1BYTE_KIND 332 || kind == PyUnicode_2BYTE_KIND 333 || kind == PyUnicode_4BYTE_KIND); 334 assert(ascii->state.ascii == 0); 335 assert(ascii->state.ready == 1); 336 assert (compact->utf8 != data); 337 } 338 else { 339 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 340 341 data = unicode->data.any; 342 if (kind == PyUnicode_WCHAR_KIND) { 343 assert(ascii->length == 0); 344 assert(ascii->hash == -1); 345 assert(ascii->state.compact == 0); 346 assert(ascii->state.ascii == 0); 347 assert(ascii->state.ready == 0); 348 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 349 assert(ascii->wstr != NULL); 350 assert(data == NULL); 351 assert(compact->utf8 == NULL); 352 } 353 else { 354 assert(kind == PyUnicode_1BYTE_KIND 355 || kind == PyUnicode_2BYTE_KIND 356 || kind == PyUnicode_4BYTE_KIND); 357 assert(ascii->state.compact == 0); 358 assert(ascii->state.ready == 1); 359 assert(data != NULL); 360 if (ascii->state.ascii) { 361 assert (compact->utf8 == data); 362 assert (compact->utf8_length == ascii->length); 363 } 364 else 365 assert (compact->utf8 != data); 366 } 367 } 368 if (kind != PyUnicode_WCHAR_KIND) { 369 if ( 370#if SIZEOF_WCHAR_T == 2 371 kind == PyUnicode_2BYTE_KIND 372#else 373 kind == PyUnicode_4BYTE_KIND 374#endif 375 ) 376 { 377 assert(ascii->wstr == data); 378 assert(compact->wstr_length == ascii->length); 379 } else 380 
assert(ascii->wstr != data); 381 } 382 383 if (compact->utf8 == NULL) 384 assert(compact->utf8_length == 0); 385 if (ascii->wstr == NULL) 386 assert(compact->wstr_length == 0); 387 } 388 /* check that the best kind is used */ 389 if (check_content && kind != PyUnicode_WCHAR_KIND) 390 { 391 Py_ssize_t i; 392 Py_UCS4 maxchar = 0; 393 void *data; 394 Py_UCS4 ch; 395 396 data = PyUnicode_DATA(ascii); 397 for (i=0; i < ascii->length; i++) 398 { 399 ch = PyUnicode_READ(kind, data, i); 400 if (ch > maxchar) 401 maxchar = ch; 402 } 403 if (kind == PyUnicode_1BYTE_KIND) { 404 if (ascii->state.ascii == 0) { 405 assert(maxchar >= 128); 406 assert(maxchar <= 255); 407 } 408 else 409 assert(maxchar < 128); 410 } 411 else if (kind == PyUnicode_2BYTE_KIND) { 412 assert(maxchar >= 0x100); 413 assert(maxchar <= 0xFFFF); 414 } 415 else { 416 assert(maxchar >= 0x10000); 417 assert(maxchar <= MAX_UNICODE); 418 } 419 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 420 } 421 return 1; 422} 423#endif 424 425static PyObject* 426unicode_result_wchar(PyObject *unicode) 427{ 428#ifndef Py_DEBUG 429 Py_ssize_t len; 430 431 len = _PyUnicode_WSTR_LENGTH(unicode); 432 if (len == 0) { 433 Py_DECREF(unicode); 434 _Py_RETURN_UNICODE_EMPTY(); 435 } 436 437 if (len == 1) { 438 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 439 if ((Py_UCS4)ch < 256) { 440 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 441 Py_DECREF(unicode); 442 return latin1_char; 443 } 444 } 445 446 if (_PyUnicode_Ready(unicode) < 0) { 447 Py_DECREF(unicode); 448 return NULL; 449 } 450#else 451 assert(Py_REFCNT(unicode) == 1); 452 453 /* don't make the result ready in debug mode to ensure that the caller 454 makes the string ready before using it */ 455 assert(_PyUnicode_CheckConsistency(unicode, 1)); 456#endif 457 return unicode; 458} 459 460static PyObject* 461unicode_result_ready(PyObject *unicode) 462{ 463 Py_ssize_t length; 464 465 length = PyUnicode_GET_LENGTH(unicode); 466 if (length == 0) { 467 if (unicode 
!= unicode_empty) { 468 Py_DECREF(unicode); 469 _Py_RETURN_UNICODE_EMPTY(); 470 } 471 return unicode_empty; 472 } 473 474 if (length == 1) { 475 void *data = PyUnicode_DATA(unicode); 476 int kind = PyUnicode_KIND(unicode); 477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 478 if (ch < 256) { 479 PyObject *latin1_char = unicode_latin1[ch]; 480 if (latin1_char != NULL) { 481 if (unicode != latin1_char) { 482 Py_INCREF(latin1_char); 483 Py_DECREF(unicode); 484 } 485 return latin1_char; 486 } 487 else { 488 assert(_PyUnicode_CheckConsistency(unicode, 1)); 489 Py_INCREF(unicode); 490 unicode_latin1[ch] = unicode; 491 return unicode; 492 } 493 } 494 } 495 496 assert(_PyUnicode_CheckConsistency(unicode, 1)); 497 return unicode; 498} 499 500static PyObject* 501unicode_result(PyObject *unicode) 502{ 503 assert(_PyUnicode_CHECK(unicode)); 504 if (PyUnicode_IS_READY(unicode)) 505 return unicode_result_ready(unicode); 506 else 507 return unicode_result_wchar(unicode); 508} 509 510static PyObject* 511unicode_result_unchanged(PyObject *unicode) 512{ 513 if (PyUnicode_CheckExact(unicode)) { 514 if (PyUnicode_READY(unicode) == -1) 515 return NULL; 516 Py_INCREF(unicode); 517 return unicode; 518 } 519 else 520 /* Subtype -- return genuine unicode string with the same value. */ 521 return _PyUnicode_Copy(unicode); 522} 523 524/* --- Bloom Filters ----------------------------------------------------- */ 525 526/* stuff to implement simple "bloom filters" for Unicode characters. 527 to keep things simple, we use a single bitmask, using the least 5 528 bits from each unicode characters as the bit index. 
*/ 529 530/* the linebreak mask is set up by Unicode_Init below */ 531 532#if LONG_BIT >= 128 533#define BLOOM_WIDTH 128 534#elif LONG_BIT >= 64 535#define BLOOM_WIDTH 64 536#elif LONG_BIT >= 32 537#define BLOOM_WIDTH 32 538#else 539#error "LONG_BIT is smaller than 32" 540#endif 541 542#define BLOOM_MASK unsigned long 543 544static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 545 546#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 547 548#define BLOOM_LINEBREAK(ch) \ 549 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 550 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 551 552Py_LOCAL_INLINE(BLOOM_MASK) 553make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 554{ 555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 556 do { \ 557 TYPE *data = (TYPE *)PTR; \ 558 TYPE *end = data + LEN; \ 559 Py_UCS4 ch; \ 560 for (; data != end; data++) { \ 561 ch = *data; \ 562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 563 } \ 564 break; \ 565 } while (0) 566 567 /* calculate simple bloom-style bitmask for a given unicode string */ 568 569 BLOOM_MASK mask; 570 571 mask = 0; 572 switch (kind) { 573 case PyUnicode_1BYTE_KIND: 574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 575 break; 576 case PyUnicode_2BYTE_KIND: 577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 578 break; 579 case PyUnicode_4BYTE_KIND: 580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 581 break; 582 default: 583 assert(0); 584 } 585 return mask; 586 587#undef BLOOM_UPDATE 588} 589 590/* Compilation of templated routines */ 591 592#include "stringlib/asciilib.h" 593#include "stringlib/fastsearch.h" 594#include "stringlib/partition.h" 595#include "stringlib/split.h" 596#include "stringlib/count.h" 597#include "stringlib/find.h" 598#include "stringlib/find_max_char.h" 599#include "stringlib/localeutil.h" 600#include "stringlib/undef.h" 601 602#include "stringlib/ucs1lib.h" 603#include "stringlib/fastsearch.h" 604#include "stringlib/partition.h" 605#include "stringlib/split.h" 606#include "stringlib/count.h" 
607#include "stringlib/find.h" 608#include "stringlib/replace.h" 609#include "stringlib/find_max_char.h" 610#include "stringlib/localeutil.h" 611#include "stringlib/undef.h" 612 613#include "stringlib/ucs2lib.h" 614#include "stringlib/fastsearch.h" 615#include "stringlib/partition.h" 616#include "stringlib/split.h" 617#include "stringlib/count.h" 618#include "stringlib/find.h" 619#include "stringlib/replace.h" 620#include "stringlib/find_max_char.h" 621#include "stringlib/localeutil.h" 622#include "stringlib/undef.h" 623 624#include "stringlib/ucs4lib.h" 625#include "stringlib/fastsearch.h" 626#include "stringlib/partition.h" 627#include "stringlib/split.h" 628#include "stringlib/count.h" 629#include "stringlib/find.h" 630#include "stringlib/replace.h" 631#include "stringlib/find_max_char.h" 632#include "stringlib/localeutil.h" 633#include "stringlib/undef.h" 634 635#include "stringlib/unicodedefs.h" 636#include "stringlib/fastsearch.h" 637#include "stringlib/count.h" 638#include "stringlib/find.h" 639#include "stringlib/undef.h" 640 641/* --- Unicode Object ----------------------------------------------------- */ 642 643static PyObject * 644fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 645 646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind, 647 Py_ssize_t size, Py_UCS4 ch, 648 int direction) 649{ 650 int mode = (direction == 1) ? 
FAST_SEARCH : FAST_RSEARCH; 651 652 switch (kind) { 653 case PyUnicode_1BYTE_KIND: 654 { 655 Py_UCS1 ch1 = (Py_UCS1) ch; 656 if (ch1 == ch) 657 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 658 else 659 return -1; 660 } 661 case PyUnicode_2BYTE_KIND: 662 { 663 Py_UCS2 ch2 = (Py_UCS2) ch; 664 if (ch2 == ch) 665 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 666 else 667 return -1; 668 } 669 case PyUnicode_4BYTE_KIND: 670 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 671 default: 672 assert(0); 673 return -1; 674 } 675} 676 677#ifdef Py_DEBUG 678/* Fill the data of an Unicode string with invalid characters to detect bugs 679 earlier. 680 681 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 682 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 683 invalid character in Unicode 6.0. */ 684static void 685unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 686{ 687 int kind = PyUnicode_KIND(unicode); 688 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 689 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 690 if (length <= old_length) 691 return; 692 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 693} 694#endif 695 696static PyObject* 697resize_compact(PyObject *unicode, Py_ssize_t length) 698{ 699 Py_ssize_t char_size; 700 Py_ssize_t struct_size; 701 Py_ssize_t new_size; 702 int share_wstr; 703 PyObject *new_unicode; 704#ifdef Py_DEBUG 705 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 706#endif 707 708 assert(unicode_modifiable(unicode)); 709 assert(PyUnicode_IS_READY(unicode)); 710 assert(PyUnicode_IS_COMPACT(unicode)); 711 712 char_size = PyUnicode_KIND(unicode); 713 if (PyUnicode_IS_ASCII(unicode)) 714 struct_size = sizeof(PyASCIIObject); 715 else 716 struct_size = sizeof(PyCompactUnicodeObject); 717 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 718 719 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 720 
PyErr_NoMemory(); 721 return NULL; 722 } 723 new_size = (struct_size + (length + 1) * char_size); 724 725 _Py_DEC_REFTOTAL; 726 _Py_ForgetReference(unicode); 727 728 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size); 729 if (new_unicode == NULL) { 730 _Py_NewReference(unicode); 731 PyErr_NoMemory(); 732 return NULL; 733 } 734 unicode = new_unicode; 735 _Py_NewReference(unicode); 736 737 _PyUnicode_LENGTH(unicode) = length; 738 if (share_wstr) { 739 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 740 if (!PyUnicode_IS_ASCII(unicode)) 741 _PyUnicode_WSTR_LENGTH(unicode) = length; 742 } 743 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 744 PyObject_DEL(_PyUnicode_WSTR(unicode)); 745 _PyUnicode_WSTR(unicode) = NULL; 746 } 747#ifdef Py_DEBUG 748 unicode_fill_invalid(unicode, old_length); 749#endif 750 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 751 length, 0); 752 assert(_PyUnicode_CheckConsistency(unicode, 0)); 753 return unicode; 754} 755 756static int 757resize_inplace(PyObject *unicode, Py_ssize_t length) 758{ 759 wchar_t *wstr; 760 Py_ssize_t new_size; 761 assert(!PyUnicode_IS_COMPACT(unicode)); 762 assert(Py_REFCNT(unicode) == 1); 763 764 if (PyUnicode_IS_READY(unicode)) { 765 Py_ssize_t char_size; 766 int share_wstr, share_utf8; 767 void *data; 768#ifdef Py_DEBUG 769 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 770#endif 771 772 data = _PyUnicode_DATA_ANY(unicode); 773 char_size = PyUnicode_KIND(unicode); 774 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 775 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 776 777 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 778 PyErr_NoMemory(); 779 return -1; 780 } 781 new_size = (length + 1) * char_size; 782 783 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 784 { 785 PyObject_DEL(_PyUnicode_UTF8(unicode)); 786 _PyUnicode_UTF8(unicode) = NULL; 787 _PyUnicode_UTF8_LENGTH(unicode) = 0; 788 } 789 790 data = (PyObject *)PyObject_REALLOC(data, new_size); 791 if (data == NULL) { 
792 PyErr_NoMemory(); 793 return -1; 794 } 795 _PyUnicode_DATA_ANY(unicode) = data; 796 if (share_wstr) { 797 _PyUnicode_WSTR(unicode) = data; 798 _PyUnicode_WSTR_LENGTH(unicode) = length; 799 } 800 if (share_utf8) { 801 _PyUnicode_UTF8(unicode) = data; 802 _PyUnicode_UTF8_LENGTH(unicode) = length; 803 } 804 _PyUnicode_LENGTH(unicode) = length; 805 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 806#ifdef Py_DEBUG 807 unicode_fill_invalid(unicode, old_length); 808#endif 809 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 810 assert(_PyUnicode_CheckConsistency(unicode, 0)); 811 return 0; 812 } 813 } 814 assert(_PyUnicode_WSTR(unicode) != NULL); 815 816 /* check for integer overflow */ 817 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) { 818 PyErr_NoMemory(); 819 return -1; 820 } 821 new_size = sizeof(wchar_t) * (length + 1); 822 wstr = _PyUnicode_WSTR(unicode); 823 wstr = PyObject_REALLOC(wstr, new_size); 824 if (!wstr) { 825 PyErr_NoMemory(); 826 return -1; 827 } 828 _PyUnicode_WSTR(unicode) = wstr; 829 _PyUnicode_WSTR(unicode)[length] = 0; 830 _PyUnicode_WSTR_LENGTH(unicode) = length; 831 assert(_PyUnicode_CheckConsistency(unicode, 0)); 832 return 0; 833} 834 835static PyObject* 836resize_copy(PyObject *unicode, Py_ssize_t length) 837{ 838 Py_ssize_t copy_length; 839 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 840 PyObject *copy; 841 842 if (PyUnicode_READY(unicode) == -1) 843 return NULL; 844 845 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 846 if (copy == NULL) 847 return NULL; 848 849 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 850 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 851 return copy; 852 } 853 else { 854 PyObject *w; 855 856 w = (PyObject*)_PyUnicode_New(length); 857 if (w == NULL) 858 return NULL; 859 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 860 copy_length = Py_MIN(copy_length, length); 861 Py_MEMCPY(_PyUnicode_WSTR(w), 
_PyUnicode_WSTR(unicode), 862 copy_length * sizeof(wchar_t)); 863 return w; 864 } 865} 866 867/* We allocate one more byte to make sure the string is 868 Ux0000 terminated; some code (e.g. new_identifier) 869 relies on that. 870 871 XXX This allocator could further be enhanced by assuring that the 872 free list never reduces its size below 1. 873 874*/ 875 876static PyUnicodeObject * 877_PyUnicode_New(Py_ssize_t length) 878{ 879 PyUnicodeObject *unicode; 880 size_t new_size; 881 882 /* Optimization for empty strings */ 883 if (length == 0 && unicode_empty != NULL) { 884 Py_INCREF(unicode_empty); 885 return (PyUnicodeObject*)unicode_empty; 886 } 887 888 /* Ensure we won't overflow the size. */ 889 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 890 return (PyUnicodeObject *)PyErr_NoMemory(); 891 } 892 if (length < 0) { 893 PyErr_SetString(PyExc_SystemError, 894 "Negative size passed to _PyUnicode_New"); 895 return NULL; 896 } 897 898 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 899 if (unicode == NULL) 900 return NULL; 901 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 902 903 _PyUnicode_WSTR_LENGTH(unicode) = length; 904 _PyUnicode_HASH(unicode) = -1; 905 _PyUnicode_STATE(unicode).interned = 0; 906 _PyUnicode_STATE(unicode).kind = 0; 907 _PyUnicode_STATE(unicode).compact = 0; 908 _PyUnicode_STATE(unicode).ready = 0; 909 _PyUnicode_STATE(unicode).ascii = 0; 910 _PyUnicode_DATA_ANY(unicode) = NULL; 911 _PyUnicode_LENGTH(unicode) = 0; 912 _PyUnicode_UTF8(unicode) = NULL; 913 _PyUnicode_UTF8_LENGTH(unicode) = 0; 914 915 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 916 if (!_PyUnicode_WSTR(unicode)) { 917 Py_DECREF(unicode); 918 PyErr_NoMemory(); 919 return NULL; 920 } 921 922 /* Initialize the first element to guard against cases where 923 * the caller fails before initializing str -- unicode_resize() 924 * reads str[0], and the Keep-Alive optimization can keep memory 925 * allocated for str alive 
across a call to unicode_dealloc(unicode). 926 * We don't want unicode_resize to read uninitialized memory in 927 * that case. 928 */ 929 _PyUnicode_WSTR(unicode)[0] = 0; 930 _PyUnicode_WSTR(unicode)[length] = 0; 931 932 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 933 return unicode; 934} 935 936static const char* 937unicode_kind_name(PyObject *unicode) 938{ 939 /* don't check consistency: unicode_kind_name() is called from 940 _PyUnicode_Dump() */ 941 if (!PyUnicode_IS_COMPACT(unicode)) 942 { 943 if (!PyUnicode_IS_READY(unicode)) 944 return "wstr"; 945 switch (PyUnicode_KIND(unicode)) 946 { 947 case PyUnicode_1BYTE_KIND: 948 if (PyUnicode_IS_ASCII(unicode)) 949 return "legacy ascii"; 950 else 951 return "legacy latin1"; 952 case PyUnicode_2BYTE_KIND: 953 return "legacy UCS2"; 954 case PyUnicode_4BYTE_KIND: 955 return "legacy UCS4"; 956 default: 957 return "<legacy invalid kind>"; 958 } 959 } 960 assert(PyUnicode_IS_READY(unicode)); 961 switch (PyUnicode_KIND(unicode)) { 962 case PyUnicode_1BYTE_KIND: 963 if (PyUnicode_IS_ASCII(unicode)) 964 return "ascii"; 965 else 966 return "latin1"; 967 case PyUnicode_2BYTE_KIND: 968 return "UCS2"; 969 case PyUnicode_4BYTE_KIND: 970 return "UCS4"; 971 default: 972 return "<invalid compact kind>"; 973 } 974} 975 976#ifdef Py_DEBUG 977/* Functions wrapping macros for use in debugger */ 978char *_PyUnicode_utf8(void *unicode){ 979 return PyUnicode_UTF8(unicode); 980} 981 982void *_PyUnicode_compact_data(void *unicode) { 983 return _PyUnicode_COMPACT_DATA(unicode); 984} 985void *_PyUnicode_data(void *unicode){ 986 printf("obj %p\n", unicode); 987 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 988 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 989 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 990 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 991 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 992 return 
PyUnicode_DATA(unicode); 993} 994 995void 996_PyUnicode_Dump(PyObject *op) 997{ 998 PyASCIIObject *ascii = (PyASCIIObject *)op; 999 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 1000 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 1001 void *data; 1002 1003 if (ascii->state.compact) 1004 { 1005 if (ascii->state.ascii) 1006 data = (ascii + 1); 1007 else 1008 data = (compact + 1); 1009 } 1010 else 1011 data = unicode->data.any; 1012 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", 1013 unicode_kind_name(op), ascii->length); 1014 1015 if (ascii->wstr == data) 1016 printf("shared "); 1017 printf("wstr=%p", ascii->wstr); 1018 1019 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 1020 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); 1021 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 1022 printf("shared "); 1023 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", 1024 compact->utf8, compact->utf8_length); 1025 } 1026 printf(", data=%p\n", data); 1027} 1028#endif 1029 1030PyObject * 1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 1032{ 1033 PyObject *obj; 1034 PyCompactUnicodeObject *unicode; 1035 void *data; 1036 enum PyUnicode_Kind kind; 1037 int is_sharing, is_ascii; 1038 Py_ssize_t char_size; 1039 Py_ssize_t struct_size; 1040 1041 /* Optimization for empty strings */ 1042 if (size == 0 && unicode_empty != NULL) { 1043 Py_INCREF(unicode_empty); 1044 return unicode_empty; 1045 } 1046 1047 is_ascii = 0; 1048 is_sharing = 0; 1049 struct_size = sizeof(PyCompactUnicodeObject); 1050 if (maxchar < 128) { 1051 kind = PyUnicode_1BYTE_KIND; 1052 char_size = 1; 1053 is_ascii = 1; 1054 struct_size = sizeof(PyASCIIObject); 1055 } 1056 else if (maxchar < 256) { 1057 kind = PyUnicode_1BYTE_KIND; 1058 char_size = 1; 1059 } 1060 else if (maxchar < 65536) { 1061 kind = PyUnicode_2BYTE_KIND; 1062 char_size = 2; 1063 if (sizeof(wchar_t) == 2) 1064 is_sharing = 1; 1065 } 1066 else { 1067 if (maxchar > MAX_UNICODE) { 1068 
PyErr_SetString(PyExc_SystemError, 1069 "invalid maximum character passed to PyUnicode_New"); 1070 return NULL; 1071 } 1072 kind = PyUnicode_4BYTE_KIND; 1073 char_size = 4; 1074 if (sizeof(wchar_t) == 4) 1075 is_sharing = 1; 1076 } 1077 1078 /* Ensure we won't overflow the size. */ 1079 if (size < 0) { 1080 PyErr_SetString(PyExc_SystemError, 1081 "Negative size passed to PyUnicode_New"); 1082 return NULL; 1083 } 1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1085 return PyErr_NoMemory(); 1086 1087 /* Duplicated allocation code from _PyObject_New() instead of a call to 1088 * PyObject_New() so we are able to allocate space for the object and 1089 * it's data buffer. 1090 */ 1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1092 if (obj == NULL) 1093 return PyErr_NoMemory(); 1094 obj = PyObject_INIT(obj, &PyUnicode_Type); 1095 if (obj == NULL) 1096 return NULL; 1097 1098 unicode = (PyCompactUnicodeObject *)obj; 1099 if (is_ascii) 1100 data = ((PyASCIIObject*)obj) + 1; 1101 else 1102 data = unicode + 1; 1103 _PyUnicode_LENGTH(unicode) = size; 1104 _PyUnicode_HASH(unicode) = -1; 1105 _PyUnicode_STATE(unicode).interned = 0; 1106 _PyUnicode_STATE(unicode).kind = kind; 1107 _PyUnicode_STATE(unicode).compact = 1; 1108 _PyUnicode_STATE(unicode).ready = 1; 1109 _PyUnicode_STATE(unicode).ascii = is_ascii; 1110 if (is_ascii) { 1111 ((char*)data)[size] = 0; 1112 _PyUnicode_WSTR(unicode) = NULL; 1113 } 1114 else if (kind == PyUnicode_1BYTE_KIND) { 1115 ((char*)data)[size] = 0; 1116 _PyUnicode_WSTR(unicode) = NULL; 1117 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1118 unicode->utf8 = NULL; 1119 unicode->utf8_length = 0; 1120 } 1121 else { 1122 unicode->utf8 = NULL; 1123 unicode->utf8_length = 0; 1124 if (kind == PyUnicode_2BYTE_KIND) 1125 ((Py_UCS2*)data)[size] = 0; 1126 else /* kind == PyUnicode_4BYTE_KIND */ 1127 ((Py_UCS4*)data)[size] = 0; 1128 if (is_sharing) { 1129 _PyUnicode_WSTR_LENGTH(unicode) = size; 1130 _PyUnicode_WSTR(unicode) 
= (wchar_t *)data; 1131 } 1132 else { 1133 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1134 _PyUnicode_WSTR(unicode) = NULL; 1135 } 1136 } 1137#ifdef Py_DEBUG 1138 unicode_fill_invalid((PyObject*)unicode, 0); 1139#endif 1140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1141 return obj; 1142} 1143 1144#if SIZEOF_WCHAR_T == 2 1145/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1146 will decode surrogate pairs, the other conversions are implemented as macros 1147 for efficiency. 1148 1149 This function assumes that unicode can hold one more code point than wstr 1150 characters for a terminating null character. */ 1151static void 1152unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1153 PyObject *unicode) 1154{ 1155 const wchar_t *iter; 1156 Py_UCS4 *ucs4_out; 1157 1158 assert(unicode != NULL); 1159 assert(_PyUnicode_CHECK(unicode)); 1160 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1161 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1162 1163 for (iter = begin; iter < end; ) { 1164 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1165 _PyUnicode_GET_LENGTH(unicode))); 1166 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1167 && (iter+1) < end 1168 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1169 { 1170 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1171 iter += 2; 1172 } 1173 else { 1174 *ucs4_out++ = *iter; 1175 iter++; 1176 } 1177 } 1178 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1179 _PyUnicode_GET_LENGTH(unicode))); 1180 1181} 1182#endif 1183 1184static int 1185unicode_check_modifiable(PyObject *unicode) 1186{ 1187 if (!unicode_modifiable(unicode)) { 1188 PyErr_SetString(PyExc_SystemError, 1189 "Cannot modify a string currently used"); 1190 return -1; 1191 } 1192 return 0; 1193} 1194 1195static int 1196_copy_characters(PyObject *to, Py_ssize_t to_start, 1197 PyObject *from, Py_ssize_t from_start, 1198 Py_ssize_t how_many, int check_maxchar) 1199{ 1200 unsigned int from_kind, 
to_kind; 1201 void *from_data, *to_data; 1202 1203 assert(0 <= how_many); 1204 assert(0 <= from_start); 1205 assert(0 <= to_start); 1206 assert(PyUnicode_Check(from)); 1207 assert(PyUnicode_IS_READY(from)); 1208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1209 1210 assert(PyUnicode_Check(to)); 1211 assert(PyUnicode_IS_READY(to)); 1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1213 1214 if (how_many == 0) 1215 return 0; 1216 1217 from_kind = PyUnicode_KIND(from); 1218 from_data = PyUnicode_DATA(from); 1219 to_kind = PyUnicode_KIND(to); 1220 to_data = PyUnicode_DATA(to); 1221 1222#ifdef Py_DEBUG 1223 if (!check_maxchar 1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1225 { 1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1227 Py_UCS4 ch; 1228 Py_ssize_t i; 1229 for (i=0; i < how_many; i++) { 1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1231 assert(ch <= to_maxchar); 1232 } 1233 } 1234#endif 1235 1236 if (from_kind == to_kind) { 1237 if (check_maxchar 1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1239 { 1240 /* Writing Latin-1 characters into an ASCII string requires to 1241 check that all written characters are pure ASCII */ 1242 Py_UCS4 max_char; 1243 max_char = ucs1lib_find_max_char(from_data, 1244 (Py_UCS1*)from_data + how_many); 1245 if (max_char >= 128) 1246 return -1; 1247 } 1248 Py_MEMCPY((char*)to_data + to_kind * to_start, 1249 (char*)from_data + from_kind * from_start, 1250 to_kind * how_many); 1251 } 1252 else if (from_kind == PyUnicode_1BYTE_KIND 1253 && to_kind == PyUnicode_2BYTE_KIND) 1254 { 1255 _PyUnicode_CONVERT_BYTES( 1256 Py_UCS1, Py_UCS2, 1257 PyUnicode_1BYTE_DATA(from) + from_start, 1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1259 PyUnicode_2BYTE_DATA(to) + to_start 1260 ); 1261 } 1262 else if (from_kind == PyUnicode_1BYTE_KIND 1263 && to_kind == PyUnicode_4BYTE_KIND) 1264 { 1265 _PyUnicode_CONVERT_BYTES( 1266 Py_UCS1, Py_UCS4, 1267 
PyUnicode_1BYTE_DATA(from) + from_start, 1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1269 PyUnicode_4BYTE_DATA(to) + to_start 1270 ); 1271 } 1272 else if (from_kind == PyUnicode_2BYTE_KIND 1273 && to_kind == PyUnicode_4BYTE_KIND) 1274 { 1275 _PyUnicode_CONVERT_BYTES( 1276 Py_UCS2, Py_UCS4, 1277 PyUnicode_2BYTE_DATA(from) + from_start, 1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1279 PyUnicode_4BYTE_DATA(to) + to_start 1280 ); 1281 } 1282 else { 1283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1284 1285 if (!check_maxchar) { 1286 if (from_kind == PyUnicode_2BYTE_KIND 1287 && to_kind == PyUnicode_1BYTE_KIND) 1288 { 1289 _PyUnicode_CONVERT_BYTES( 1290 Py_UCS2, Py_UCS1, 1291 PyUnicode_2BYTE_DATA(from) + from_start, 1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1293 PyUnicode_1BYTE_DATA(to) + to_start 1294 ); 1295 } 1296 else if (from_kind == PyUnicode_4BYTE_KIND 1297 && to_kind == PyUnicode_1BYTE_KIND) 1298 { 1299 _PyUnicode_CONVERT_BYTES( 1300 Py_UCS4, Py_UCS1, 1301 PyUnicode_4BYTE_DATA(from) + from_start, 1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1303 PyUnicode_1BYTE_DATA(to) + to_start 1304 ); 1305 } 1306 else if (from_kind == PyUnicode_4BYTE_KIND 1307 && to_kind == PyUnicode_2BYTE_KIND) 1308 { 1309 _PyUnicode_CONVERT_BYTES( 1310 Py_UCS4, Py_UCS2, 1311 PyUnicode_4BYTE_DATA(from) + from_start, 1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1313 PyUnicode_2BYTE_DATA(to) + to_start 1314 ); 1315 } 1316 else { 1317 assert(0); 1318 return -1; 1319 } 1320 } 1321 else { 1322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1323 Py_UCS4 ch; 1324 Py_ssize_t i; 1325 1326 for (i=0; i < how_many; i++) { 1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1328 if (ch > to_maxchar) 1329 return -1; 1330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1331 } 1332 } 1333 } 1334 return 0; 1335} 1336 1337void 1338_PyUnicode_FastCopyCharacters( 1339 PyObject *to, Py_ssize_t 
to_start, 1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1341{ 1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1343} 1344 1345Py_ssize_t 1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1347 PyObject *from, Py_ssize_t from_start, 1348 Py_ssize_t how_many) 1349{ 1350 int err; 1351 1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1353 PyErr_BadInternalCall(); 1354 return -1; 1355 } 1356 1357 if (PyUnicode_READY(from) == -1) 1358 return -1; 1359 if (PyUnicode_READY(to) == -1) 1360 return -1; 1361 1362 if (from_start < 0) { 1363 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1364 return -1; 1365 } 1366 if (to_start < 0) { 1367 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1368 return -1; 1369 } 1370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1372 PyErr_Format(PyExc_SystemError, 1373 "Cannot write %zi characters at %zi " 1374 "in a string of %zi characters", 1375 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1376 return -1; 1377 } 1378 1379 if (how_many == 0) 1380 return 0; 1381 1382 if (unicode_check_modifiable(to)) 1383 return -1; 1384 1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1386 if (err) { 1387 PyErr_Format(PyExc_SystemError, 1388 "Cannot copy %s characters " 1389 "into a string of %s characters", 1390 unicode_kind_name(from), 1391 unicode_kind_name(to)); 1392 return -1; 1393 } 1394 return how_many; 1395} 1396 1397/* Find the maximum code point and count the number of surrogate pairs so a 1398 correct string length can be computed before converting a string to UCS4. 1399 This function counts single surrogates as a character and not as a pair. 1400 1401 Return 0 on success, or -1 on error. 
*/ 1402static int 1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1405{ 1406 const wchar_t *iter; 1407 Py_UCS4 ch; 1408 1409 assert(num_surrogates != NULL && maxchar != NULL); 1410 *num_surrogates = 0; 1411 *maxchar = 0; 1412 1413 for (iter = begin; iter < end; ) { 1414#if SIZEOF_WCHAR_T == 2 1415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1416 && (iter+1) < end 1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1418 { 1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1420 ++(*num_surrogates); 1421 iter += 2; 1422 } 1423 else 1424#endif 1425 { 1426 ch = *iter; 1427 iter++; 1428 } 1429 if (ch > *maxchar) { 1430 *maxchar = ch; 1431 if (*maxchar > MAX_UNICODE) { 1432 PyErr_Format(PyExc_ValueError, 1433 "character U+%x is not in range [U+0000; U+10ffff]", 1434 ch); 1435 return -1; 1436 } 1437 } 1438 } 1439 return 0; 1440} 1441 1442int 1443_PyUnicode_Ready(PyObject *unicode) 1444{ 1445 wchar_t *end; 1446 Py_UCS4 maxchar = 0; 1447 Py_ssize_t num_surrogates; 1448#if SIZEOF_WCHAR_T == 2 1449 Py_ssize_t length_wo_surrogates; 1450#endif 1451 1452 /* _PyUnicode_Ready() is only intended for old-style API usage where 1453 strings were created using _PyObject_New() and where no canonical 1454 representation (the str field) has been set yet aka strings 1455 which are not yet ready. 
*/ 1456 assert(_PyUnicode_CHECK(unicode)); 1457 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1458 assert(_PyUnicode_WSTR(unicode) != NULL); 1459 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1460 assert(_PyUnicode_UTF8(unicode) == NULL); 1461 /* Actually, it should neither be interned nor be anything else: */ 1462 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1463 1464 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1465 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1466 &maxchar, &num_surrogates) == -1) 1467 return -1; 1468 1469 if (maxchar < 256) { 1470 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1471 if (!_PyUnicode_DATA_ANY(unicode)) { 1472 PyErr_NoMemory(); 1473 return -1; 1474 } 1475 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1476 _PyUnicode_WSTR(unicode), end, 1477 PyUnicode_1BYTE_DATA(unicode)); 1478 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1479 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1480 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1481 if (maxchar < 128) { 1482 _PyUnicode_STATE(unicode).ascii = 1; 1483 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1484 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1485 } 1486 else { 1487 _PyUnicode_STATE(unicode).ascii = 0; 1488 _PyUnicode_UTF8(unicode) = NULL; 1489 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1490 } 1491 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1492 _PyUnicode_WSTR(unicode) = NULL; 1493 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1494 } 1495 /* In this case we might have to convert down from 4-byte native 1496 wchar_t to 2-byte unicode. */ 1497 else if (maxchar < 65536) { 1498 assert(num_surrogates == 0 && 1499 "FindMaxCharAndNumSurrogatePairs() messed up"); 1500 1501#if SIZEOF_WCHAR_T == 2 1502 /* We can share representations and are done. 
*/ 1503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1504 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1505 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1506 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1507 _PyUnicode_UTF8(unicode) = NULL; 1508 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1509#else 1510 /* sizeof(wchar_t) == 4 */ 1511 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1512 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1513 if (!_PyUnicode_DATA_ANY(unicode)) { 1514 PyErr_NoMemory(); 1515 return -1; 1516 } 1517 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1518 _PyUnicode_WSTR(unicode), end, 1519 PyUnicode_2BYTE_DATA(unicode)); 1520 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1521 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1522 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1523 _PyUnicode_UTF8(unicode) = NULL; 1524 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1525 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1526 _PyUnicode_WSTR(unicode) = NULL; 1527 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1528#endif 1529 } 1530 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1531 else { 1532#if SIZEOF_WCHAR_T == 2 1533 /* in case the native representation is 2-bytes, we need to allocate a 1534 new normalized 4-byte version. 
*/ 1535 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1536 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) { 1537 PyErr_NoMemory(); 1538 return -1; 1539 } 1540 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1541 if (!_PyUnicode_DATA_ANY(unicode)) { 1542 PyErr_NoMemory(); 1543 return -1; 1544 } 1545 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1546 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1547 _PyUnicode_UTF8(unicode) = NULL; 1548 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1549 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1550 _PyUnicode_STATE(unicode).ready = 1; 1551 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1552 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1553 _PyUnicode_WSTR(unicode) = NULL; 1554 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1555#else 1556 assert(num_surrogates == 0); 1557 1558 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1559 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1560 _PyUnicode_UTF8(unicode) = NULL; 1561 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1562 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1563#endif 1564 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1565 } 1566 _PyUnicode_STATE(unicode).ready = 1; 1567 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1568 return 0; 1569} 1570 1571static void 1572unicode_dealloc(PyObject *unicode) 1573{ 1574 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1575 case SSTATE_NOT_INTERNED: 1576 break; 1577 1578 case SSTATE_INTERNED_MORTAL: 1579 /* revive dead object temporarily for DelItem */ 1580 Py_REFCNT(unicode) = 3; 1581 if (PyDict_DelItem(interned, unicode) != 0) 1582 Py_FatalError( 1583 "deletion of interned string failed"); 1584 break; 1585 1586 case SSTATE_INTERNED_IMMORTAL: 1587 Py_FatalError("Immortal interned string died."); 1588 1589 default: 1590 Py_FatalError("Inconsistent interned string state."); 1591 } 1592 1593 if 
(_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1594 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1595 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1596 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1597 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1598 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1599 1600 Py_TYPE(unicode)->tp_free(unicode); 1601} 1602 1603#ifdef Py_DEBUG 1604static int 1605unicode_is_singleton(PyObject *unicode) 1606{ 1607 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1608 if (unicode == unicode_empty) 1609 return 1; 1610 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1611 { 1612 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1613 if (ch < 256 && unicode_latin1[ch] == unicode) 1614 return 1; 1615 } 1616 return 0; 1617} 1618#endif 1619 1620static int 1621unicode_modifiable(PyObject *unicode) 1622{ 1623 assert(_PyUnicode_CHECK(unicode)); 1624 if (Py_REFCNT(unicode) != 1) 1625 return 0; 1626 if (_PyUnicode_HASH(unicode) != -1) 1627 return 0; 1628 if (PyUnicode_CHECK_INTERNED(unicode)) 1629 return 0; 1630 if (!PyUnicode_CheckExact(unicode)) 1631 return 0; 1632#ifdef Py_DEBUG 1633 /* singleton refcount is greater than 1 */ 1634 assert(!unicode_is_singleton(unicode)); 1635#endif 1636 return 1; 1637} 1638 1639static int 1640unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1641{ 1642 PyObject *unicode; 1643 Py_ssize_t old_length; 1644 1645 assert(p_unicode != NULL); 1646 unicode = *p_unicode; 1647 1648 assert(unicode != NULL); 1649 assert(PyUnicode_Check(unicode)); 1650 assert(0 <= length); 1651 1652 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1653 old_length = PyUnicode_WSTR_LENGTH(unicode); 1654 else 1655 old_length = PyUnicode_GET_LENGTH(unicode); 1656 if (old_length == length) 1657 return 0; 1658 1659 if (length == 0) { 1660 _Py_INCREF_UNICODE_EMPTY(); 1661 if (!unicode_empty) 1662 return -1; 1663 Py_DECREF(*p_unicode); 1664 *p_unicode = unicode_empty; 1665 return 0; 1666 } 1667 1668 if 
(!unicode_modifiable(unicode)) { 1669 PyObject *copy = resize_copy(unicode, length); 1670 if (copy == NULL) 1671 return -1; 1672 Py_DECREF(*p_unicode); 1673 *p_unicode = copy; 1674 return 0; 1675 } 1676 1677 if (PyUnicode_IS_COMPACT(unicode)) { 1678 PyObject *new_unicode = resize_compact(unicode, length); 1679 if (new_unicode == NULL) 1680 return -1; 1681 *p_unicode = new_unicode; 1682 return 0; 1683 } 1684 return resize_inplace(unicode, length); 1685} 1686 1687int 1688PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1689{ 1690 PyObject *unicode; 1691 if (p_unicode == NULL) { 1692 PyErr_BadInternalCall(); 1693 return -1; 1694 } 1695 unicode = *p_unicode; 1696 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1697 { 1698 PyErr_BadInternalCall(); 1699 return -1; 1700 } 1701 return unicode_resize(p_unicode, length); 1702} 1703 1704/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1705 1706 WARNING: The function doesn't copy the terminating null character and 1707 doesn't check the maximum character (may write a latin1 character in an 1708 ASCII string). 
*/ 1709static void 1710unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1711 const char *str, Py_ssize_t len) 1712{ 1713 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1714 void *data = PyUnicode_DATA(unicode); 1715 const char *end = str + len; 1716 1717 switch (kind) { 1718 case PyUnicode_1BYTE_KIND: { 1719 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1720#ifdef Py_DEBUG 1721 if (PyUnicode_IS_ASCII(unicode)) { 1722 Py_UCS4 maxchar = ucs1lib_find_max_char( 1723 (const Py_UCS1*)str, 1724 (const Py_UCS1*)str + len); 1725 assert(maxchar < 128); 1726 } 1727#endif 1728 memcpy((char *) data + index, str, len); 1729 break; 1730 } 1731 case PyUnicode_2BYTE_KIND: { 1732 Py_UCS2 *start = (Py_UCS2 *)data + index; 1733 Py_UCS2 *ucs2 = start; 1734 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1735 1736 for (; str < end; ++ucs2, ++str) 1737 *ucs2 = (Py_UCS2)*str; 1738 1739 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1740 break; 1741 } 1742 default: { 1743 Py_UCS4 *start = (Py_UCS4 *)data + index; 1744 Py_UCS4 *ucs4 = start; 1745 assert(kind == PyUnicode_4BYTE_KIND); 1746 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1747 1748 for (; str < end; ++ucs4, ++str) 1749 *ucs4 = (Py_UCS4)*str; 1750 1751 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1752 } 1753 } 1754} 1755 1756static PyObject* 1757get_latin1_char(unsigned char ch) 1758{ 1759 PyObject *unicode = unicode_latin1[ch]; 1760 if (!unicode) { 1761 unicode = PyUnicode_New(1, ch); 1762 if (!unicode) 1763 return NULL; 1764 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1765 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1766 unicode_latin1[ch] = unicode; 1767 } 1768 Py_INCREF(unicode); 1769 return unicode; 1770} 1771 1772static PyObject* 1773unicode_char(Py_UCS4 ch) 1774{ 1775 PyObject *unicode; 1776 1777 assert(ch <= MAX_UNICODE); 1778 1779 if (ch < 256) 1780 return get_latin1_char(ch); 1781 1782 unicode = PyUnicode_New(1, ch); 1783 if (unicode == NULL) 1784 return NULL; 1785 switch 
(PyUnicode_KIND(unicode)) { 1786 case PyUnicode_1BYTE_KIND: 1787 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch; 1788 break; 1789 case PyUnicode_2BYTE_KIND: 1790 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; 1791 break; 1792 default: 1793 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1794 PyUnicode_4BYTE_DATA(unicode)[0] = ch; 1795 } 1796 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1797 return unicode; 1798} 1799 1800PyObject * 1801PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1802{ 1803 PyObject *unicode; 1804 Py_UCS4 maxchar = 0; 1805 Py_ssize_t num_surrogates; 1806 1807 if (u == NULL) 1808 return (PyObject*)_PyUnicode_New(size); 1809 1810 /* If the Unicode data is known at construction time, we can apply 1811 some optimizations which share commonly used objects. */ 1812 1813 /* Optimization for empty strings */ 1814 if (size == 0) 1815 _Py_RETURN_UNICODE_EMPTY(); 1816 1817 /* Single character Unicode objects in the Latin-1 range are 1818 shared when using this constructor */ 1819 if (size == 1 && (Py_UCS4)*u < 256) 1820 return get_latin1_char((unsigned char)*u); 1821 1822 /* If not empty and not single character, copy the Unicode data 1823 into the new object */ 1824 if (find_maxchar_surrogates(u, u + size, 1825 &maxchar, &num_surrogates) == -1) 1826 return NULL; 1827 1828 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1829 if (!unicode) 1830 return NULL; 1831 1832 switch (PyUnicode_KIND(unicode)) { 1833 case PyUnicode_1BYTE_KIND: 1834 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1835 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1836 break; 1837 case PyUnicode_2BYTE_KIND: 1838#if Py_UNICODE_SIZE == 2 1839 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1840#else 1841 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1842 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1843#endif 1844 break; 1845 case PyUnicode_4BYTE_KIND: 1846#if SIZEOF_WCHAR_T == 2 1847 /* This is the only case which has to process surrogates, 
thus 1848 a simple copy loop is not enough and we need a function. */ 1849 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1850#else 1851 assert(num_surrogates == 0); 1852 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1853#endif 1854 break; 1855 default: 1856 assert(0 && "Impossible state"); 1857 } 1858 1859 return unicode_result(unicode); 1860} 1861 1862PyObject * 1863PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1864{ 1865 if (size < 0) { 1866 PyErr_SetString(PyExc_SystemError, 1867 "Negative size passed to PyUnicode_FromStringAndSize"); 1868 return NULL; 1869 } 1870 if (u != NULL) 1871 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1872 else 1873 return (PyObject *)_PyUnicode_New(size); 1874} 1875 1876PyObject * 1877PyUnicode_FromString(const char *u) 1878{ 1879 size_t size = strlen(u); 1880 if (size > PY_SSIZE_T_MAX) { 1881 PyErr_SetString(PyExc_OverflowError, "input too long"); 1882 return NULL; 1883 } 1884 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1885} 1886 1887PyObject * 1888_PyUnicode_FromId(_Py_Identifier *id) 1889{ 1890 if (!id->object) { 1891 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1892 strlen(id->string), 1893 NULL, NULL); 1894 if (!id->object) 1895 return NULL; 1896 PyUnicode_InternInPlace(&id->object); 1897 assert(!id->next); 1898 id->next = static_strings; 1899 static_strings = id; 1900 } 1901 return id->object; 1902} 1903 1904void 1905_PyUnicode_ClearStaticStrings() 1906{ 1907 _Py_Identifier *tmp, *s = static_strings; 1908 while (s) { 1909 Py_CLEAR(s->object); 1910 tmp = s->next; 1911 s->next = NULL; 1912 s = tmp; 1913 } 1914 static_strings = NULL; 1915} 1916 1917/* Internal function, doesn't check maximum character */ 1918 1919PyObject* 1920_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1921{ 1922 const unsigned char *s = (const unsigned char *)buffer; 1923 PyObject *unicode; 1924 if (size == 1) { 1925#ifdef Py_DEBUG 1926 assert((unsigned char)s[0] < 128); 
1927#endif 1928 return get_latin1_char(s[0]); 1929 } 1930 unicode = PyUnicode_New(size, 127); 1931 if (!unicode) 1932 return NULL; 1933 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1934 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1935 return unicode; 1936} 1937 1938static Py_UCS4 1939kind_maxchar_limit(unsigned int kind) 1940{ 1941 switch (kind) { 1942 case PyUnicode_1BYTE_KIND: 1943 return 0x80; 1944 case PyUnicode_2BYTE_KIND: 1945 return 0x100; 1946 case PyUnicode_4BYTE_KIND: 1947 return 0x10000; 1948 default: 1949 assert(0 && "invalid kind"); 1950 return MAX_UNICODE; 1951 } 1952} 1953 1954Py_LOCAL_INLINE(Py_UCS4) 1955align_maxchar(Py_UCS4 maxchar) 1956{ 1957 if (maxchar <= 127) 1958 return 127; 1959 else if (maxchar <= 255) 1960 return 255; 1961 else if (maxchar <= 65535) 1962 return 65535; 1963 else 1964 return MAX_UNICODE; 1965} 1966 1967static PyObject* 1968_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) 1969{ 1970 PyObject *res; 1971 unsigned char max_char; 1972 1973 if (size == 0) 1974 _Py_RETURN_UNICODE_EMPTY(); 1975 assert(size > 0); 1976 if (size == 1) 1977 return get_latin1_char(u[0]); 1978 1979 max_char = ucs1lib_find_max_char(u, u + size); 1980 res = PyUnicode_New(size, max_char); 1981 if (!res) 1982 return NULL; 1983 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1984 assert(_PyUnicode_CheckConsistency(res, 1)); 1985 return res; 1986} 1987 1988static PyObject* 1989_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1990{ 1991 PyObject *res; 1992 Py_UCS2 max_char; 1993 1994 if (size == 0) 1995 _Py_RETURN_UNICODE_EMPTY(); 1996 assert(size > 0); 1997 if (size == 1) 1998 return unicode_char(u[0]); 1999 2000 max_char = ucs2lib_find_max_char(u, u + size); 2001 res = PyUnicode_New(size, max_char); 2002 if (!res) 2003 return NULL; 2004 if (max_char >= 256) 2005 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 2006 else { 2007 _PyUnicode_CONVERT_BYTES( 2008 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 2009 } 2010 
assert(_PyUnicode_CheckConsistency(res, 1)); 2011 return res; 2012} 2013 2014static PyObject* 2015_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 2016{ 2017 PyObject *res; 2018 Py_UCS4 max_char; 2019 2020 if (size == 0) 2021 _Py_RETURN_UNICODE_EMPTY(); 2022 assert(size > 0); 2023 if (size == 1) 2024 return unicode_char(u[0]); 2025 2026 max_char = ucs4lib_find_max_char(u, u + size); 2027 res = PyUnicode_New(size, max_char); 2028 if (!res) 2029 return NULL; 2030 if (max_char < 256) 2031 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2032 PyUnicode_1BYTE_DATA(res)); 2033 else if (max_char < 0x10000) 2034 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2035 PyUnicode_2BYTE_DATA(res)); 2036 else 2037 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2038 assert(_PyUnicode_CheckConsistency(res, 1)); 2039 return res; 2040} 2041 2042PyObject* 2043PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2044{ 2045 if (size < 0) { 2046 PyErr_SetString(PyExc_ValueError, "size must be positive"); 2047 return NULL; 2048 } 2049 switch (kind) { 2050 case PyUnicode_1BYTE_KIND: 2051 return _PyUnicode_FromUCS1(buffer, size); 2052 case PyUnicode_2BYTE_KIND: 2053 return _PyUnicode_FromUCS2(buffer, size); 2054 case PyUnicode_4BYTE_KIND: 2055 return _PyUnicode_FromUCS4(buffer, size); 2056 default: 2057 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2058 return NULL; 2059 } 2060} 2061 2062Py_UCS4 2063_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2064{ 2065 enum PyUnicode_Kind kind; 2066 void *startptr, *endptr; 2067 2068 assert(PyUnicode_IS_READY(unicode)); 2069 assert(0 <= start); 2070 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2071 assert(start <= end); 2072 2073 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2074 return PyUnicode_MAX_CHAR_VALUE(unicode); 2075 2076 if (start == end) 2077 return 127; 2078 2079 if (PyUnicode_IS_ASCII(unicode)) 2080 return 127; 2081 2082 kind = 
PyUnicode_KIND(unicode); 2083 startptr = PyUnicode_DATA(unicode); 2084 endptr = (char *)startptr + end * kind; 2085 startptr = (char *)startptr + start * kind; 2086 switch(kind) { 2087 case PyUnicode_1BYTE_KIND: 2088 return ucs1lib_find_max_char(startptr, endptr); 2089 case PyUnicode_2BYTE_KIND: 2090 return ucs2lib_find_max_char(startptr, endptr); 2091 case PyUnicode_4BYTE_KIND: 2092 return ucs4lib_find_max_char(startptr, endptr); 2093 default: 2094 assert(0); 2095 return 0; 2096 } 2097} 2098 2099/* Ensure that a string uses the most efficient storage, if it is not the 2100 case: create a new string with of the right kind. Write NULL into *p_unicode 2101 on error. */ 2102static void 2103unicode_adjust_maxchar(PyObject **p_unicode) 2104{ 2105 PyObject *unicode, *copy; 2106 Py_UCS4 max_char; 2107 Py_ssize_t len; 2108 unsigned int kind; 2109 2110 assert(p_unicode != NULL); 2111 unicode = *p_unicode; 2112 assert(PyUnicode_IS_READY(unicode)); 2113 if (PyUnicode_IS_ASCII(unicode)) 2114 return; 2115 2116 len = PyUnicode_GET_LENGTH(unicode); 2117 kind = PyUnicode_KIND(unicode); 2118 if (kind == PyUnicode_1BYTE_KIND) { 2119 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2120 max_char = ucs1lib_find_max_char(u, u + len); 2121 if (max_char >= 128) 2122 return; 2123 } 2124 else if (kind == PyUnicode_2BYTE_KIND) { 2125 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2126 max_char = ucs2lib_find_max_char(u, u + len); 2127 if (max_char >= 256) 2128 return; 2129 } 2130 else { 2131 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2132 assert(kind == PyUnicode_4BYTE_KIND); 2133 max_char = ucs4lib_find_max_char(u, u + len); 2134 if (max_char >= 0x10000) 2135 return; 2136 } 2137 copy = PyUnicode_New(len, max_char); 2138 if (copy != NULL) 2139 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2140 Py_DECREF(unicode); 2141 *p_unicode = copy; 2142} 2143 2144PyObject* 2145_PyUnicode_Copy(PyObject *unicode) 2146{ 2147 Py_ssize_t length; 2148 PyObject *copy; 2149 2150 if 
(!PyUnicode_Check(unicode)) { 2151 PyErr_BadInternalCall(); 2152 return NULL; 2153 } 2154 if (PyUnicode_READY(unicode) == -1) 2155 return NULL; 2156 2157 length = PyUnicode_GET_LENGTH(unicode); 2158 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2159 if (!copy) 2160 return NULL; 2161 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2162 2163 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2164 length * PyUnicode_KIND(unicode)); 2165 assert(_PyUnicode_CheckConsistency(copy, 1)); 2166 return copy; 2167} 2168 2169 2170/* Widen Unicode objects to larger buffers. Don't write terminating null 2171 character. Return NULL on error. */ 2172 2173void* 2174_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2175{ 2176 Py_ssize_t len; 2177 void *result; 2178 unsigned int skind; 2179 2180 if (PyUnicode_READY(s) == -1) 2181 return NULL; 2182 2183 len = PyUnicode_GET_LENGTH(s); 2184 skind = PyUnicode_KIND(s); 2185 if (skind >= kind) { 2186 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2187 return NULL; 2188 } 2189 switch (kind) { 2190 case PyUnicode_2BYTE_KIND: 2191 result = PyMem_New(Py_UCS2, len); 2192 if (!result) 2193 return PyErr_NoMemory(); 2194 assert(skind == PyUnicode_1BYTE_KIND); 2195 _PyUnicode_CONVERT_BYTES( 2196 Py_UCS1, Py_UCS2, 2197 PyUnicode_1BYTE_DATA(s), 2198 PyUnicode_1BYTE_DATA(s) + len, 2199 result); 2200 return result; 2201 case PyUnicode_4BYTE_KIND: 2202 result = PyMem_New(Py_UCS4, len); 2203 if (!result) 2204 return PyErr_NoMemory(); 2205 if (skind == PyUnicode_2BYTE_KIND) { 2206 _PyUnicode_CONVERT_BYTES( 2207 Py_UCS2, Py_UCS4, 2208 PyUnicode_2BYTE_DATA(s), 2209 PyUnicode_2BYTE_DATA(s) + len, 2210 result); 2211 } 2212 else { 2213 assert(skind == PyUnicode_1BYTE_KIND); 2214 _PyUnicode_CONVERT_BYTES( 2215 Py_UCS1, Py_UCS4, 2216 PyUnicode_1BYTE_DATA(s), 2217 PyUnicode_1BYTE_DATA(s) + len, 2218 result); 2219 } 2220 return result; 2221 default: 2222 break; 2223 } 2224 PyErr_SetString(PyExc_SystemError, "invalid 
kind"); 2225 return NULL; 2226} 2227 2228static Py_UCS4* 2229as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2230 int copy_null) 2231{ 2232 int kind; 2233 void *data; 2234 Py_ssize_t len, targetlen; 2235 if (PyUnicode_READY(string) == -1) 2236 return NULL; 2237 kind = PyUnicode_KIND(string); 2238 data = PyUnicode_DATA(string); 2239 len = PyUnicode_GET_LENGTH(string); 2240 targetlen = len; 2241 if (copy_null) 2242 targetlen++; 2243 if (!target) { 2244 target = PyMem_New(Py_UCS4, targetlen); 2245 if (!target) { 2246 PyErr_NoMemory(); 2247 return NULL; 2248 } 2249 } 2250 else { 2251 if (targetsize < targetlen) { 2252 PyErr_Format(PyExc_SystemError, 2253 "string is longer than the buffer"); 2254 if (copy_null && 0 < targetsize) 2255 target[0] = 0; 2256 return NULL; 2257 } 2258 } 2259 if (kind == PyUnicode_1BYTE_KIND) { 2260 Py_UCS1 *start = (Py_UCS1 *) data; 2261 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2262 } 2263 else if (kind == PyUnicode_2BYTE_KIND) { 2264 Py_UCS2 *start = (Py_UCS2 *) data; 2265 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2266 } 2267 else { 2268 assert(kind == PyUnicode_4BYTE_KIND); 2269 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2270 } 2271 if (copy_null) 2272 target[len] = 0; 2273 return target; 2274} 2275 2276Py_UCS4* 2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2278 int copy_null) 2279{ 2280 if (target == NULL || targetsize < 0) { 2281 PyErr_BadInternalCall(); 2282 return NULL; 2283 } 2284 return as_ucs4(string, target, targetsize, copy_null); 2285} 2286 2287Py_UCS4* 2288PyUnicode_AsUCS4Copy(PyObject *string) 2289{ 2290 return as_ucs4(string, NULL, 0, 1); 2291} 2292 2293#ifdef HAVE_WCHAR_H 2294 2295PyObject * 2296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size) 2297{ 2298 if (w == NULL) { 2299 if (size == 0) 2300 _Py_RETURN_UNICODE_EMPTY(); 2301 PyErr_BadInternalCall(); 2302 return NULL; 2303 } 2304 2305 if (size == -1) { 
2306 size = wcslen(w); 2307 } 2308 2309 return PyUnicode_FromUnicode(w, size); 2310} 2311 2312#endif /* HAVE_WCHAR_H */ 2313 2314/* maximum number of characters required for output of %lld or %p. 2315 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2316 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2317#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2318 2319static int 2320unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, 2321 Py_ssize_t width, Py_ssize_t precision) 2322{ 2323 Py_ssize_t length, fill, arglen; 2324 Py_UCS4 maxchar; 2325 2326 if (PyUnicode_READY(str) == -1) 2327 return -1; 2328 2329 length = PyUnicode_GET_LENGTH(str); 2330 if ((precision == -1 || precision >= length) 2331 && width <= length) 2332 return _PyUnicodeWriter_WriteStr(writer, str); 2333 2334 if (precision != -1) 2335 length = Py_MIN(precision, length); 2336 2337 arglen = Py_MAX(length, width); 2338 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 2339 maxchar = _PyUnicode_FindMaxChar(str, 0, length); 2340 else 2341 maxchar = writer->maxchar; 2342 2343 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) 2344 return -1; 2345 2346 if (width > length) { 2347 fill = width - length; 2348 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) 2349 return -1; 2350 writer->pos += fill; 2351 } 2352 2353 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 2354 str, 0, length); 2355 writer->pos += length; 2356 return 0; 2357} 2358 2359static int 2360unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, 2361 Py_ssize_t width, Py_ssize_t precision) 2362{ 2363 /* UTF-8 */ 2364 Py_ssize_t length; 2365 PyObject *unicode; 2366 int res; 2367 2368 length = strlen(str); 2369 if (precision != -1) 2370 length = Py_MIN(length, precision); 2371 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); 2372 if (unicode == NULL) 2373 return -1; 2374 2375 res = 
unicode_fromformat_write_str(writer, unicode, width, -1); 2376 Py_DECREF(unicode); 2377 return res; 2378} 2379 2380static const char* 2381unicode_fromformat_arg(_PyUnicodeWriter *writer, 2382 const char *f, va_list *vargs) 2383{ 2384 const char *p; 2385 Py_ssize_t len; 2386 int zeropad; 2387 Py_ssize_t width; 2388 Py_ssize_t precision; 2389 int longflag; 2390 int longlongflag; 2391 int size_tflag; 2392 Py_ssize_t fill; 2393 2394 p = f; 2395 f++; 2396 zeropad = 0; 2397 if (*f == '0') { 2398 zeropad = 1; 2399 f++; 2400 } 2401 2402 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2403 width = -1; 2404 if (Py_ISDIGIT((unsigned)*f)) { 2405 width = *f - '0'; 2406 f++; 2407 while (Py_ISDIGIT((unsigned)*f)) { 2408 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2409 PyErr_SetString(PyExc_ValueError, 2410 "width too big"); 2411 return NULL; 2412 } 2413 width = (width * 10) + (*f - '0'); 2414 f++; 2415 } 2416 } 2417 precision = -1; 2418 if (*f == '.') { 2419 f++; 2420 if (Py_ISDIGIT((unsigned)*f)) { 2421 precision = (*f - '0'); 2422 f++; 2423 while (Py_ISDIGIT((unsigned)*f)) { 2424 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2425 PyErr_SetString(PyExc_ValueError, 2426 "precision too big"); 2427 return NULL; 2428 } 2429 precision = (precision * 10) + (*f - '0'); 2430 f++; 2431 } 2432 } 2433 if (*f == '%') { 2434 /* "%.3%s" => f points to "3" */ 2435 f--; 2436 } 2437 } 2438 if (*f == '\0') { 2439 /* bogus format "%.123" => go backward, f points to "3" */ 2440 f--; 2441 } 2442 2443 /* Handle %ld, %lu, %lld and %llu. */ 2444 longflag = 0; 2445 longlongflag = 0; 2446 size_tflag = 0; 2447 if (*f == 'l') { 2448 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2449 longflag = 1; 2450 ++f; 2451 } 2452#ifdef HAVE_LONG_LONG 2453 else if (f[1] == 'l' && 2454 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2455 longlongflag = 1; 2456 f += 2; 2457 } 2458#endif 2459 } 2460 /* handle the size_t flag. 
*/ 2461 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2462 size_tflag = 1; 2463 ++f; 2464 } 2465 2466 if (f[1] == '\0') 2467 writer->overallocate = 0; 2468 2469 switch (*f) { 2470 case 'c': 2471 { 2472 int ordinal = va_arg(*vargs, int); 2473 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2474 PyErr_SetString(PyExc_OverflowError, 2475 "character argument not in range(0x110000)"); 2476 return NULL; 2477 } 2478 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) 2479 return NULL; 2480 break; 2481 } 2482 2483 case 'i': 2484 case 'd': 2485 case 'u': 2486 case 'x': 2487 { 2488 /* used by sprintf */ 2489 char buffer[MAX_LONG_LONG_CHARS]; 2490 Py_ssize_t arglen; 2491 2492 if (*f == 'u') { 2493 if (longflag) 2494 len = sprintf(buffer, "%lu", 2495 va_arg(*vargs, unsigned long)); 2496#ifdef HAVE_LONG_LONG 2497 else if (longlongflag) 2498 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u", 2499 va_arg(*vargs, unsigned PY_LONG_LONG)); 2500#endif 2501 else if (size_tflag) 2502 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u", 2503 va_arg(*vargs, size_t)); 2504 else 2505 len = sprintf(buffer, "%u", 2506 va_arg(*vargs, unsigned int)); 2507 } 2508 else if (*f == 'x') { 2509 len = sprintf(buffer, "%x", va_arg(*vargs, int)); 2510 } 2511 else { 2512 if (longflag) 2513 len = sprintf(buffer, "%li", 2514 va_arg(*vargs, long)); 2515#ifdef HAVE_LONG_LONG 2516 else if (longlongflag) 2517 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i", 2518 va_arg(*vargs, PY_LONG_LONG)); 2519#endif 2520 else if (size_tflag) 2521 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i", 2522 va_arg(*vargs, Py_ssize_t)); 2523 else 2524 len = sprintf(buffer, "%i", 2525 va_arg(*vargs, int)); 2526 } 2527 assert(len >= 0); 2528 2529 if (precision < len) 2530 precision = len; 2531 2532 arglen = Py_MAX(precision, width); 2533 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) 2534 return NULL; 2535 2536 if (width > precision) { 2537 Py_UCS4 fillchar; 2538 fill = width - precision; 2539 
fillchar = zeropad?'0':' '; 2540 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) 2541 return NULL; 2542 writer->pos += fill; 2543 } 2544 if (precision > len) { 2545 fill = precision - len; 2546 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) 2547 return NULL; 2548 writer->pos += fill; 2549 } 2550 2551 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) 2552 return NULL; 2553 break; 2554 } 2555 2556 case 'p': 2557 { 2558 char number[MAX_LONG_LONG_CHARS]; 2559 2560 len = sprintf(number, "%p", va_arg(*vargs, void*)); 2561 assert(len >= 0); 2562 2563 /* %p is ill-defined: ensure leading 0x. */ 2564 if (number[1] == 'X') 2565 number[1] = 'x'; 2566 else if (number[1] != 'x') { 2567 memmove(number + 2, number, 2568 strlen(number) + 1); 2569 number[0] = '0'; 2570 number[1] = 'x'; 2571 len += 2; 2572 } 2573 2574 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) 2575 return NULL; 2576 break; 2577 } 2578 2579 case 's': 2580 { 2581 /* UTF-8 */ 2582 const char *s = va_arg(*vargs, const char*); 2583 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) 2584 return NULL; 2585 break; 2586 } 2587 2588 case 'U': 2589 { 2590 PyObject *obj = va_arg(*vargs, PyObject *); 2591 assert(obj && _PyUnicode_CHECK(obj)); 2592 2593 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2594 return NULL; 2595 break; 2596 } 2597 2598 case 'V': 2599 { 2600 PyObject *obj = va_arg(*vargs, PyObject *); 2601 const char *str = va_arg(*vargs, const char *); 2602 if (obj) { 2603 assert(_PyUnicode_CHECK(obj)); 2604 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2605 return NULL; 2606 } 2607 else { 2608 assert(str != NULL); 2609 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) 2610 return NULL; 2611 } 2612 break; 2613 } 2614 2615 case 'S': 2616 { 2617 PyObject *obj = va_arg(*vargs, PyObject *); 2618 PyObject *str; 2619 assert(obj); 2620 str = PyObject_Str(obj); 2621 
if (!str) 2622 return NULL; 2623 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { 2624 Py_DECREF(str); 2625 return NULL; 2626 } 2627 Py_DECREF(str); 2628 break; 2629 } 2630 2631 case 'R': 2632 { 2633 PyObject *obj = va_arg(*vargs, PyObject *); 2634 PyObject *repr; 2635 assert(obj); 2636 repr = PyObject_Repr(obj); 2637 if (!repr) 2638 return NULL; 2639 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { 2640 Py_DECREF(repr); 2641 return NULL; 2642 } 2643 Py_DECREF(repr); 2644 break; 2645 } 2646 2647 case 'A': 2648 { 2649 PyObject *obj = va_arg(*vargs, PyObject *); 2650 PyObject *ascii; 2651 assert(obj); 2652 ascii = PyObject_ASCII(obj); 2653 if (!ascii) 2654 return NULL; 2655 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { 2656 Py_DECREF(ascii); 2657 return NULL; 2658 } 2659 Py_DECREF(ascii); 2660 break; 2661 } 2662 2663 case '%': 2664 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 2665 return NULL; 2666 break; 2667 2668 default: 2669 /* if we stumble upon an unknown formatting code, copy the rest 2670 of the format string to the output string. (we cannot just 2671 skip the code, since there's no way to know what's in the 2672 argument list) */ 2673 len = strlen(p); 2674 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) 2675 return NULL; 2676 f = p+len; 2677 return f; 2678 } 2679 2680 f++; 2681 return f; 2682} 2683 2684PyObject * 2685PyUnicode_FromFormatV(const char *format, va_list vargs) 2686{ 2687 va_list vargs2; 2688 const char *f; 2689 _PyUnicodeWriter writer; 2690 2691 _PyUnicodeWriter_Init(&writer); 2692 writer.min_length = strlen(format) + 100; 2693 writer.overallocate = 1; 2694 2695 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). 2696 Copy it to be able to pass a reference to a subfunction. 
*/ 2697 Py_VA_COPY(vargs2, vargs); 2698 2699 for (f = format; *f; ) { 2700 if (*f == '%') { 2701 f = unicode_fromformat_arg(&writer, f, &vargs2); 2702 if (f == NULL) 2703 goto fail; 2704 } 2705 else { 2706 const char *p; 2707 Py_ssize_t len; 2708 2709 p = f; 2710 do 2711 { 2712 if ((unsigned char)*p > 127) { 2713 PyErr_Format(PyExc_ValueError, 2714 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2715 "string, got a non-ASCII byte: 0x%02x", 2716 (unsigned char)*p); 2717 return NULL; 2718 } 2719 p++; 2720 } 2721 while (*p != '\0' && *p != '%'); 2722 len = p - f; 2723 2724 if (*p == '\0') 2725 writer.overallocate = 0; 2726 2727 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) 2728 goto fail; 2729 2730 f = p; 2731 } 2732 } 2733 return _PyUnicodeWriter_Finish(&writer); 2734 2735 fail: 2736 _PyUnicodeWriter_Dealloc(&writer); 2737 return NULL; 2738} 2739 2740PyObject * 2741PyUnicode_FromFormat(const char *format, ...) 2742{ 2743 PyObject* ret; 2744 va_list vargs; 2745 2746#ifdef HAVE_STDARG_PROTOTYPES 2747 va_start(vargs, format); 2748#else 2749 va_start(vargs); 2750#endif 2751 ret = PyUnicode_FromFormatV(format, vargs); 2752 va_end(vargs); 2753 return ret; 2754} 2755 2756#ifdef HAVE_WCHAR_H 2757 2758/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2759 convert a Unicode object to a wide character string. 2760 2761 - If w is NULL: return the number of wide characters (including the null 2762 character) required to convert the unicode object. Ignore size argument. 2763 2764 - Otherwise: return the number of wide characters (excluding the null 2765 character) written into w. Write at most size wide characters (including 2766 the null character). 
*/ 2767static Py_ssize_t 2768unicode_aswidechar(PyObject *unicode, 2769 wchar_t *w, 2770 Py_ssize_t size) 2771{ 2772 Py_ssize_t res; 2773 const wchar_t *wstr; 2774 2775 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2776 if (wstr == NULL) 2777 return -1; 2778 2779 if (w != NULL) { 2780 if (size > res) 2781 size = res + 1; 2782 else 2783 res = size; 2784 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2785 return res; 2786 } 2787 else 2788 return res + 1; 2789} 2790 2791Py_ssize_t 2792PyUnicode_AsWideChar(PyObject *unicode, 2793 wchar_t *w, 2794 Py_ssize_t size) 2795{ 2796 if (unicode == NULL) { 2797 PyErr_BadInternalCall(); 2798 return -1; 2799 } 2800 return unicode_aswidechar(unicode, w, size); 2801} 2802 2803wchar_t* 2804PyUnicode_AsWideCharString(PyObject *unicode, 2805 Py_ssize_t *size) 2806{ 2807 wchar_t* buffer; 2808 Py_ssize_t buflen; 2809 2810 if (unicode == NULL) { 2811 PyErr_BadInternalCall(); 2812 return NULL; 2813 } 2814 2815 buflen = unicode_aswidechar(unicode, NULL, 0); 2816 if (buflen == -1) 2817 return NULL; 2818 buffer = PyMem_NEW(wchar_t, buflen); 2819 if (buffer == NULL) { 2820 PyErr_NoMemory(); 2821 return NULL; 2822 } 2823 buflen = unicode_aswidechar(unicode, buffer, buflen); 2824 if (buflen == -1) { 2825 PyMem_FREE(buffer); 2826 return NULL; 2827 } 2828 if (size != NULL) 2829 *size = buflen; 2830 return buffer; 2831} 2832 2833#endif /* HAVE_WCHAR_H */ 2834 2835PyObject * 2836PyUnicode_FromOrdinal(int ordinal) 2837{ 2838 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2839 PyErr_SetString(PyExc_ValueError, 2840 "chr() arg not in range(0x110000)"); 2841 return NULL; 2842 } 2843 2844 return unicode_char((Py_UCS4)ordinal); 2845} 2846 2847PyObject * 2848PyUnicode_FromObject(PyObject *obj) 2849{ 2850 /* XXX Perhaps we should make this API an alias of 2851 PyObject_Str() instead ?! 
*/ 2852 if (PyUnicode_CheckExact(obj)) { 2853 if (PyUnicode_READY(obj) == -1) 2854 return NULL; 2855 Py_INCREF(obj); 2856 return obj; 2857 } 2858 if (PyUnicode_Check(obj)) { 2859 /* For a Unicode subtype that's not a Unicode object, 2860 return a true Unicode object with the same data. */ 2861 return _PyUnicode_Copy(obj); 2862 } 2863 PyErr_Format(PyExc_TypeError, 2864 "Can't convert '%.100s' object to str implicitly", 2865 Py_TYPE(obj)->tp_name); 2866 return NULL; 2867} 2868 2869PyObject * 2870PyUnicode_FromEncodedObject(PyObject *obj, 2871 const char *encoding, 2872 const char *errors) 2873{ 2874 Py_buffer buffer; 2875 PyObject *v; 2876 2877 if (obj == NULL) { 2878 PyErr_BadInternalCall(); 2879 return NULL; 2880 } 2881 2882 /* Decoding bytes objects is the most common case and should be fast */ 2883 if (PyBytes_Check(obj)) { 2884 if (PyBytes_GET_SIZE(obj) == 0) 2885 _Py_RETURN_UNICODE_EMPTY(); 2886 v = PyUnicode_Decode( 2887 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2888 encoding, errors); 2889 return v; 2890 } 2891 2892 if (PyUnicode_Check(obj)) { 2893 PyErr_SetString(PyExc_TypeError, 2894 "decoding str is not supported"); 2895 return NULL; 2896 } 2897 2898 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2899 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2900 PyErr_Format(PyExc_TypeError, 2901 "coercing to str: need a bytes-like object, %.80s found", 2902 Py_TYPE(obj)->tp_name); 2903 return NULL; 2904 } 2905 2906 if (buffer.len == 0) { 2907 PyBuffer_Release(&buffer); 2908 _Py_RETURN_UNICODE_EMPTY(); 2909 } 2910 2911 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2912 PyBuffer_Release(&buffer); 2913 return v; 2914} 2915 2916/* Convert encoding to lower case and replace '_' with '-' in order to 2917 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2918 1 on success. 
*/ 2919int 2920_Py_normalize_encoding(const char *encoding, 2921 char *lower, 2922 size_t lower_len) 2923{ 2924 const char *e; 2925 char *l; 2926 char *l_end; 2927 2928 if (encoding == NULL) { 2929 /* 6 == strlen("utf-8") + 1 */ 2930 if (lower_len < 6) 2931 return 0; 2932 strcpy(lower, "utf-8"); 2933 return 1; 2934 } 2935 e = encoding; 2936 l = lower; 2937 l_end = &lower[lower_len - 1]; 2938 while (*e) { 2939 if (l == l_end) 2940 return 0; 2941 if (Py_ISUPPER(*e)) { 2942 *l++ = Py_TOLOWER(*e++); 2943 } 2944 else if (*e == '_') { 2945 *l++ = '-'; 2946 e++; 2947 } 2948 else { 2949 *l++ = *e++; 2950 } 2951 } 2952 *l = '\0'; 2953 return 1; 2954} 2955 2956PyObject * 2957PyUnicode_Decode(const char *s, 2958 Py_ssize_t size, 2959 const char *encoding, 2960 const char *errors) 2961{ 2962 PyObject *buffer = NULL, *unicode; 2963 Py_buffer info; 2964 char lower[11]; /* Enough for any encoding shortcut */ 2965 2966 /* Shortcuts for common default encodings */ 2967 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 2968 if ((strcmp(lower, "utf-8") == 0) || 2969 (strcmp(lower, "utf8") == 0)) 2970 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2971 else if ((strcmp(lower, "latin-1") == 0) || 2972 (strcmp(lower, "latin1") == 0) || 2973 (strcmp(lower, "iso-8859-1") == 0) || 2974 (strcmp(lower, "iso8859-1") == 0)) 2975 return PyUnicode_DecodeLatin1(s, size, errors); 2976#ifdef HAVE_MBCS 2977 else if (strcmp(lower, "mbcs") == 0) 2978 return PyUnicode_DecodeMBCS(s, size, errors); 2979#endif 2980 else if (strcmp(lower, "ascii") == 0) 2981 return PyUnicode_DecodeASCII(s, size, errors); 2982 else if (strcmp(lower, "utf-16") == 0) 2983 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2984 else if (strcmp(lower, "utf-32") == 0) 2985 return PyUnicode_DecodeUTF32(s, size, errors, 0); 2986 } 2987 2988 /* Decode via the codec registry */ 2989 buffer = NULL; 2990 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2991 goto onError; 2992 buffer = 
PyMemoryView_FromBuffer(&info); 2993 if (buffer == NULL) 2994 goto onError; 2995 unicode = _PyCodec_DecodeText(buffer, encoding, errors); 2996 if (unicode == NULL) 2997 goto onError; 2998 if (!PyUnicode_Check(unicode)) { 2999 PyErr_Format(PyExc_TypeError, 3000 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3001 "use codecs.decode() to decode to arbitrary types", 3002 encoding, 3003 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3004 Py_DECREF(unicode); 3005 goto onError; 3006 } 3007 Py_DECREF(buffer); 3008 return unicode_result(unicode); 3009 3010 onError: 3011 Py_XDECREF(buffer); 3012 return NULL; 3013} 3014 3015PyObject * 3016PyUnicode_AsDecodedObject(PyObject *unicode, 3017 const char *encoding, 3018 const char *errors) 3019{ 3020 PyObject *v; 3021 3022 if (!PyUnicode_Check(unicode)) { 3023 PyErr_BadArgument(); 3024 goto onError; 3025 } 3026 3027 if (encoding == NULL) 3028 encoding = PyUnicode_GetDefaultEncoding(); 3029 3030 /* Decode via the codec registry */ 3031 v = PyCodec_Decode(unicode, encoding, errors); 3032 if (v == NULL) 3033 goto onError; 3034 return unicode_result(v); 3035 3036 onError: 3037 return NULL; 3038} 3039 3040PyObject * 3041PyUnicode_AsDecodedUnicode(PyObject *unicode, 3042 const char *encoding, 3043 const char *errors) 3044{ 3045 PyObject *v; 3046 3047 if (!PyUnicode_Check(unicode)) { 3048 PyErr_BadArgument(); 3049 goto onError; 3050 } 3051 3052 if (encoding == NULL) 3053 encoding = PyUnicode_GetDefaultEncoding(); 3054 3055 /* Decode via the codec registry */ 3056 v = PyCodec_Decode(unicode, encoding, errors); 3057 if (v == NULL) 3058 goto onError; 3059 if (!PyUnicode_Check(v)) { 3060 PyErr_Format(PyExc_TypeError, 3061 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3062 "use codecs.decode() to decode to arbitrary types", 3063 encoding, 3064 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3065 Py_DECREF(v); 3066 goto onError; 3067 } 3068 return unicode_result(v); 3069 3070 onError: 3071 return NULL; 3072} 
3073 3074PyObject * 3075PyUnicode_Encode(const Py_UNICODE *s, 3076 Py_ssize_t size, 3077 const char *encoding, 3078 const char *errors) 3079{ 3080 PyObject *v, *unicode; 3081 3082 unicode = PyUnicode_FromUnicode(s, size); 3083 if (unicode == NULL) 3084 return NULL; 3085 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3086 Py_DECREF(unicode); 3087 return v; 3088} 3089 3090PyObject * 3091PyUnicode_AsEncodedObject(PyObject *unicode, 3092 const char *encoding, 3093 const char *errors) 3094{ 3095 PyObject *v; 3096 3097 if (!PyUnicode_Check(unicode)) { 3098 PyErr_BadArgument(); 3099 goto onError; 3100 } 3101 3102 if (encoding == NULL) 3103 encoding = PyUnicode_GetDefaultEncoding(); 3104 3105 /* Encode via the codec registry */ 3106 v = PyCodec_Encode(unicode, encoding, errors); 3107 if (v == NULL) 3108 goto onError; 3109 return v; 3110 3111 onError: 3112 return NULL; 3113} 3114 3115static size_t 3116wcstombs_errorpos(const wchar_t *wstr) 3117{ 3118 size_t len; 3119#if SIZEOF_WCHAR_T == 2 3120 wchar_t buf[3]; 3121#else 3122 wchar_t buf[2]; 3123#endif 3124 char outbuf[MB_LEN_MAX]; 3125 const wchar_t *start, *previous; 3126 3127#if SIZEOF_WCHAR_T == 2 3128 buf[2] = 0; 3129#else 3130 buf[1] = 0; 3131#endif 3132 start = wstr; 3133 while (*wstr != L'\0') 3134 { 3135 previous = wstr; 3136#if SIZEOF_WCHAR_T == 2 3137 if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) 3138 && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) 3139 { 3140 buf[0] = wstr[0]; 3141 buf[1] = wstr[1]; 3142 wstr += 2; 3143 } 3144 else { 3145 buf[0] = *wstr; 3146 buf[1] = 0; 3147 wstr++; 3148 } 3149#else 3150 buf[0] = *wstr; 3151 wstr++; 3152#endif 3153 len = wcstombs(outbuf, buf, sizeof(outbuf)); 3154 if (len == (size_t)-1) 3155 return previous - start; 3156 } 3157 3158 /* failed to find the unencodable character */ 3159 return 0; 3160} 3161 3162static int 3163locale_error_handler(const char *errors, int *surrogateescape) 3164{ 3165 if (errors == NULL) { 3166 *surrogateescape = 0; 3167 return 0; 3168 } 3169 3170 if 
(strcmp(errors, "strict") == 0) { 3171 *surrogateescape = 0; 3172 return 0; 3173 } 3174 if (strcmp(errors, "surrogateescape") == 0) { 3175 *surrogateescape = 1; 3176 return 0; 3177 } 3178 PyErr_Format(PyExc_ValueError, 3179 "only 'strict' and 'surrogateescape' error handlers " 3180 "are supported, not '%s'", 3181 errors); 3182 return -1; 3183} 3184 3185PyObject * 3186PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3187{ 3188 Py_ssize_t wlen, wlen2; 3189 wchar_t *wstr; 3190 PyObject *bytes = NULL; 3191 char *errmsg; 3192 PyObject *reason = NULL; 3193 PyObject *exc; 3194 size_t error_pos; 3195 int surrogateescape; 3196 3197 if (locale_error_handler(errors, &surrogateescape) < 0) 3198 return NULL; 3199 3200 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3201 if (wstr == NULL) 3202 return NULL; 3203 3204 wlen2 = wcslen(wstr); 3205 if (wlen2 != wlen) { 3206 PyMem_Free(wstr); 3207 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3208 return NULL; 3209 } 3210 3211 if (surrogateescape) { 3212 /* "surrogateescape" error handler */ 3213 char *str; 3214 3215 str = Py_EncodeLocale(wstr, &error_pos); 3216 if (str == NULL) { 3217 if (error_pos == (size_t)-1) { 3218 PyErr_NoMemory(); 3219 PyMem_Free(wstr); 3220 return NULL; 3221 } 3222 else { 3223 goto encode_error; 3224 } 3225 } 3226 PyMem_Free(wstr); 3227 3228 bytes = PyBytes_FromString(str); 3229 PyMem_Free(str); 3230 } 3231 else { 3232 /* strict mode */ 3233 size_t len, len2; 3234 3235 len = wcstombs(NULL, wstr, 0); 3236 if (len == (size_t)-1) { 3237 error_pos = (size_t)-1; 3238 goto encode_error; 3239 } 3240 3241 bytes = PyBytes_FromStringAndSize(NULL, len); 3242 if (bytes == NULL) { 3243 PyMem_Free(wstr); 3244 return NULL; 3245 } 3246 3247 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3248 if (len2 == (size_t)-1 || len2 > len) { 3249 error_pos = (size_t)-1; 3250 goto encode_error; 3251 } 3252 PyMem_Free(wstr); 3253 } 3254 return bytes; 3255 3256encode_error: 3257 errmsg = 
strerror(errno); 3258 assert(errmsg != NULL); 3259 3260 if (error_pos == (size_t)-1) 3261 error_pos = wcstombs_errorpos(wstr); 3262 3263 PyMem_Free(wstr); 3264 Py_XDECREF(bytes); 3265 3266 if (errmsg != NULL) { 3267 size_t errlen; 3268 wstr = Py_DecodeLocale(errmsg, &errlen); 3269 if (wstr != NULL) { 3270 reason = PyUnicode_FromWideChar(wstr, errlen); 3271 PyMem_RawFree(wstr); 3272 } else 3273 errmsg = NULL; 3274 } 3275 if (errmsg == NULL) 3276 reason = PyUnicode_FromString( 3277 "wcstombs() encountered an unencodable " 3278 "wide character"); 3279 if (reason == NULL) 3280 return NULL; 3281 3282 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3283 "locale", unicode, 3284 (Py_ssize_t)error_pos, 3285 (Py_ssize_t)(error_pos+1), 3286 reason); 3287 Py_DECREF(reason); 3288 if (exc != NULL) { 3289 PyCodec_StrictErrors(exc); 3290 Py_XDECREF(exc); 3291 } 3292 return NULL; 3293} 3294 3295PyObject * 3296PyUnicode_EncodeFSDefault(PyObject *unicode) 3297{ 3298#ifdef HAVE_MBCS 3299 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 3300#elif defined(__APPLE__) 3301 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3302#else 3303 PyInterpreterState *interp = PyThreadState_GET()->interp; 3304 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3305 cannot use it to encode and decode filenames before it is loaded. Load 3306 the Python codec requires to encode at least its own filename. Use the C 3307 version of the locale codec until the codec registry is initialized and 3308 the Python codec is loaded. 3309 3310 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3311 cannot only rely on it: check also interp->fscodec_initialized for 3312 subinterpreters. 
*/ 3313 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3314 return PyUnicode_AsEncodedString(unicode, 3315 Py_FileSystemDefaultEncoding, 3316 "surrogateescape"); 3317 } 3318 else { 3319 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3320 } 3321#endif 3322} 3323 3324PyObject * 3325PyUnicode_AsEncodedString(PyObject *unicode, 3326 const char *encoding, 3327 const char *errors) 3328{ 3329 PyObject *v; 3330 char lower[11]; /* Enough for any encoding shortcut */ 3331 3332 if (!PyUnicode_Check(unicode)) { 3333 PyErr_BadArgument(); 3334 return NULL; 3335 } 3336 3337 /* Shortcuts for common default encodings */ 3338 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 3339 if ((strcmp(lower, "utf-8") == 0) || 3340 (strcmp(lower, "utf8") == 0)) 3341 { 3342 if (errors == NULL || strcmp(errors, "strict") == 0) 3343 return _PyUnicode_AsUTF8String(unicode, NULL); 3344 else 3345 return _PyUnicode_AsUTF8String(unicode, errors); 3346 } 3347 else if ((strcmp(lower, "latin-1") == 0) || 3348 (strcmp(lower, "latin1") == 0) || 3349 (strcmp(lower, "iso-8859-1") == 0) || 3350 (strcmp(lower, "iso8859-1") == 0)) 3351 return _PyUnicode_AsLatin1String(unicode, errors); 3352#ifdef HAVE_MBCS 3353 else if (strcmp(lower, "mbcs") == 0) 3354 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3355#endif 3356 else if (strcmp(lower, "ascii") == 0) 3357 return _PyUnicode_AsASCIIString(unicode, errors); 3358 } 3359 3360 /* Encode via the codec registry */ 3361 v = _PyCodec_EncodeText(unicode, encoding, errors); 3362 if (v == NULL) 3363 return NULL; 3364 3365 /* The normal path */ 3366 if (PyBytes_Check(v)) 3367 return v; 3368 3369 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3370 if (PyByteArray_Check(v)) { 3371 int error; 3372 PyObject *b; 3373 3374 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3375 "encoder %s returned bytearray instead of bytes; " 3376 "use codecs.encode() to encode to arbitrary types", 3377 encoding); 
3378 if (error) { 3379 Py_DECREF(v); 3380 return NULL; 3381 } 3382 3383 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3384 Py_DECREF(v); 3385 return b; 3386 } 3387 3388 PyErr_Format(PyExc_TypeError, 3389 "'%.400s' encoder returned '%.400s' instead of 'bytes'; " 3390 "use codecs.encode() to encode to arbitrary types", 3391 encoding, 3392 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name); 3393 Py_DECREF(v); 3394 return NULL; 3395} 3396 3397PyObject * 3398PyUnicode_AsEncodedUnicode(PyObject *unicode, 3399 const char *encoding, 3400 const char *errors) 3401{ 3402 PyObject *v; 3403 3404 if (!PyUnicode_Check(unicode)) { 3405 PyErr_BadArgument(); 3406 goto onError; 3407 } 3408 3409 if (encoding == NULL) 3410 encoding = PyUnicode_GetDefaultEncoding(); 3411 3412 /* Encode via the codec registry */ 3413 v = PyCodec_Encode(unicode, encoding, errors); 3414 if (v == NULL) 3415 goto onError; 3416 if (!PyUnicode_Check(v)) { 3417 PyErr_Format(PyExc_TypeError, 3418 "'%.400s' encoder returned '%.400s' instead of 'str'; " 3419 "use codecs.encode() to encode to arbitrary types", 3420 encoding, 3421 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name); 3422 Py_DECREF(v); 3423 goto onError; 3424 } 3425 return v; 3426 3427 onError: 3428 return NULL; 3429} 3430 3431static size_t 3432mbstowcs_errorpos(const char *str, size_t len) 3433{ 3434#ifdef HAVE_MBRTOWC 3435 const char *start = str; 3436 mbstate_t mbs; 3437 size_t converted; 3438 wchar_t ch; 3439 3440 memset(&mbs, 0, sizeof mbs); 3441 while (len) 3442 { 3443 converted = mbrtowc(&ch, str, len, &mbs); 3444 if (converted == 0) 3445 /* Reached end of string */ 3446 break; 3447 if (converted == (size_t)-1 || converted == (size_t)-2) { 3448 /* Conversion error or incomplete character */ 3449 return str - start; 3450 } 3451 else { 3452 str += converted; 3453 len -= converted; 3454 } 3455 } 3456 /* failed to find the undecodable byte sequence */ 3457 return 0; 3458#endif 3459 return 0; 3460} 3461 3462PyObject* 
3463PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3464 const char *errors) 3465{ 3466 wchar_t smallbuf[256]; 3467 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3468 wchar_t *wstr; 3469 size_t wlen, wlen2; 3470 PyObject *unicode; 3471 int surrogateescape; 3472 size_t error_pos; 3473 char *errmsg; 3474 PyObject *reason = NULL; /* initialize to prevent gcc warning */ 3475 PyObject *exc; 3476 3477 if (locale_error_handler(errors, &surrogateescape) < 0) 3478 return NULL; 3479 3480 if (str[len] != '\0' || (size_t)len != strlen(str)) { 3481 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3482 return NULL; 3483 } 3484 3485 if (surrogateescape) { 3486 /* "surrogateescape" error handler */ 3487 wstr = Py_DecodeLocale(str, &wlen); 3488 if (wstr == NULL) { 3489 if (wlen == (size_t)-1) 3490 PyErr_NoMemory(); 3491 else 3492 PyErr_SetFromErrno(PyExc_OSError); 3493 return NULL; 3494 } 3495 3496 unicode = PyUnicode_FromWideChar(wstr, wlen); 3497 PyMem_RawFree(wstr); 3498 } 3499 else { 3500 /* strict mode */ 3501#ifndef HAVE_BROKEN_MBSTOWCS 3502 wlen = mbstowcs(NULL, str, 0); 3503#else 3504 wlen = len; 3505#endif 3506 if (wlen == (size_t)-1) 3507 goto decode_error; 3508 if (wlen+1 <= smallbuf_len) { 3509 wstr = smallbuf; 3510 } 3511 else { 3512 wstr = PyMem_New(wchar_t, wlen+1); 3513 if (!wstr) 3514 return PyErr_NoMemory(); 3515 } 3516 3517 wlen2 = mbstowcs(wstr, str, wlen+1); 3518 if (wlen2 == (size_t)-1) { 3519 if (wstr != smallbuf) 3520 PyMem_Free(wstr); 3521 goto decode_error; 3522 } 3523#ifdef HAVE_BROKEN_MBSTOWCS 3524 assert(wlen2 == wlen); 3525#endif 3526 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3527 if (wstr != smallbuf) 3528 PyMem_Free(wstr); 3529 } 3530 return unicode; 3531 3532decode_error: 3533 errmsg = strerror(errno); 3534 assert(errmsg != NULL); 3535 3536 error_pos = mbstowcs_errorpos(str, len); 3537 if (errmsg != NULL) { 3538 size_t errlen; 3539 wstr = Py_DecodeLocale(errmsg, &errlen); 3540 if (wstr != NULL) { 3541 reason = 
PyUnicode_FromWideChar(wstr, errlen); 3542 PyMem_RawFree(wstr); 3543 } else 3544 errmsg = NULL; 3545 } 3546 if (errmsg == NULL) 3547 reason = PyUnicode_FromString( 3548 "mbstowcs() encountered an invalid multibyte sequence"); 3549 if (reason == NULL) 3550 return NULL; 3551 3552 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3553 "locale", str, len, 3554 (Py_ssize_t)error_pos, 3555 (Py_ssize_t)(error_pos+1), 3556 reason); 3557 Py_DECREF(reason); 3558 if (exc != NULL) { 3559 PyCodec_StrictErrors(exc); 3560 Py_XDECREF(exc); 3561 } 3562 return NULL; 3563} 3564 3565PyObject* 3566PyUnicode_DecodeLocale(const char *str, const char *errors) 3567{ 3568 Py_ssize_t size = (Py_ssize_t)strlen(str); 3569 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3570} 3571 3572 3573PyObject* 3574PyUnicode_DecodeFSDefault(const char *s) { 3575 Py_ssize_t size = (Py_ssize_t)strlen(s); 3576 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3577} 3578 3579PyObject* 3580PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3581{ 3582#ifdef HAVE_MBCS 3583 return PyUnicode_DecodeMBCS(s, size, NULL); 3584#elif defined(__APPLE__) 3585 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3586#else 3587 PyInterpreterState *interp = PyThreadState_GET()->interp; 3588 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3589 cannot use it to encode and decode filenames before it is loaded. Load 3590 the Python codec requires to encode at least its own filename. Use the C 3591 version of the locale codec until the codec registry is initialized and 3592 the Python codec is loaded. 3593 3594 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3595 cannot only rely on it: check also interp->fscodec_initialized for 3596 subinterpreters. 
*/ 3597 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3598 return PyUnicode_Decode(s, size, 3599 Py_FileSystemDefaultEncoding, 3600 "surrogateescape"); 3601 } 3602 else { 3603 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3604 } 3605#endif 3606} 3607 3608 3609int 3610PyUnicode_FSConverter(PyObject* arg, void* addr) 3611{ 3612 PyObject *output = NULL; 3613 Py_ssize_t size; 3614 void *data; 3615 if (arg == NULL) { 3616 Py_DECREF(*(PyObject**)addr); 3617 return 1; 3618 } 3619 if (PyBytes_Check(arg)) { 3620 output = arg; 3621 Py_INCREF(output); 3622 } 3623 else { 3624 arg = PyUnicode_FromObject(arg); 3625 if (!arg) 3626 return 0; 3627 output = PyUnicode_EncodeFSDefault(arg); 3628 Py_DECREF(arg); 3629 if (!output) 3630 return 0; 3631 if (!PyBytes_Check(output)) { 3632 Py_DECREF(output); 3633 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3634 return 0; 3635 } 3636 } 3637 size = PyBytes_GET_SIZE(output); 3638 data = PyBytes_AS_STRING(output); 3639 if ((size_t)size != strlen(data)) { 3640 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3641 Py_DECREF(output); 3642 return 0; 3643 } 3644 *(PyObject**)addr = output; 3645 return Py_CLEANUP_SUPPORTED; 3646} 3647 3648 3649int 3650PyUnicode_FSDecoder(PyObject* arg, void* addr) 3651{ 3652 PyObject *output = NULL; 3653 if (arg == NULL) { 3654 Py_DECREF(*(PyObject**)addr); 3655 return 1; 3656 } 3657 if (PyUnicode_Check(arg)) { 3658 if (PyUnicode_READY(arg) == -1) 3659 return 0; 3660 output = arg; 3661 Py_INCREF(output); 3662 } 3663 else { 3664 arg = PyBytes_FromObject(arg); 3665 if (!arg) 3666 return 0; 3667 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3668 PyBytes_GET_SIZE(arg)); 3669 Py_DECREF(arg); 3670 if (!output) 3671 return 0; 3672 if (!PyUnicode_Check(output)) { 3673 Py_DECREF(output); 3674 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3675 return 0; 3676 } 3677 } 3678 if (PyUnicode_READY(output) == -1) { 3679 
Py_DECREF(output); 3680 return 0; 3681 } 3682 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3683 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3684 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3685 Py_DECREF(output); 3686 return 0; 3687 } 3688 *(PyObject**)addr = output; 3689 return Py_CLEANUP_SUPPORTED; 3690} 3691 3692 3693char* 3694PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3695{ 3696 PyObject *bytes; 3697 3698 if (!PyUnicode_Check(unicode)) { 3699 PyErr_BadArgument(); 3700 return NULL; 3701 } 3702 if (PyUnicode_READY(unicode) == -1) 3703 return NULL; 3704 3705 if (PyUnicode_UTF8(unicode) == NULL) { 3706 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3707 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3708 if (bytes == NULL) 3709 return NULL; 3710 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3711 if (_PyUnicode_UTF8(unicode) == NULL) { 3712 PyErr_NoMemory(); 3713 Py_DECREF(bytes); 3714 return NULL; 3715 } 3716 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3717 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3718 PyBytes_AS_STRING(bytes), 3719 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3720 Py_DECREF(bytes); 3721 } 3722 3723 if (psize) 3724 *psize = PyUnicode_UTF8_LENGTH(unicode); 3725 return PyUnicode_UTF8(unicode); 3726} 3727 3728char* 3729PyUnicode_AsUTF8(PyObject *unicode) 3730{ 3731 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3732} 3733 3734Py_UNICODE * 3735PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3736{ 3737 const unsigned char *one_byte; 3738#if SIZEOF_WCHAR_T == 4 3739 const Py_UCS2 *two_bytes; 3740#else 3741 const Py_UCS4 *four_bytes; 3742 const Py_UCS4 *ucs4_end; 3743 Py_ssize_t num_surrogates; 3744#endif 3745 wchar_t *w; 3746 wchar_t *wchar_end; 3747 3748 if (!PyUnicode_Check(unicode)) { 3749 PyErr_BadArgument(); 3750 return NULL; 3751 } 3752 if (_PyUnicode_WSTR(unicode) == NULL) { 3753 /* Non-ASCII compact unicode object */ 3754 
assert(_PyUnicode_KIND(unicode) != 0); 3755 assert(PyUnicode_IS_READY(unicode)); 3756 3757 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3758#if SIZEOF_WCHAR_T == 2 3759 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3760 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3761 num_surrogates = 0; 3762 3763 for (; four_bytes < ucs4_end; ++four_bytes) { 3764 if (*four_bytes > 0xFFFF) 3765 ++num_surrogates; 3766 } 3767 3768 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3769 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3770 if (!_PyUnicode_WSTR(unicode)) { 3771 PyErr_NoMemory(); 3772 return NULL; 3773 } 3774 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3775 3776 w = _PyUnicode_WSTR(unicode); 3777 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3778 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3779 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3780 if (*four_bytes > 0xFFFF) { 3781 assert(*four_bytes <= MAX_UNICODE); 3782 /* encode surrogate pair in this case */ 3783 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3784 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3785 } 3786 else 3787 *w = *four_bytes; 3788 3789 if (w > wchar_end) { 3790 assert(0 && "Miscalculated string end"); 3791 } 3792 } 3793 *w = 0; 3794#else 3795 /* sizeof(wchar_t) == 4 */ 3796 Py_FatalError("Impossible unicode object state, wstr and str " 3797 "should share memory already."); 3798 return NULL; 3799#endif 3800 } 3801 else { 3802 if ((size_t)_PyUnicode_LENGTH(unicode) > 3803 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 3804 PyErr_NoMemory(); 3805 return NULL; 3806 } 3807 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3808 (_PyUnicode_LENGTH(unicode) + 1)); 3809 if (!_PyUnicode_WSTR(unicode)) { 3810 PyErr_NoMemory(); 3811 return NULL; 3812 } 3813 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3814 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3815 w = _PyUnicode_WSTR(unicode); 3816 wchar_end 
= w + _PyUnicode_LENGTH(unicode); 3817 3818 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3819 one_byte = PyUnicode_1BYTE_DATA(unicode); 3820 for (; w < wchar_end; ++one_byte, ++w) 3821 *w = *one_byte; 3822 /* null-terminate the wstr */ 3823 *w = 0; 3824 } 3825 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3826#if SIZEOF_WCHAR_T == 4 3827 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3828 for (; w < wchar_end; ++two_bytes, ++w) 3829 *w = *two_bytes; 3830 /* null-terminate the wstr */ 3831 *w = 0; 3832#else 3833 /* sizeof(wchar_t) == 2 */ 3834 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3835 _PyUnicode_WSTR(unicode) = NULL; 3836 Py_FatalError("Impossible unicode object state, wstr " 3837 "and str should share memory already."); 3838 return NULL; 3839#endif 3840 } 3841 else { 3842 assert(0 && "This should never happen."); 3843 } 3844 } 3845 } 3846 if (size != NULL) 3847 *size = PyUnicode_WSTR_LENGTH(unicode); 3848 return _PyUnicode_WSTR(unicode); 3849} 3850 3851Py_UNICODE * 3852PyUnicode_AsUnicode(PyObject *unicode) 3853{ 3854 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3855} 3856 3857 3858Py_ssize_t 3859PyUnicode_GetSize(PyObject *unicode) 3860{ 3861 if (!PyUnicode_Check(unicode)) { 3862 PyErr_BadArgument(); 3863 goto onError; 3864 } 3865 return PyUnicode_GET_SIZE(unicode); 3866 3867 onError: 3868 return -1; 3869} 3870 3871Py_ssize_t 3872PyUnicode_GetLength(PyObject *unicode) 3873{ 3874 if (!PyUnicode_Check(unicode)) { 3875 PyErr_BadArgument(); 3876 return -1; 3877 } 3878 if (PyUnicode_READY(unicode) == -1) 3879 return -1; 3880 return PyUnicode_GET_LENGTH(unicode); 3881} 3882 3883Py_UCS4 3884PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3885{ 3886 void *data; 3887 int kind; 3888 3889 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3890 PyErr_BadArgument(); 3891 return (Py_UCS4)-1; 3892 } 3893 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3894 PyErr_SetString(PyExc_IndexError, "string index out of 
range"); 3895 return (Py_UCS4)-1; 3896 } 3897 data = PyUnicode_DATA(unicode); 3898 kind = PyUnicode_KIND(unicode); 3899 return PyUnicode_READ(kind, data, index); 3900} 3901 3902int 3903PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3904{ 3905 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3906 PyErr_BadArgument(); 3907 return -1; 3908 } 3909 assert(PyUnicode_IS_READY(unicode)); 3910 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3911 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3912 return -1; 3913 } 3914 if (unicode_check_modifiable(unicode)) 3915 return -1; 3916 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 3917 PyErr_SetString(PyExc_ValueError, "character out of range"); 3918 return -1; 3919 } 3920 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3921 index, ch); 3922 return 0; 3923} 3924 3925const char * 3926PyUnicode_GetDefaultEncoding(void) 3927{ 3928 return "utf-8"; 3929} 3930 3931/* create or adjust a UnicodeDecodeError */ 3932static void 3933make_decode_exception(PyObject **exceptionObject, 3934 const char *encoding, 3935 const char *input, Py_ssize_t length, 3936 Py_ssize_t startpos, Py_ssize_t endpos, 3937 const char *reason) 3938{ 3939 if (*exceptionObject == NULL) { 3940 *exceptionObject = PyUnicodeDecodeError_Create( 3941 encoding, input, length, startpos, endpos, reason); 3942 } 3943 else { 3944 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3945 goto onError; 3946 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3947 goto onError; 3948 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3949 goto onError; 3950 } 3951 return; 3952 3953onError: 3954 Py_CLEAR(*exceptionObject); 3955} 3956 3957#ifdef HAVE_MBCS 3958/* error handling callback helper: 3959 build arguments, call the callback and check the arguments, 3960 if no exception occurred, copy the replacement to the output 3961 and adjust various state variables. 
3962 return 0 on success, -1 on error 3963*/ 3964 3965static int 3966unicode_decode_call_errorhandler_wchar( 3967 const char *errors, PyObject **errorHandler, 3968 const char *encoding, const char *reason, 3969 const char **input, const char **inend, Py_ssize_t *startinpos, 3970 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3971 PyObject **output, Py_ssize_t *outpos) 3972{ 3973 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3974 3975 PyObject *restuple = NULL; 3976 PyObject *repunicode = NULL; 3977 Py_ssize_t outsize; 3978 Py_ssize_t insize; 3979 Py_ssize_t requiredsize; 3980 Py_ssize_t newpos; 3981 PyObject *inputobj = NULL; 3982 wchar_t *repwstr; 3983 Py_ssize_t repwlen; 3984 3985 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND); 3986 outsize = _PyUnicode_WSTR_LENGTH(*output); 3987 3988 if (*errorHandler == NULL) { 3989 *errorHandler = PyCodec_LookupError(errors); 3990 if (*errorHandler == NULL) 3991 goto onError; 3992 } 3993 3994 make_decode_exception(exceptionObject, 3995 encoding, 3996 *input, *inend - *input, 3997 *startinpos, *endinpos, 3998 reason); 3999 if (*exceptionObject == NULL) 4000 goto onError; 4001 4002 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4003 if (restuple == NULL) 4004 goto onError; 4005 if (!PyTuple_Check(restuple)) { 4006 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4007 goto onError; 4008 } 4009 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4010 goto onError; 4011 4012 /* Copy back the bytes variables, which might have been modified by the 4013 callback */ 4014 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4015 if (!inputobj) 4016 goto onError; 4017 if (!PyBytes_Check(inputobj)) { 4018 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4019 } 4020 *input = PyBytes_AS_STRING(inputobj); 4021 insize = PyBytes_GET_SIZE(inputobj); 4022 *inend = *input + insize; 
4023 /* we can DECREF safely, as the exception has another reference, 4024 so the object won't go away. */ 4025 Py_DECREF(inputobj); 4026 4027 if (newpos<0) 4028 newpos = insize+newpos; 4029 if (newpos<0 || newpos>insize) { 4030 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4031 goto onError; 4032 } 4033 4034 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4035 if (repwstr == NULL) 4036 goto onError; 4037 /* need more space? (at least enough for what we 4038 have+the replacement+the rest of the string (starting 4039 at the new input position), so we won't have to check space 4040 when there are no errors in the rest of the string) */ 4041 requiredsize = *outpos; 4042 if (requiredsize > PY_SSIZE_T_MAX - repwlen) 4043 goto overflow; 4044 requiredsize += repwlen; 4045 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 4046 goto overflow; 4047 requiredsize += insize - newpos; 4048 if (requiredsize > outsize) { 4049 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 4050 requiredsize = 2*outsize; 4051 if (unicode_resize(output, requiredsize) < 0) 4052 goto onError; 4053 } 4054 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4055 *outpos += repwlen; 4056 *endinpos = newpos; 4057 *inptr = *input + newpos; 4058 4059 /* we made it! 
*/ 4060 Py_XDECREF(restuple); 4061 return 0; 4062 4063 overflow: 4064 PyErr_SetString(PyExc_OverflowError, 4065 "decoded result is too long for a Python string"); 4066 4067 onError: 4068 Py_XDECREF(restuple); 4069 return -1; 4070} 4071#endif /* HAVE_MBCS */ 4072 4073static int 4074unicode_decode_call_errorhandler_writer( 4075 const char *errors, PyObject **errorHandler, 4076 const char *encoding, const char *reason, 4077 const char **input, const char **inend, Py_ssize_t *startinpos, 4078 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4079 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4080{ 4081 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4082 4083 PyObject *restuple = NULL; 4084 PyObject *repunicode = NULL; 4085 Py_ssize_t insize; 4086 Py_ssize_t newpos; 4087 Py_ssize_t replen; 4088 PyObject *inputobj = NULL; 4089 4090 if (*errorHandler == NULL) { 4091 *errorHandler = PyCodec_LookupError(errors); 4092 if (*errorHandler == NULL) 4093 goto onError; 4094 } 4095 4096 make_decode_exception(exceptionObject, 4097 encoding, 4098 *input, *inend - *input, 4099 *startinpos, *endinpos, 4100 reason); 4101 if (*exceptionObject == NULL) 4102 goto onError; 4103 4104 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4105 if (restuple == NULL) 4106 goto onError; 4107 if (!PyTuple_Check(restuple)) { 4108 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4109 goto onError; 4110 } 4111 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4112 goto onError; 4113 4114 /* Copy back the bytes variables, which might have been modified by the 4115 callback */ 4116 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4117 if (!inputobj) 4118 goto onError; 4119 if (!PyBytes_Check(inputobj)) { 4120 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4121 } 4122 *input = PyBytes_AS_STRING(inputobj); 4123 insize = 
PyBytes_GET_SIZE(inputobj); 4124 *inend = *input + insize; 4125 /* we can DECREF safely, as the exception has another reference, 4126 so the object won't go away. */ 4127 Py_DECREF(inputobj); 4128 4129 if (newpos<0) 4130 newpos = insize+newpos; 4131 if (newpos<0 || newpos>insize) { 4132 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4133 goto onError; 4134 } 4135 4136 if (PyUnicode_READY(repunicode) < 0) 4137 goto onError; 4138 replen = PyUnicode_GET_LENGTH(repunicode); 4139 if (replen > 1) { 4140 writer->min_length += replen - 1; 4141 writer->overallocate = 1; 4142 if (_PyUnicodeWriter_Prepare(writer, writer->min_length, 4143 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1) 4144 goto onError; 4145 } 4146 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4147 goto onError; 4148 4149 *endinpos = newpos; 4150 *inptr = *input + newpos; 4151 4152 /* we made it! */ 4153 Py_XDECREF(restuple); 4154 return 0; 4155 4156 onError: 4157 Py_XDECREF(restuple); 4158 return -1; 4159} 4160 4161/* --- UTF-7 Codec -------------------------------------------------------- */ 4162 4163/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4164 4165/* Three simple macros defining base-64. */ 4166 4167/* Is c a base-64 character? */ 4168 4169#define IS_BASE64(c) \ 4170 (((c) >= 'A' && (c) <= 'Z') || \ 4171 ((c) >= 'a' && (c) <= 'z') || \ 4172 ((c) >= '0' && (c) <= '9') || \ 4173 (c) == '+' || (c) == '/') 4174 4175/* given that c is a base-64 character, what is its base-64 value? */ 4176 4177#define FROM_BASE64(c) \ 4178 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4179 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4180 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4181 (c) == '+' ? 62 : 63) 4182 4183/* What is the base-64 character of the bottom 6 bits of n? 
 */

/* Only the low 6 bits of n are used (n is masked with 0x3f). */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table has one entry per ASCII code (0..127); each row below covers
 * 16 consecutive codes, named in the comment above the row.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.
The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* Stateless UTF-7 decoding: PyUnicode_DecodeUTF7Stateful() with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate). */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;
    Py_ssize_t shiftOutStart;       /* writer position where the current
                                       shift sequence started */
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;          /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: written as is */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* the error handler may replace the input buffer and move s */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have moved s back into the input */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* report only the input before the unfinished shift sequence */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* build the result directly from the first shiftOutStart
                   characters instead of finishing the writer */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}


/* Encode a str object to UTF-7 and return a new bytes object.

   A nonzero base64SetO (resp. base64WhiteSpace) makes the "Set O" (resp.
   whitespace) characters base64-encoded instead of written directly: the
   flags are passed negated to ENCODE_DIRECT().  The errors argument is
   not used by this function.

   Returns NULL with an exception set on error. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;
    Py_ssize_t i;
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    char * out;
    char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* flush the pending bits and close a still open shift sequence */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    /* shrink the over-allocated buffer to the bytes actually written */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}

/* Py_UNICODE variant of _PyUnicode_EncodeUTF7(): builds a temporary str
   object from s and size, then encodes it. */
PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                     Py_ssize_t size,
                     int base64SetO,
                     int base64WhiteSpace,
                     const char *errors)
{
    PyObject *result;
    PyObject *tmp = PyUnicode_FromUnicode(s, size);
    if (tmp == NULL)
        return NULL;
    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
                                   base64WhiteSpace, errors);
    Py_DECREF(tmp);
    return result;
}

#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT

/* --- UTF-8 Codec -------------------------------------------------------- */

/* Stateless UTF-8 decoding: PyUnicode_DecodeUTF8Stateful() with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}

/* stringlib/codecs.h is included once per character width; the functions
   used below carry the asciilib_, ucs1lib_, ucs2lib_ and ucs4lib_ prefixes. */
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

/* Copy the leading run of ASCII bytes (high bit clear) of [start, end)
   to dest and return the number of bytes copied.  Scanning stops at the
   first byte with the high bit set. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* check and copy one 'long' at a time */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* remaining bytes, one at a time */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* generic path: only scan here, the bytes are copied by the final
       memcpy() */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}

/* Decode size bytes of UTF-8 from s and return a new str object.

   If consumed is not NULL, a decoder return value of 0 (reported as
   "unexpected end of data" otherwise) stops decoding without an error
   and the number of bytes decoded is stored in *consumed.

   Returns NULL with an exception set on error. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* copy the leading ASCII run directly into the writer buffer */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;
        /* use the decoder matching the current kind of the writer buffer */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* values 0 to 4 are status codes of the decoder; any other value
           is a character to write */
        switch (ch) {
        case 0:
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}

#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            assert(Py_UNICODE_IS_SURROGATE(ch));
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */

/* Primary internal function which creates utf8 encoded bytes objects.

   Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space needed at the end.
Else allocate the 4832 maximum possible needed (4 result bytes per Unicode character), and return 4833 the excess memory at the end. 4834*/ 4835PyObject * 4836_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4837{ 4838 enum PyUnicode_Kind kind; 4839 void *data; 4840 Py_ssize_t size; 4841 4842 if (!PyUnicode_Check(unicode)) { 4843 PyErr_BadArgument(); 4844 return NULL; 4845 } 4846 4847 if (PyUnicode_READY(unicode) == -1) 4848 return NULL; 4849 4850 if (PyUnicode_UTF8(unicode)) 4851 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4852 PyUnicode_UTF8_LENGTH(unicode)); 4853 4854 kind = PyUnicode_KIND(unicode); 4855 data = PyUnicode_DATA(unicode); 4856 size = PyUnicode_GET_LENGTH(unicode); 4857 4858 switch (kind) { 4859 default: 4860 assert(0); 4861 case PyUnicode_1BYTE_KIND: 4862 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4863 assert(!PyUnicode_IS_ASCII(unicode)); 4864 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4865 case PyUnicode_2BYTE_KIND: 4866 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4867 case PyUnicode_4BYTE_KIND: 4868 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4869 } 4870} 4871 4872PyObject * 4873PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4874 Py_ssize_t size, 4875 const char *errors) 4876{ 4877 PyObject *v, *unicode; 4878 4879 unicode = PyUnicode_FromUnicode(s, size); 4880 if (unicode == NULL) 4881 return NULL; 4882 v = _PyUnicode_AsUTF8String(unicode, errors); 4883 Py_DECREF(unicode); 4884 return v; 4885} 4886 4887PyObject * 4888PyUnicode_AsUTF8String(PyObject *unicode) 4889{ 4890 return _PyUnicode_AsUTF8String(unicode, NULL); 4891} 4892 4893/* --- UTF-32 Codec ------------------------------------------------------- */ 4894 4895PyObject * 4896PyUnicode_DecodeUTF32(const char *s, 4897 Py_ssize_t size, 4898 const char *errors, 4899 int *byteorder) 4900{ 4901 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4902} 4903 4904PyObject * 
4905PyUnicode_DecodeUTF32Stateful(const char *s, 4906 Py_ssize_t size, 4907 const char *errors, 4908 int *byteorder, 4909 Py_ssize_t *consumed) 4910{ 4911 const char *starts = s; 4912 Py_ssize_t startinpos; 4913 Py_ssize_t endinpos; 4914 _PyUnicodeWriter writer; 4915 const unsigned char *q, *e; 4916 int le, bo = 0; /* assume native ordering by default */ 4917 const char *encoding; 4918 const char *errmsg = ""; 4919 PyObject *errorHandler = NULL; 4920 PyObject *exc = NULL; 4921 4922 q = (unsigned char *)s; 4923 e = q + size; 4924 4925 if (byteorder) 4926 bo = *byteorder; 4927 4928 /* Check for BOM marks (U+FEFF) in the input and adjust current 4929 byte order setting accordingly. In native mode, the leading BOM 4930 mark is skipped, in all other modes, it is copied to the output 4931 stream as-is (giving a ZWNBSP character). */ 4932 if (bo == 0 && size >= 4) { 4933 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 4934 if (bom == 0x0000FEFF) { 4935 bo = -1; 4936 q += 4; 4937 } 4938 else if (bom == 0xFFFE0000) { 4939 bo = 1; 4940 q += 4; 4941 } 4942 if (byteorder) 4943 *byteorder = bo; 4944 } 4945 4946 if (q == e) { 4947 if (consumed) 4948 *consumed = size; 4949 _Py_RETURN_UNICODE_EMPTY(); 4950 } 4951 4952#ifdef WORDS_BIGENDIAN 4953 le = bo < 0; 4954#else 4955 le = bo <= 0; 4956#endif 4957 encoding = le ? 
"utf-32-le" : "utf-32-be"; 4958 4959 _PyUnicodeWriter_Init(&writer); 4960 writer.min_length = (e - q + 3) / 4; 4961 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 4962 goto onError; 4963 4964 while (1) { 4965 Py_UCS4 ch = 0; 4966 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 4967 4968 if (e - q >= 4) { 4969 enum PyUnicode_Kind kind = writer.kind; 4970 void *data = writer.data; 4971 const unsigned char *last = e - 4; 4972 Py_ssize_t pos = writer.pos; 4973 if (le) { 4974 do { 4975 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 4976 if (ch > maxch) 4977 break; 4978 if (kind != PyUnicode_1BYTE_KIND && 4979 Py_UNICODE_IS_SURROGATE(ch)) 4980 break; 4981 PyUnicode_WRITE(kind, data, pos++, ch); 4982 q += 4; 4983 } while (q <= last); 4984 } 4985 else { 4986 do { 4987 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 4988 if (ch > maxch) 4989 break; 4990 if (kind != PyUnicode_1BYTE_KIND && 4991 Py_UNICODE_IS_SURROGATE(ch)) 4992 break; 4993 PyUnicode_WRITE(kind, data, pos++, ch); 4994 q += 4; 4995 } while (q <= last); 4996 } 4997 writer.pos = pos; 4998 } 4999 5000 if (Py_UNICODE_IS_SURROGATE(ch)) { 5001 errmsg = "code point in surrogate code point range(0xd800, 0xe000)"; 5002 startinpos = ((const char *)q) - starts; 5003 endinpos = startinpos + 4; 5004 } 5005 else if (ch <= maxch) { 5006 if (q == e || consumed) 5007 break; 5008 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5009 errmsg = "truncated data"; 5010 startinpos = ((const char *)q) - starts; 5011 endinpos = ((const char *)e) - starts; 5012 } 5013 else { 5014 if (ch < 0x110000) { 5015 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5016 goto onError; 5017 q += 4; 5018 continue; 5019 } 5020 errmsg = "code point not in range(0x110000)"; 5021 startinpos = ((const char *)q) - starts; 5022 endinpos = startinpos + 4; 5023 } 5024 5025 /* The remaining input chars are ignored if the callback 5026 chooses to skip the input */ 5027 if (unicode_decode_call_errorhandler_writer( 5028 errors, &errorHandler, 5029 encoding, errmsg, 5030 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5031 &writer)) 5032 goto onError; 5033 } 5034 5035 if (consumed) 5036 *consumed = (const char *)q-starts; 5037 5038 Py_XDECREF(errorHandler); 5039 Py_XDECREF(exc); 5040 return _PyUnicodeWriter_Finish(&writer); 5041 5042 onError: 5043 _PyUnicodeWriter_Dealloc(&writer); 5044 Py_XDECREF(errorHandler); 5045 Py_XDECREF(exc); 5046 return NULL; 5047} 5048 5049PyObject * 5050_PyUnicode_EncodeUTF32(PyObject *str, 5051 const char *errors, 5052 int byteorder) 5053{ 5054 enum PyUnicode_Kind kind; 5055 const void *data; 5056 Py_ssize_t len; 5057 PyObject *v; 5058 PY_UINT32_T *out; 5059#if PY_LITTLE_ENDIAN 5060 int native_ordering = byteorder <= 0; 5061#else 5062 int native_ordering = byteorder >= 0; 5063#endif 5064 const char *encoding; 5065 Py_ssize_t nsize, pos; 5066 PyObject *errorHandler = NULL; 5067 PyObject *exc = NULL; 5068 PyObject *rep = NULL; 5069 5070 if (!PyUnicode_Check(str)) { 5071 PyErr_BadArgument(); 5072 return NULL; 5073 } 5074 if (PyUnicode_READY(str) == -1) 5075 return NULL; 5076 kind = PyUnicode_KIND(str); 5077 data = PyUnicode_DATA(str); 5078 len = PyUnicode_GET_LENGTH(str); 5079 5080 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) 5081 return PyErr_NoMemory(); 5082 nsize = len + (byteorder == 0); 5083 v = 
PyBytes_FromStringAndSize(NULL, nsize * 4); 5084 if (v == NULL) 5085 return NULL; 5086 5087 /* output buffer is 4-bytes aligned */ 5088 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); 5089 out = (PY_UINT32_T *)PyBytes_AS_STRING(v); 5090 if (byteorder == 0) 5091 *out++ = 0xFEFF; 5092 if (len == 0) 5093 goto done; 5094 5095 if (byteorder == -1) 5096 encoding = "utf-32-le"; 5097 else if (byteorder == 1) 5098 encoding = "utf-32-be"; 5099 else 5100 encoding = "utf-32"; 5101 5102 if (kind == PyUnicode_1BYTE_KIND) { 5103 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5104 goto done; 5105 } 5106 5107 pos = 0; 5108 while (pos < len) { 5109 Py_ssize_t repsize, moreunits; 5110 5111 if (kind == PyUnicode_2BYTE_KIND) { 5112 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, 5113 &out, native_ordering); 5114 } 5115 else { 5116 assert(kind == PyUnicode_4BYTE_KIND); 5117 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos, 5118 &out, native_ordering); 5119 } 5120 if (pos == len) 5121 break; 5122 5123 rep = unicode_encode_call_errorhandler( 5124 errors, &errorHandler, 5125 encoding, "surrogates not allowed", 5126 str, &exc, pos, pos + 1, &pos); 5127 if (!rep) 5128 goto error; 5129 5130 if (PyBytes_Check(rep)) { 5131 repsize = PyBytes_GET_SIZE(rep); 5132 if (repsize & 3) { 5133 raise_encode_exception(&exc, encoding, 5134 str, pos - 1, pos, 5135 "surrogates not allowed"); 5136 goto error; 5137 } 5138 moreunits = repsize / 4; 5139 } 5140 else { 5141 assert(PyUnicode_Check(rep)); 5142 if (PyUnicode_READY(rep) < 0) 5143 goto error; 5144 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5145 if (!PyUnicode_IS_ASCII(rep)) { 5146 raise_encode_exception(&exc, encoding, 5147 str, pos - 1, pos, 5148 "surrogates not allowed"); 5149 goto error; 5150 } 5151 } 5152 5153 /* four bytes are reserved for each surrogate */ 5154 if (moreunits > 1) { 5155 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v); 5156 Py_ssize_t morebytes = 4 
* (moreunits - 1); 5157 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5158 /* integer overflow */ 5159 PyErr_NoMemory(); 5160 goto error; 5161 } 5162 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5163 goto error; 5164 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos; 5165 } 5166 5167 if (PyBytes_Check(rep)) { 5168 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); 5169 out += moreunits; 5170 } else /* rep is unicode */ { 5171 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5172 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5173 &out, native_ordering); 5174 } 5175 5176 Py_CLEAR(rep); 5177 } 5178 5179 /* Cut back to size actually needed. This is necessary for, for example, 5180 encoding of a string containing isolated surrogates and the 'ignore' 5181 handler is used. */ 5182 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5183 if (nsize != PyBytes_GET_SIZE(v)) 5184 _PyBytes_Resize(&v, nsize); 5185 Py_XDECREF(errorHandler); 5186 Py_XDECREF(exc); 5187 done: 5188 return v; 5189 error: 5190 Py_XDECREF(rep); 5191 Py_XDECREF(errorHandler); 5192 Py_XDECREF(exc); 5193 Py_XDECREF(v); 5194 return NULL; 5195} 5196 5197PyObject * 5198PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5199 Py_ssize_t size, 5200 const char *errors, 5201 int byteorder) 5202{ 5203 PyObject *result; 5204 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5205 if (tmp == NULL) 5206 return NULL; 5207 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5208 Py_DECREF(tmp); 5209 return result; 5210} 5211 5212PyObject * 5213PyUnicode_AsUTF32String(PyObject *unicode) 5214{ 5215 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5216} 5217 5218/* --- UTF-16 Codec ------------------------------------------------------- */ 5219 5220PyObject * 5221PyUnicode_DecodeUTF16(const char *s, 5222 Py_ssize_t size, 5223 const char *errors, 5224 int *byteorder) 5225{ 5226 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5227} 5228 5229PyObject 
* 5230PyUnicode_DecodeUTF16Stateful(const char *s, 5231 Py_ssize_t size, 5232 const char *errors, 5233 int *byteorder, 5234 Py_ssize_t *consumed) 5235{ 5236 const char *starts = s; 5237 Py_ssize_t startinpos; 5238 Py_ssize_t endinpos; 5239 _PyUnicodeWriter writer; 5240 const unsigned char *q, *e; 5241 int bo = 0; /* assume native ordering by default */ 5242 int native_ordering; 5243 const char *errmsg = ""; 5244 PyObject *errorHandler = NULL; 5245 PyObject *exc = NULL; 5246 const char *encoding; 5247 5248 q = (unsigned char *)s; 5249 e = q + size; 5250 5251 if (byteorder) 5252 bo = *byteorder; 5253 5254 /* Check for BOM marks (U+FEFF) in the input and adjust current 5255 byte order setting accordingly. In native mode, the leading BOM 5256 mark is skipped, in all other modes, it is copied to the output 5257 stream as-is (giving a ZWNBSP character). */ 5258 if (bo == 0 && size >= 2) { 5259 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5260 if (bom == 0xFEFF) { 5261 q += 2; 5262 bo = -1; 5263 } 5264 else if (bom == 0xFFFE) { 5265 q += 2; 5266 bo = 1; 5267 } 5268 if (byteorder) 5269 *byteorder = bo; 5270 } 5271 5272 if (q == e) { 5273 if (consumed) 5274 *consumed = size; 5275 _Py_RETURN_UNICODE_EMPTY(); 5276 } 5277 5278#if PY_LITTLE_ENDIAN 5279 native_ordering = bo <= 0; 5280 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; 5281#else 5282 native_ordering = bo >= 0; 5283 encoding = bo >= 0 ? 
"utf-16-be" : "utf-16-le"; 5284#endif 5285 5286 /* Note: size will always be longer than the resulting Unicode 5287 character count */ 5288 _PyUnicodeWriter_Init(&writer); 5289 writer.min_length = (e - q + 1) / 2; 5290 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5291 goto onError; 5292 5293 while (1) { 5294 Py_UCS4 ch = 0; 5295 if (e - q >= 2) { 5296 int kind = writer.kind; 5297 if (kind == PyUnicode_1BYTE_KIND) { 5298 if (PyUnicode_IS_ASCII(writer.buffer)) 5299 ch = asciilib_utf16_decode(&q, e, 5300 (Py_UCS1*)writer.data, &writer.pos, 5301 native_ordering); 5302 else 5303 ch = ucs1lib_utf16_decode(&q, e, 5304 (Py_UCS1*)writer.data, &writer.pos, 5305 native_ordering); 5306 } else if (kind == PyUnicode_2BYTE_KIND) { 5307 ch = ucs2lib_utf16_decode(&q, e, 5308 (Py_UCS2*)writer.data, &writer.pos, 5309 native_ordering); 5310 } else { 5311 assert(kind == PyUnicode_4BYTE_KIND); 5312 ch = ucs4lib_utf16_decode(&q, e, 5313 (Py_UCS4*)writer.data, &writer.pos, 5314 native_ordering); 5315 } 5316 } 5317 5318 switch (ch) 5319 { 5320 case 0: 5321 /* remaining byte at the end? 
(size should be even) */ 5322 if (q == e || consumed) 5323 goto End; 5324 errmsg = "truncated data"; 5325 startinpos = ((const char *)q) - starts; 5326 endinpos = ((const char *)e) - starts; 5327 break; 5328 /* The remaining input chars are ignored if the callback 5329 chooses to skip the input */ 5330 case 1: 5331 q -= 2; 5332 if (consumed) 5333 goto End; 5334 errmsg = "unexpected end of data"; 5335 startinpos = ((const char *)q) - starts; 5336 endinpos = ((const char *)e) - starts; 5337 break; 5338 case 2: 5339 errmsg = "illegal encoding"; 5340 startinpos = ((const char *)q) - 2 - starts; 5341 endinpos = startinpos + 2; 5342 break; 5343 case 3: 5344 errmsg = "illegal UTF-16 surrogate"; 5345 startinpos = ((const char *)q) - 4 - starts; 5346 endinpos = startinpos + 2; 5347 break; 5348 default: 5349 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5350 goto onError; 5351 continue; 5352 } 5353 5354 if (unicode_decode_call_errorhandler_writer( 5355 errors, 5356 &errorHandler, 5357 encoding, errmsg, 5358 &starts, 5359 (const char **)&e, 5360 &startinpos, 5361 &endinpos, 5362 &exc, 5363 (const char **)&q, 5364 &writer)) 5365 goto onError; 5366 } 5367 5368End: 5369 if (consumed) 5370 *consumed = (const char *)q-starts; 5371 5372 Py_XDECREF(errorHandler); 5373 Py_XDECREF(exc); 5374 return _PyUnicodeWriter_Finish(&writer); 5375 5376 onError: 5377 _PyUnicodeWriter_Dealloc(&writer); 5378 Py_XDECREF(errorHandler); 5379 Py_XDECREF(exc); 5380 return NULL; 5381} 5382 5383PyObject * 5384_PyUnicode_EncodeUTF16(PyObject *str, 5385 const char *errors, 5386 int byteorder) 5387{ 5388 enum PyUnicode_Kind kind; 5389 const void *data; 5390 Py_ssize_t len; 5391 PyObject *v; 5392 unsigned short *out; 5393 Py_ssize_t pairs; 5394#if PY_BIG_ENDIAN 5395 int native_ordering = byteorder >= 0; 5396#else 5397 int native_ordering = byteorder <= 0; 5398#endif 5399 const char *encoding; 5400 Py_ssize_t nsize, pos; 5401 PyObject *errorHandler = NULL; 5402 PyObject *exc = NULL; 5403 PyObject *rep 
= NULL; 5404 5405 if (!PyUnicode_Check(str)) { 5406 PyErr_BadArgument(); 5407 return NULL; 5408 } 5409 if (PyUnicode_READY(str) == -1) 5410 return NULL; 5411 kind = PyUnicode_KIND(str); 5412 data = PyUnicode_DATA(str); 5413 len = PyUnicode_GET_LENGTH(str); 5414 5415 pairs = 0; 5416 if (kind == PyUnicode_4BYTE_KIND) { 5417 const Py_UCS4 *in = (const Py_UCS4 *)data; 5418 const Py_UCS4 *end = in + len; 5419 while (in < end) 5420 if (*in++ >= 0x10000) 5421 pairs++; 5422 } 5423 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5424 return PyErr_NoMemory(); 5425 nsize = len + pairs + (byteorder == 0); 5426 v = PyBytes_FromStringAndSize(NULL, nsize * 2); 5427 if (v == NULL) 5428 return NULL; 5429 5430 /* output buffer is 2-bytes aligned */ 5431 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5432 out = (unsigned short *)PyBytes_AS_STRING(v); 5433 if (byteorder == 0) 5434 *out++ = 0xFEFF; 5435 if (len == 0) 5436 goto done; 5437 5438 if (kind == PyUnicode_1BYTE_KIND) { 5439 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5440 goto done; 5441 } 5442 5443 if (byteorder < 0) 5444 encoding = "utf-16-le"; 5445 else if (byteorder > 0) 5446 encoding = "utf-16-be"; 5447 else 5448 encoding = "utf-16"; 5449 5450 pos = 0; 5451 while (pos < len) { 5452 Py_ssize_t repsize, moreunits; 5453 5454 if (kind == PyUnicode_2BYTE_KIND) { 5455 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, 5456 &out, native_ordering); 5457 } 5458 else { 5459 assert(kind == PyUnicode_4BYTE_KIND); 5460 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, 5461 &out, native_ordering); 5462 } 5463 if (pos == len) 5464 break; 5465 5466 rep = unicode_encode_call_errorhandler( 5467 errors, &errorHandler, 5468 encoding, "surrogates not allowed", 5469 str, &exc, pos, pos + 1, &pos); 5470 if (!rep) 5471 goto error; 5472 5473 if (PyBytes_Check(rep)) { 5474 repsize = PyBytes_GET_SIZE(rep); 5475 if (repsize & 1) { 5476 raise_encode_exception(&exc, encoding, 
5477 str, pos - 1, pos, 5478 "surrogates not allowed"); 5479 goto error; 5480 } 5481 moreunits = repsize / 2; 5482 } 5483 else { 5484 assert(PyUnicode_Check(rep)); 5485 if (PyUnicode_READY(rep) < 0) 5486 goto error; 5487 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5488 if (!PyUnicode_IS_ASCII(rep)) { 5489 raise_encode_exception(&exc, encoding, 5490 str, pos - 1, pos, 5491 "surrogates not allowed"); 5492 goto error; 5493 } 5494 } 5495 5496 /* two bytes are reserved for each surrogate */ 5497 if (moreunits > 1) { 5498 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); 5499 Py_ssize_t morebytes = 2 * (moreunits - 1); 5500 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5501 /* integer overflow */ 5502 PyErr_NoMemory(); 5503 goto error; 5504 } 5505 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5506 goto error; 5507 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; 5508 } 5509 5510 if (PyBytes_Check(rep)) { 5511 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); 5512 out += moreunits; 5513 } else /* rep is unicode */ { 5514 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5515 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5516 &out, native_ordering); 5517 } 5518 5519 Py_CLEAR(rep); 5520 } 5521 5522 /* Cut back to size actually needed. This is necessary for, for example, 5523 encoding of a string containing isolated surrogates and the 'ignore' handler 5524 is used. 
*/ 5525 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5526 if (nsize != PyBytes_GET_SIZE(v)) 5527 _PyBytes_Resize(&v, nsize); 5528 Py_XDECREF(errorHandler); 5529 Py_XDECREF(exc); 5530 done: 5531 return v; 5532 error: 5533 Py_XDECREF(rep); 5534 Py_XDECREF(errorHandler); 5535 Py_XDECREF(exc); 5536 Py_XDECREF(v); 5537 return NULL; 5538#undef STORECHAR 5539} 5540 5541PyObject * 5542PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5543 Py_ssize_t size, 5544 const char *errors, 5545 int byteorder) 5546{ 5547 PyObject *result; 5548 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5549 if (tmp == NULL) 5550 return NULL; 5551 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5552 Py_DECREF(tmp); 5553 return result; 5554} 5555 5556PyObject * 5557PyUnicode_AsUTF16String(PyObject *unicode) 5558{ 5559 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5560} 5561 5562/* --- Unicode Escape Codec ----------------------------------------------- */ 5563 5564/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5565 if all the escapes in the string make it still a valid ASCII string. 5566 Returns -1 if any escapes were found which cause the string to 5567 pop out of ASCII range. Otherwise returns the length of the 5568 required buffer to hold the string. 5569 */ 5570static Py_ssize_t 5571length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5572{ 5573 const unsigned char *p = (const unsigned char *)s; 5574 const unsigned char *end = p + size; 5575 Py_ssize_t length = 0; 5576 5577 if (size < 0) 5578 return -1; 5579 5580 for (; p < end; ++p) { 5581 if (*p > 127) { 5582 /* Non-ASCII */ 5583 return -1; 5584 } 5585 else if (*p != '\\') { 5586 /* Normal character */ 5587 ++length; 5588 } 5589 else { 5590 /* Backslash-escape, check next char */ 5591 ++p; 5592 /* Escape sequence reaches till end of string or 5593 non-ASCII follow-up. 
*/ 5594 if (p >= end || *p > 127) 5595 return -1; 5596 switch (*p) { 5597 case '\n': 5598 /* backslash + \n result in zero characters */ 5599 break; 5600 case '\\': case '\'': case '\"': 5601 case 'b': case 'f': case 't': 5602 case 'n': case 'r': case 'v': case 'a': 5603 ++length; 5604 break; 5605 case '0': case '1': case '2': case '3': 5606 case '4': case '5': case '6': case '7': 5607 case 'x': case 'u': case 'U': case 'N': 5608 /* these do not guarantee ASCII characters */ 5609 return -1; 5610 default: 5611 /* count the backslash + the other character */ 5612 length += 2; 5613 } 5614 } 5615 } 5616 return length; 5617} 5618 5619static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5620 5621PyObject * 5622PyUnicode_DecodeUnicodeEscape(const char *s, 5623 Py_ssize_t size, 5624 const char *errors) 5625{ 5626 const char *starts = s; 5627 Py_ssize_t startinpos; 5628 Py_ssize_t endinpos; 5629 _PyUnicodeWriter writer; 5630 const char *end; 5631 char* message; 5632 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5633 PyObject *errorHandler = NULL; 5634 PyObject *exc = NULL; 5635 Py_ssize_t len; 5636 5637 len = length_of_escaped_ascii_string(s, size); 5638 if (len == 0) 5639 _Py_RETURN_UNICODE_EMPTY(); 5640 5641 /* After length_of_escaped_ascii_string() there are two alternatives, 5642 either the string is pure ASCII with named escapes like \n, etc. 5643 and we determined it's exact size (common case) 5644 or it contains \x, \u, ... escape sequences. then we create a 5645 legacy wchar string and resize it at the end of this function. */ 5646 _PyUnicodeWriter_Init(&writer); 5647 if (len > 0) { 5648 writer.min_length = len; 5649 } 5650 else { 5651 /* Escaped strings will always be longer than the resulting 5652 Unicode string, so we start with size here and then reduce the 5653 length after conversion to the true value. 
5654 (but if the error callback returns a long replacement string 5655 we'll have to allocate more space) */ 5656 writer.min_length = size; 5657 } 5658 5659 if (size == 0) 5660 return _PyUnicodeWriter_Finish(&writer); 5661 end = s + size; 5662 5663 while (s < end) { 5664 unsigned char c; 5665 Py_UCS4 x; 5666 int digits; 5667 5668 /* Non-escape characters are interpreted as Unicode ordinals */ 5669 if (*s != '\\') { 5670 x = (unsigned char)*s; 5671 s++; 5672 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 5673 goto onError; 5674 continue; 5675 } 5676 5677 startinpos = s-starts; 5678 /* \ - Escapes */ 5679 s++; 5680 c = *s++; 5681 if (s > end) 5682 c = '\0'; /* Invalid after \ */ 5683 5684 switch (c) { 5685 5686 /* \x escapes */ 5687#define WRITECHAR(ch) \ 5688 do { \ 5689 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \ 5690 goto onError; \ 5691 } while(0) 5692 5693 case '\n': break; 5694 case '\\': WRITECHAR('\\'); break; 5695 case '\'': WRITECHAR('\''); break; 5696 case '\"': WRITECHAR('\"'); break; 5697 case 'b': WRITECHAR('\b'); break; 5698 /* FF */ 5699 case 'f': WRITECHAR('\014'); break; 5700 case 't': WRITECHAR('\t'); break; 5701 case 'n': WRITECHAR('\n'); break; 5702 case 'r': WRITECHAR('\r'); break; 5703 /* VT */ 5704 case 'v': WRITECHAR('\013'); break; 5705 /* BEL, not classic C */ 5706 case 'a': WRITECHAR('\007'); break; 5707 5708 /* \OOO (octal) escapes */ 5709 case '0': case '1': case '2': case '3': 5710 case '4': case '5': case '6': case '7': 5711 x = s[-1] - '0'; 5712 if (s < end && '0' <= *s && *s <= '7') { 5713 x = (x<<3) + *s++ - '0'; 5714 if (s < end && '0' <= *s && *s <= '7') 5715 x = (x<<3) + *s++ - '0'; 5716 } 5717 WRITECHAR(x); 5718 break; 5719 5720 /* hex escapes */ 5721 /* \xXX */ 5722 case 'x': 5723 digits = 2; 5724 message = "truncated \\xXX escape"; 5725 goto hexescape; 5726 5727 /* \uXXXX */ 5728 case 'u': 5729 digits = 4; 5730 message = "truncated \\uXXXX escape"; 5731 goto hexescape; 5732 5733 /* \UXXXXXXXX */ 5734 case 
'U': 5735 digits = 8; 5736 message = "truncated \\UXXXXXXXX escape"; 5737 hexescape: 5738 chr = 0; 5739 if (end - s < digits) { 5740 /* count only hex digits */ 5741 for (; s < end; ++s) { 5742 c = (unsigned char)*s; 5743 if (!Py_ISXDIGIT(c)) 5744 goto error; 5745 } 5746 goto error; 5747 } 5748 for (; digits--; ++s) { 5749 c = (unsigned char)*s; 5750 if (!Py_ISXDIGIT(c)) 5751 goto error; 5752 chr = (chr<<4) & ~0xF; 5753 if (c >= '0' && c <= '9') 5754 chr += c - '0'; 5755 else if (c >= 'a' && c <= 'f') 5756 chr += 10 + c - 'a'; 5757 else 5758 chr += 10 + c - 'A'; 5759 } 5760 if (chr == 0xffffffff && PyErr_Occurred()) 5761 /* _decoding_error will have already written into the 5762 target buffer. */ 5763 break; 5764 store: 5765 /* when we get here, chr is a 32-bit unicode character */ 5766 message = "illegal Unicode character"; 5767 if (chr > MAX_UNICODE) 5768 goto error; 5769 WRITECHAR(chr); 5770 break; 5771 5772 /* \N{name} */ 5773 case 'N': 5774 message = "malformed \\N character escape"; 5775 if (ucnhash_CAPI == NULL) { 5776 /* load the unicode data module */ 5777 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5778 PyUnicodeData_CAPSULE_NAME, 1); 5779 if (ucnhash_CAPI == NULL) 5780 goto ucnhashError; 5781 } 5782 if (*s == '{') { 5783 const char *start = s+1; 5784 /* look for the closing brace */ 5785 while (*s != '}' && s < end) 5786 s++; 5787 if (s > start && s < end && *s == '}') { 5788 /* found a name. 
look it up in the unicode database */ 5789 message = "unknown Unicode character name"; 5790 s++; 5791 if (s - start - 1 <= INT_MAX && 5792 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5793 &chr, 0)) 5794 goto store; 5795 } 5796 } 5797 goto error; 5798 5799 default: 5800 if (s > end) { 5801 message = "\\ at end of string"; 5802 s--; 5803 goto error; 5804 } 5805 else { 5806 WRITECHAR('\\'); 5807 WRITECHAR((unsigned char)s[-1]); 5808 } 5809 break; 5810 } 5811 continue; 5812 5813 error: 5814 endinpos = s-starts; 5815 if (unicode_decode_call_errorhandler_writer( 5816 errors, &errorHandler, 5817 "unicodeescape", message, 5818 &starts, &end, &startinpos, &endinpos, &exc, &s, 5819 &writer)) 5820 goto onError; 5821 continue; 5822 } 5823#undef WRITECHAR 5824 5825 Py_XDECREF(errorHandler); 5826 Py_XDECREF(exc); 5827 return _PyUnicodeWriter_Finish(&writer); 5828 5829 ucnhashError: 5830 PyErr_SetString( 5831 PyExc_UnicodeError, 5832 "\\N escapes not supported (can't load unicodedata module)" 5833 ); 5834 _PyUnicodeWriter_Dealloc(&writer); 5835 Py_XDECREF(errorHandler); 5836 Py_XDECREF(exc); 5837 return NULL; 5838 5839 onError: 5840 _PyUnicodeWriter_Dealloc(&writer); 5841 Py_XDECREF(errorHandler); 5842 Py_XDECREF(exc); 5843 return NULL; 5844} 5845 5846/* Return a Unicode-Escape string version of the Unicode object. 5847 5848 If quotes is true, the string is enclosed in u"" or u'' quotes as 5849 appropriate. 5850 5851*/ 5852 5853PyObject * 5854PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5855{ 5856 Py_ssize_t i, len; 5857 PyObject *repr; 5858 char *p; 5859 int kind; 5860 void *data; 5861 Py_ssize_t expandsize = 0; 5862 5863 /* Initial allocation is based on the longest-possible character 5864 escape. 5865 5866 For UCS1 strings it's '\xxx', 4 bytes per source character. 5867 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 5868 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 
5869 */ 5870 5871 if (!PyUnicode_Check(unicode)) { 5872 PyErr_BadArgument(); 5873 return NULL; 5874 } 5875 if (PyUnicode_READY(unicode) == -1) 5876 return NULL; 5877 len = PyUnicode_GET_LENGTH(unicode); 5878 kind = PyUnicode_KIND(unicode); 5879 data = PyUnicode_DATA(unicode); 5880 switch (kind) { 5881 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5882 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5883 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5884 } 5885 5886 if (len == 0) 5887 return PyBytes_FromStringAndSize(NULL, 0); 5888 5889 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5890 return PyErr_NoMemory(); 5891 5892 repr = PyBytes_FromStringAndSize(NULL, 5893 2 5894 + expandsize*len 5895 + 1); 5896 if (repr == NULL) 5897 return NULL; 5898 5899 p = PyBytes_AS_STRING(repr); 5900 5901 for (i = 0; i < len; i++) { 5902 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5903 5904 /* Escape backslashes */ 5905 if (ch == '\\') { 5906 *p++ = '\\'; 5907 *p++ = (char) ch; 5908 continue; 5909 } 5910 5911 /* Map 21-bit characters to '\U00xxxxxx' */ 5912 else if (ch >= 0x10000) { 5913 assert(ch <= MAX_UNICODE); 5914 *p++ = '\\'; 5915 *p++ = 'U'; 5916 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5917 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5918 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5919 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5920 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5921 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5922 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5923 *p++ = Py_hexdigits[ch & 0x0000000F]; 5924 continue; 5925 } 5926 5927 /* Map 16-bit characters to '\uxxxx' */ 5928 if (ch >= 256) { 5929 *p++ = '\\'; 5930 *p++ = 'u'; 5931 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5932 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5933 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5934 *p++ = Py_hexdigits[ch & 0x000F]; 5935 } 5936 5937 /* Map special whitespace to '\t', \n', '\r' */ 5938 else if (ch == '\t') { 5939 *p++ = '\\'; 5940 *p++ = 't'; 5941 
} 5942 else if (ch == '\n') { 5943 *p++ = '\\'; 5944 *p++ = 'n'; 5945 } 5946 else if (ch == '\r') { 5947 *p++ = '\\'; 5948 *p++ = 'r'; 5949 } 5950 5951 /* Map non-printable US ASCII to '\xhh' */ 5952 else if (ch < ' ' || ch >= 0x7F) { 5953 *p++ = '\\'; 5954 *p++ = 'x'; 5955 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5956 *p++ = Py_hexdigits[ch & 0x000F]; 5957 } 5958 5959 /* Copy everything else as-is */ 5960 else 5961 *p++ = (char) ch; 5962 } 5963 5964 assert(p - PyBytes_AS_STRING(repr) > 0); 5965 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5966 return NULL; 5967 return repr; 5968} 5969 5970PyObject * 5971PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5972 Py_ssize_t size) 5973{ 5974 PyObject *result; 5975 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5976 if (tmp == NULL) 5977 return NULL; 5978 result = PyUnicode_AsUnicodeEscapeString(tmp); 5979 Py_DECREF(tmp); 5980 return result; 5981} 5982 5983/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5984 5985PyObject * 5986PyUnicode_DecodeRawUnicodeEscape(const char *s, 5987 Py_ssize_t size, 5988 const char *errors) 5989{ 5990 const char *starts = s; 5991 Py_ssize_t startinpos; 5992 Py_ssize_t endinpos; 5993 _PyUnicodeWriter writer; 5994 const char *end; 5995 const char *bs; 5996 PyObject *errorHandler = NULL; 5997 PyObject *exc = NULL; 5998 5999 if (size == 0) 6000 _Py_RETURN_UNICODE_EMPTY(); 6001 6002 /* Escaped strings will always be longer than the resulting 6003 Unicode string, so we start with size here and then reduce the 6004 length after conversion to the true value. 
(But decoding error 6005 handler might have to resize the string) */ 6006 _PyUnicodeWriter_Init(&writer); 6007 writer.min_length = size; 6008 6009 end = s + size; 6010 while (s < end) { 6011 unsigned char c; 6012 Py_UCS4 x; 6013 int i; 6014 int count; 6015 6016 /* Non-escape characters are interpreted as Unicode ordinals */ 6017 if (*s != '\\') { 6018 x = (unsigned char)*s++; 6019 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6020 goto onError; 6021 continue; 6022 } 6023 startinpos = s-starts; 6024 6025 /* \u-escapes are only interpreted iff the number of leading 6026 backslashes if odd */ 6027 bs = s; 6028 for (;s < end;) { 6029 if (*s != '\\') 6030 break; 6031 x = (unsigned char)*s++; 6032 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6033 goto onError; 6034 } 6035 if (((s - bs) & 1) == 0 || 6036 s >= end || 6037 (*s != 'u' && *s != 'U')) { 6038 continue; 6039 } 6040 writer.pos--; 6041 count = *s=='u' ? 4 : 8; 6042 s++; 6043 6044 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6045 for (x = 0, i = 0; i < count; ++i, ++s) { 6046 c = (unsigned char)*s; 6047 if (!Py_ISXDIGIT(c)) { 6048 endinpos = s-starts; 6049 if (unicode_decode_call_errorhandler_writer( 6050 errors, &errorHandler, 6051 "rawunicodeescape", "truncated \\uXXXX", 6052 &starts, &end, &startinpos, &endinpos, &exc, &s, 6053 &writer)) 6054 goto onError; 6055 goto nextByte; 6056 } 6057 x = (x<<4) & ~0xF; 6058 if (c >= '0' && c <= '9') 6059 x += c - '0'; 6060 else if (c >= 'a' && c <= 'f') 6061 x += 10 + c - 'a'; 6062 else 6063 x += 10 + c - 'A'; 6064 } 6065 if (x <= MAX_UNICODE) { 6066 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6067 goto onError; 6068 } 6069 else { 6070 endinpos = s-starts; 6071 if (unicode_decode_call_errorhandler_writer( 6072 errors, &errorHandler, 6073 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6074 &starts, &end, &startinpos, &endinpos, &exc, &s, 6075 &writer)) 6076 goto onError; 6077 } 6078 nextByte: 6079 ; 6080 } 6081 Py_XDECREF(errorHandler); 6082 
Py_XDECREF(exc); 6083 return _PyUnicodeWriter_Finish(&writer); 6084 6085 onError: 6086 _PyUnicodeWriter_Dealloc(&writer); 6087 Py_XDECREF(errorHandler); 6088 Py_XDECREF(exc); 6089 return NULL; 6090} 6091 6092 6093PyObject * 6094PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6095{ 6096 PyObject *repr; 6097 char *p; 6098 char *q; 6099 Py_ssize_t expandsize, pos; 6100 int kind; 6101 void *data; 6102 Py_ssize_t len; 6103 6104 if (!PyUnicode_Check(unicode)) { 6105 PyErr_BadArgument(); 6106 return NULL; 6107 } 6108 if (PyUnicode_READY(unicode) == -1) 6109 return NULL; 6110 kind = PyUnicode_KIND(unicode); 6111 data = PyUnicode_DATA(unicode); 6112 len = PyUnicode_GET_LENGTH(unicode); 6113 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6114 bytes, and 1 byte characters 4. */ 6115 expandsize = kind * 2 + 2; 6116 6117 if (len > PY_SSIZE_T_MAX / expandsize) 6118 return PyErr_NoMemory(); 6119 6120 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6121 if (repr == NULL) 6122 return NULL; 6123 if (len == 0) 6124 return repr; 6125 6126 p = q = PyBytes_AS_STRING(repr); 6127 for (pos = 0; pos < len; pos++) { 6128 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6129 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6130 if (ch >= 0x10000) { 6131 assert(ch <= MAX_UNICODE); 6132 *p++ = '\\'; 6133 *p++ = 'U'; 6134 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6135 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6136 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6137 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6138 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6139 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6140 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6141 *p++ = Py_hexdigits[ch & 15]; 6142 } 6143 /* Map 16-bit characters to '\uxxxx' */ 6144 else if (ch >= 256) { 6145 *p++ = '\\'; 6146 *p++ = 'u'; 6147 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6148 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6149 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6150 *p++ = Py_hexdigits[ch & 15]; 6151 } 6152 /* Copy everything else 
as-is */ 6153 else 6154 *p++ = (char) ch; 6155 } 6156 6157 assert(p > q); 6158 if (_PyBytes_Resize(&repr, p - q) < 0) 6159 return NULL; 6160 return repr; 6161} 6162 6163PyObject * 6164PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6165 Py_ssize_t size) 6166{ 6167 PyObject *result; 6168 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6169 if (tmp == NULL) 6170 return NULL; 6171 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6172 Py_DECREF(tmp); 6173 return result; 6174} 6175 6176/* --- Unicode Internal Codec ------------------------------------------- */ 6177 6178PyObject * 6179_PyUnicode_DecodeUnicodeInternal(const char *s, 6180 Py_ssize_t size, 6181 const char *errors) 6182{ 6183 const char *starts = s; 6184 Py_ssize_t startinpos; 6185 Py_ssize_t endinpos; 6186 _PyUnicodeWriter writer; 6187 const char *end; 6188 const char *reason; 6189 PyObject *errorHandler = NULL; 6190 PyObject *exc = NULL; 6191 6192 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6193 "unicode_internal codec has been deprecated", 6194 1)) 6195 return NULL; 6196 6197 if (size == 0) 6198 _Py_RETURN_UNICODE_EMPTY(); 6199 6200 _PyUnicodeWriter_Init(&writer); 6201 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { 6202 PyErr_NoMemory(); 6203 goto onError; 6204 } 6205 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; 6206 6207 end = s + size; 6208 while (s < end) { 6209 Py_UNICODE uch; 6210 Py_UCS4 ch; 6211 if (end - s < Py_UNICODE_SIZE) { 6212 endinpos = end-starts; 6213 reason = "truncated input"; 6214 goto error; 6215 } 6216 /* We copy the raw representation one byte at a time because the 6217 pointer may be unaligned (see test_codeccallbacks). */ 6218 ((char *) &uch)[0] = s[0]; 6219 ((char *) &uch)[1] = s[1]; 6220#ifdef Py_UNICODE_WIDE 6221 ((char *) &uch)[2] = s[2]; 6222 ((char *) &uch)[3] = s[3]; 6223#endif 6224 ch = uch; 6225#ifdef Py_UNICODE_WIDE 6226 /* We have to sanity check the raw data, otherwise doom looms for 6227 some malformed UCS-4 data. 
*/ 6228 if (ch > 0x10ffff) { 6229 endinpos = s - starts + Py_UNICODE_SIZE; 6230 reason = "illegal code point (> 0x10FFFF)"; 6231 goto error; 6232 } 6233#endif 6234 s += Py_UNICODE_SIZE; 6235#ifndef Py_UNICODE_WIDE 6236 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE) 6237 { 6238 Py_UNICODE uch2; 6239 ((char *) &uch2)[0] = s[0]; 6240 ((char *) &uch2)[1] = s[1]; 6241 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6242 { 6243 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6244 s += Py_UNICODE_SIZE; 6245 } 6246 } 6247#endif 6248 6249 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 6250 goto onError; 6251 continue; 6252 6253 error: 6254 startinpos = s - starts; 6255 if (unicode_decode_call_errorhandler_writer( 6256 errors, &errorHandler, 6257 "unicode_internal", reason, 6258 &starts, &end, &startinpos, &endinpos, &exc, &s, 6259 &writer)) 6260 goto onError; 6261 } 6262 6263 Py_XDECREF(errorHandler); 6264 Py_XDECREF(exc); 6265 return _PyUnicodeWriter_Finish(&writer); 6266 6267 onError: 6268 _PyUnicodeWriter_Dealloc(&writer); 6269 Py_XDECREF(errorHandler); 6270 Py_XDECREF(exc); 6271 return NULL; 6272} 6273 6274/* --- Latin-1 Codec ------------------------------------------------------ */ 6275 6276PyObject * 6277PyUnicode_DecodeLatin1(const char *s, 6278 Py_ssize_t size, 6279 const char *errors) 6280{ 6281 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6282 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6283} 6284 6285/* create or adjust a UnicodeEncodeError */ 6286static void 6287make_encode_exception(PyObject **exceptionObject, 6288 const char *encoding, 6289 PyObject *unicode, 6290 Py_ssize_t startpos, Py_ssize_t endpos, 6291 const char *reason) 6292{ 6293 if (*exceptionObject == NULL) { 6294 *exceptionObject = PyObject_CallFunction( 6295 PyExc_UnicodeEncodeError, "sOnns", 6296 encoding, unicode, startpos, endpos, reason); 6297 } 6298 else { 6299 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6300 goto onError; 6301 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6302 goto onError; 6303 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6304 goto onError; 6305 return; 6306 onError: 6307 Py_CLEAR(*exceptionObject); 6308 } 6309} 6310 6311/* raises a UnicodeEncodeError */ 6312static void 6313raise_encode_exception(PyObject **exceptionObject, 6314 const char *encoding, 6315 PyObject *unicode, 6316 Py_ssize_t startpos, Py_ssize_t endpos, 6317 const char *reason) 6318{ 6319 make_encode_exception(exceptionObject, 6320 encoding, unicode, startpos, endpos, reason); 6321 if (*exceptionObject != NULL) 6322 PyCodec_StrictErrors(*exceptionObject); 6323} 6324 6325/* error handling callback helper: 6326 build arguments, call the callback and check the arguments, 6327 put the result into newpos and return the replacement string, which 6328 has to be freed by the caller */ 6329static PyObject * 6330unicode_encode_call_errorhandler(const char *errors, 6331 PyObject **errorHandler, 6332 const char *encoding, const char *reason, 6333 PyObject *unicode, PyObject **exceptionObject, 6334 Py_ssize_t startpos, Py_ssize_t endpos, 6335 Py_ssize_t *newpos) 6336{ 6337 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6338 Py_ssize_t len; 6339 PyObject *restuple; 6340 PyObject *resunicode; 6341 6342 if (*errorHandler == NULL) { 6343 *errorHandler = 
PyCodec_LookupError(errors); 6344 if (*errorHandler == NULL) 6345 return NULL; 6346 } 6347 6348 if (PyUnicode_READY(unicode) == -1) 6349 return NULL; 6350 len = PyUnicode_GET_LENGTH(unicode); 6351 6352 make_encode_exception(exceptionObject, 6353 encoding, unicode, startpos, endpos, reason); 6354 if (*exceptionObject == NULL) 6355 return NULL; 6356 6357 restuple = PyObject_CallFunctionObjArgs( 6358 *errorHandler, *exceptionObject, NULL); 6359 if (restuple == NULL) 6360 return NULL; 6361 if (!PyTuple_Check(restuple)) { 6362 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6363 Py_DECREF(restuple); 6364 return NULL; 6365 } 6366 if (!PyArg_ParseTuple(restuple, argparse, 6367 &resunicode, newpos)) { 6368 Py_DECREF(restuple); 6369 return NULL; 6370 } 6371 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6372 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6373 Py_DECREF(restuple); 6374 return NULL; 6375 } 6376 if (*newpos<0) 6377 *newpos = len + *newpos; 6378 if (*newpos<0 || *newpos>len) { 6379 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6380 Py_DECREF(restuple); 6381 return NULL; 6382 } 6383 Py_INCREF(resunicode); 6384 Py_DECREF(restuple); 6385 return resunicode; 6386} 6387 6388static PyObject * 6389unicode_encode_ucs1(PyObject *unicode, 6390 const char *errors, 6391 unsigned int limit) 6392{ 6393 /* input state */ 6394 Py_ssize_t pos=0, size; 6395 int kind; 6396 void *data; 6397 /* output object */ 6398 PyObject *res; 6399 /* pointer into the output */ 6400 char *str; 6401 /* current output position */ 6402 Py_ssize_t ressize; 6403 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6404 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6405 PyObject *errorHandler = NULL; 6406 PyObject *exc = NULL; 6407 /* the following variable is used for caching string comparisons 6408 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6409 int known_errorHandler = -1; 6410 6411 if (PyUnicode_READY(unicode) == -1) 6412 return NULL; 6413 size = PyUnicode_GET_LENGTH(unicode); 6414 kind = PyUnicode_KIND(unicode); 6415 data = PyUnicode_DATA(unicode); 6416 /* allocate enough for a simple encoding without 6417 replacements, if we need more, we'll resize */ 6418 if (size == 0) 6419 return PyBytes_FromStringAndSize(NULL, 0); 6420 res = PyBytes_FromStringAndSize(NULL, size); 6421 if (res == NULL) 6422 return NULL; 6423 str = PyBytes_AS_STRING(res); 6424 ressize = size; 6425 6426 while (pos < size) { 6427 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6428 6429 /* can we encode this? */ 6430 if (c<limit) { 6431 /* no overflow check, because we know that the space is enough */ 6432 *str++ = (char)c; 6433 ++pos; 6434 } 6435 else { 6436 Py_ssize_t requiredsize; 6437 PyObject *repunicode; 6438 Py_ssize_t repsize, newpos, respos, i; 6439 /* startpos for collecting unencodable chars */ 6440 Py_ssize_t collstart = pos; 6441 Py_ssize_t collend = pos; 6442 /* find all unecodable characters */ 6443 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) 6444 ++collend; 6445 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6446 if (known_errorHandler==-1) { 6447 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6448 known_errorHandler = 1; 6449 else if (!strcmp(errors, "replace")) 6450 known_errorHandler = 2; 6451 else if (!strcmp(errors, "ignore")) 6452 known_errorHandler = 3; 6453 else if (!strcmp(errors, "xmlcharrefreplace")) 6454 known_errorHandler = 4; 6455 else 6456 known_errorHandler = 0; 6457 } 6458 switch (known_errorHandler) { 6459 case 1: /* strict */ 6460 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6461 goto onError; 6462 case 2: /* replace */ 6463 while (collstart++ < collend) 6464 *str++ = '?'; /* fall through */ 6465 case 3: /* ignore */ 6466 pos = collend; 6467 break; 6468 case 4: /* xmlcharrefreplace */ 6469 respos = str - PyBytes_AS_STRING(res); 6470 requiredsize = respos; 6471 /* determine replacement size */ 6472 for (i = collstart; i < collend; ++i) { 6473 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6474 Py_ssize_t incr; 6475 if (ch < 10) 6476 incr = 2+1+1; 6477 else if (ch < 100) 6478 incr = 2+2+1; 6479 else if (ch < 1000) 6480 incr = 2+3+1; 6481 else if (ch < 10000) 6482 incr = 2+4+1; 6483 else if (ch < 100000) 6484 incr = 2+5+1; 6485 else if (ch < 1000000) 6486 incr = 2+6+1; 6487 else { 6488 assert(ch <= MAX_UNICODE); 6489 incr = 2+7+1; 6490 } 6491 if (requiredsize > PY_SSIZE_T_MAX - incr) 6492 goto overflow; 6493 requiredsize += incr; 6494 } 6495 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6496 goto overflow; 6497 requiredsize += size - collend; 6498 if (requiredsize > ressize) { 6499 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6500 requiredsize = 2*ressize; 6501 if (_PyBytes_Resize(&res, requiredsize)) 6502 goto onError; 6503 str = PyBytes_AS_STRING(res) + respos; 6504 ressize = requiredsize; 6505 } 6506 /* generate replacement */ 6507 for (i = collstart; i < collend; ++i) { 6508 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6509 } 6510 pos = collend; 6511 
break; 6512 default: 6513 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6514 encoding, reason, unicode, &exc, 6515 collstart, collend, &newpos); 6516 if (repunicode == NULL || (PyUnicode_Check(repunicode) && 6517 PyUnicode_READY(repunicode) == -1)) 6518 goto onError; 6519 if (PyBytes_Check(repunicode)) { 6520 /* Directly copy bytes result to output. */ 6521 repsize = PyBytes_Size(repunicode); 6522 if (repsize > 1) { 6523 /* Make room for all additional bytes. */ 6524 respos = str - PyBytes_AS_STRING(res); 6525 if (ressize > PY_SSIZE_T_MAX - repsize - 1) { 6526 Py_DECREF(repunicode); 6527 goto overflow; 6528 } 6529 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6530 Py_DECREF(repunicode); 6531 goto onError; 6532 } 6533 str = PyBytes_AS_STRING(res) + respos; 6534 ressize += repsize-1; 6535 } 6536 memcpy(str, PyBytes_AsString(repunicode), repsize); 6537 str += repsize; 6538 pos = newpos; 6539 Py_DECREF(repunicode); 6540 break; 6541 } 6542 /* need more space? (at least enough for what we 6543 have+the replacement+the rest of the string, so 6544 we won't have to check space for encodable characters) */ 6545 respos = str - PyBytes_AS_STRING(res); 6546 repsize = PyUnicode_GET_LENGTH(repunicode); 6547 requiredsize = respos; 6548 if (requiredsize > PY_SSIZE_T_MAX - repsize) 6549 goto overflow; 6550 requiredsize += repsize; 6551 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6552 goto overflow; 6553 requiredsize += size - collend; 6554 if (requiredsize > ressize) { 6555 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6556 requiredsize = 2*ressize; 6557 if (_PyBytes_Resize(&res, requiredsize)) { 6558 Py_DECREF(repunicode); 6559 goto onError; 6560 } 6561 str = PyBytes_AS_STRING(res) + respos; 6562 ressize = requiredsize; 6563 } 6564 /* check if there is anything unencodable in the replacement 6565 and copy it to the output */ 6566 for (i = 0; repsize-->0; ++i, ++str) { 6567 c = PyUnicode_READ_CHAR(repunicode, i); 6568 if (c >= limit) 
{ 6569 raise_encode_exception(&exc, encoding, unicode, 6570 pos, pos+1, reason); 6571 Py_DECREF(repunicode); 6572 goto onError; 6573 } 6574 *str = (char)c; 6575 } 6576 pos = newpos; 6577 Py_DECREF(repunicode); 6578 } 6579 } 6580 } 6581 /* Resize if we allocated to much */ 6582 size = str - PyBytes_AS_STRING(res); 6583 if (size < ressize) { /* If this falls res will be NULL */ 6584 assert(size >= 0); 6585 if (_PyBytes_Resize(&res, size) < 0) 6586 goto onError; 6587 } 6588 6589 Py_XDECREF(errorHandler); 6590 Py_XDECREF(exc); 6591 return res; 6592 6593 overflow: 6594 PyErr_SetString(PyExc_OverflowError, 6595 "encoded result is too long for a Python string"); 6596 6597 onError: 6598 Py_XDECREF(res); 6599 Py_XDECREF(errorHandler); 6600 Py_XDECREF(exc); 6601 return NULL; 6602} 6603 6604/* Deprecated */ 6605PyObject * 6606PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6607 Py_ssize_t size, 6608 const char *errors) 6609{ 6610 PyObject *result; 6611 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6612 if (unicode == NULL) 6613 return NULL; 6614 result = unicode_encode_ucs1(unicode, errors, 256); 6615 Py_DECREF(unicode); 6616 return result; 6617} 6618 6619PyObject * 6620_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6621{ 6622 if (!PyUnicode_Check(unicode)) { 6623 PyErr_BadArgument(); 6624 return NULL; 6625 } 6626 if (PyUnicode_READY(unicode) == -1) 6627 return NULL; 6628 /* Fast path: if it is a one-byte string, construct 6629 bytes object directly. */ 6630 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6631 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6632 PyUnicode_GET_LENGTH(unicode)); 6633 /* Non-Latin-1 characters present. Defer to above function to 6634 raise the exception. 
*/ 6635 return unicode_encode_ucs1(unicode, errors, 256); 6636} 6637 6638PyObject* 6639PyUnicode_AsLatin1String(PyObject *unicode) 6640{ 6641 return _PyUnicode_AsLatin1String(unicode, NULL); 6642} 6643 6644/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6645 6646PyObject * 6647PyUnicode_DecodeASCII(const char *s, 6648 Py_ssize_t size, 6649 const char *errors) 6650{ 6651 const char *starts = s; 6652 _PyUnicodeWriter writer; 6653 int kind; 6654 void *data; 6655 Py_ssize_t startinpos; 6656 Py_ssize_t endinpos; 6657 Py_ssize_t outpos; 6658 const char *e; 6659 PyObject *errorHandler = NULL; 6660 PyObject *exc = NULL; 6661 6662 if (size == 0) 6663 _Py_RETURN_UNICODE_EMPTY(); 6664 6665 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6666 if (size == 1 && (unsigned char)s[0] < 128) 6667 return get_latin1_char((unsigned char)s[0]); 6668 6669 _PyUnicodeWriter_Init(&writer); 6670 writer.min_length = size; 6671 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) 6672 return NULL; 6673 6674 e = s + size; 6675 data = writer.data; 6676 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6677 writer.pos = outpos; 6678 if (writer.pos == size) 6679 return _PyUnicodeWriter_Finish(&writer); 6680 6681 s += writer.pos; 6682 kind = writer.kind; 6683 while (s < e) { 6684 unsigned char c = (unsigned char)*s; 6685 if (c < 128) { 6686 PyUnicode_WRITE(kind, data, writer.pos, c); 6687 writer.pos++; 6688 ++s; 6689 } 6690 else { 6691 startinpos = s-starts; 6692 endinpos = startinpos + 1; 6693 if (unicode_decode_call_errorhandler_writer( 6694 errors, &errorHandler, 6695 "ascii", "ordinal not in range(128)", 6696 &starts, &e, &startinpos, &endinpos, &exc, &s, 6697 &writer)) 6698 goto onError; 6699 kind = writer.kind; 6700 data = writer.data; 6701 } 6702 } 6703 Py_XDECREF(errorHandler); 6704 Py_XDECREF(exc); 6705 return _PyUnicodeWriter_Finish(&writer); 6706 6707 onError: 6708 _PyUnicodeWriter_Dealloc(&writer); 6709 Py_XDECREF(errorHandler); 
6710 Py_XDECREF(exc); 6711 return NULL; 6712} 6713 6714/* Deprecated */ 6715PyObject * 6716PyUnicode_EncodeASCII(const Py_UNICODE *p, 6717 Py_ssize_t size, 6718 const char *errors) 6719{ 6720 PyObject *result; 6721 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6722 if (unicode == NULL) 6723 return NULL; 6724 result = unicode_encode_ucs1(unicode, errors, 128); 6725 Py_DECREF(unicode); 6726 return result; 6727} 6728 6729PyObject * 6730_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6731{ 6732 if (!PyUnicode_Check(unicode)) { 6733 PyErr_BadArgument(); 6734 return NULL; 6735 } 6736 if (PyUnicode_READY(unicode) == -1) 6737 return NULL; 6738 /* Fast path: if it is an ASCII-only string, construct bytes object 6739 directly. Else defer to above function to raise the exception. */ 6740 if (PyUnicode_IS_ASCII(unicode)) 6741 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6742 PyUnicode_GET_LENGTH(unicode)); 6743 return unicode_encode_ucs1(unicode, errors, 128); 6744} 6745 6746PyObject * 6747PyUnicode_AsASCIIString(PyObject *unicode) 6748{ 6749 return _PyUnicode_AsASCIIString(unicode, NULL); 6750} 6751 6752#ifdef HAVE_MBCS 6753 6754/* --- MBCS codecs for Windows -------------------------------------------- */ 6755 6756#if SIZEOF_INT < SIZEOF_SIZE_T 6757#define NEED_RETRY 6758#endif 6759 6760#ifndef WC_ERR_INVALID_CHARS 6761# define WC_ERR_INVALID_CHARS 0x0080 6762#endif 6763 6764static char* 6765code_page_name(UINT code_page, PyObject **obj) 6766{ 6767 *obj = NULL; 6768 if (code_page == CP_ACP) 6769 return "mbcs"; 6770 if (code_page == CP_UTF7) 6771 return "CP_UTF7"; 6772 if (code_page == CP_UTF8) 6773 return "CP_UTF8"; 6774 6775 *obj = PyBytes_FromFormat("cp%u", code_page); 6776 if (*obj == NULL) 6777 return NULL; 6778 return PyBytes_AS_STRING(*obj); 6779} 6780 6781static DWORD 6782decode_code_page_flags(UINT code_page) 6783{ 6784 if (code_page == CP_UTF7) { 6785 /* The CP_UTF7 decoder only supports flags=0 */ 6786 return 0; 6787 } 6788 else 
6789 return MB_ERR_INVALID_CHARS; 6790} 6791 6792/* 6793 * Decode a byte string from a Windows code page into unicode object in strict 6794 * mode. 6795 * 6796 * Returns consumed size if succeed, returns -2 on decode error, or raise an 6797 * OSError and returns -1 on other error. 6798 */ 6799static int 6800decode_code_page_strict(UINT code_page, 6801 PyObject **v, 6802 const char *in, 6803 int insize) 6804{ 6805 const DWORD flags = decode_code_page_flags(code_page); 6806 wchar_t *out; 6807 DWORD outsize; 6808 6809 /* First get the size of the result */ 6810 assert(insize > 0); 6811 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6812 if (outsize <= 0) 6813 goto error; 6814 6815 if (*v == NULL) { 6816 /* Create unicode object */ 6817 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6818 *v = (PyObject*)_PyUnicode_New(outsize); 6819 if (*v == NULL) 6820 return -1; 6821 out = PyUnicode_AS_UNICODE(*v); 6822 } 6823 else { 6824 /* Extend unicode object */ 6825 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6826 if (unicode_resize(v, n + outsize) < 0) 6827 return -1; 6828 out = PyUnicode_AS_UNICODE(*v) + n; 6829 } 6830 6831 /* Do the conversion */ 6832 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6833 if (outsize <= 0) 6834 goto error; 6835 return insize; 6836 6837error: 6838 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6839 return -2; 6840 PyErr_SetFromWindowsErr(0); 6841 return -1; 6842} 6843 6844/* 6845 * Decode a byte string from a code page into unicode object with an error 6846 * handler. 6847 * 6848 * Returns consumed size if succeed, or raise an OSError or 6849 * UnicodeDecodeError exception and returns -1 on error. 
6850 */ 6851static int 6852decode_code_page_errors(UINT code_page, 6853 PyObject **v, 6854 const char *in, const int size, 6855 const char *errors, int final) 6856{ 6857 const char *startin = in; 6858 const char *endin = in + size; 6859 const DWORD flags = decode_code_page_flags(code_page); 6860 /* Ideally, we should get reason from FormatMessage. This is the Windows 6861 2000 English version of the message. */ 6862 const char *reason = "No mapping for the Unicode character exists " 6863 "in the target code page."; 6864 /* each step cannot decode more than 1 character, but a character can be 6865 represented as a surrogate pair */ 6866 wchar_t buffer[2], *startout, *out; 6867 int insize; 6868 Py_ssize_t outsize; 6869 PyObject *errorHandler = NULL; 6870 PyObject *exc = NULL; 6871 PyObject *encoding_obj = NULL; 6872 char *encoding; 6873 DWORD err; 6874 int ret = -1; 6875 6876 assert(size > 0); 6877 6878 encoding = code_page_name(code_page, &encoding_obj); 6879 if (encoding == NULL) 6880 return -1; 6881 6882 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { 6883 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6884 UnicodeDecodeError. 
*/ 6885 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6886 if (exc != NULL) { 6887 PyCodec_StrictErrors(exc); 6888 Py_CLEAR(exc); 6889 } 6890 goto error; 6891 } 6892 6893 if (*v == NULL) { 6894 /* Create unicode object */ 6895 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6896 PyErr_NoMemory(); 6897 goto error; 6898 } 6899 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6900 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6901 if (*v == NULL) 6902 goto error; 6903 startout = PyUnicode_AS_UNICODE(*v); 6904 } 6905 else { 6906 /* Extend unicode object */ 6907 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6908 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6909 PyErr_NoMemory(); 6910 goto error; 6911 } 6912 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6913 goto error; 6914 startout = PyUnicode_AS_UNICODE(*v) + n; 6915 } 6916 6917 /* Decode the byte string character per character */ 6918 out = startout; 6919 while (in < endin) 6920 { 6921 /* Decode a character */ 6922 insize = 1; 6923 do 6924 { 6925 outsize = MultiByteToWideChar(code_page, flags, 6926 in, insize, 6927 buffer, Py_ARRAY_LENGTH(buffer)); 6928 if (outsize > 0) 6929 break; 6930 err = GetLastError(); 6931 if (err != ERROR_NO_UNICODE_TRANSLATION 6932 && err != ERROR_INSUFFICIENT_BUFFER) 6933 { 6934 PyErr_SetFromWindowsErr(0); 6935 goto error; 6936 } 6937 insize++; 6938 } 6939 /* 4=maximum length of a UTF-8 sequence */ 6940 while (insize <= 4 && (in + insize) <= endin); 6941 6942 if (outsize <= 0) { 6943 Py_ssize_t startinpos, endinpos, outpos; 6944 6945 /* last character in partial decode? 
*/ 6946 if (in + insize >= endin && !final) 6947 break; 6948 6949 startinpos = in - startin; 6950 endinpos = startinpos + 1; 6951 outpos = out - PyUnicode_AS_UNICODE(*v); 6952 if (unicode_decode_call_errorhandler_wchar( 6953 errors, &errorHandler, 6954 encoding, reason, 6955 &startin, &endin, &startinpos, &endinpos, &exc, &in, 6956 v, &outpos)) 6957 { 6958 goto error; 6959 } 6960 out = PyUnicode_AS_UNICODE(*v) + outpos; 6961 } 6962 else { 6963 in += insize; 6964 memcpy(out, buffer, outsize * sizeof(wchar_t)); 6965 out += outsize; 6966 } 6967 } 6968 6969 /* write a NUL character at the end */ 6970 *out = 0; 6971 6972 /* Extend unicode object */ 6973 outsize = out - startout; 6974 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 6975 if (unicode_resize(v, outsize) < 0) 6976 goto error; 6977 /* (in - startin) <= size and size is an int */ 6978 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int); 6979 6980error: 6981 Py_XDECREF(encoding_obj); 6982 Py_XDECREF(errorHandler); 6983 Py_XDECREF(exc); 6984 return ret; 6985} 6986 6987static PyObject * 6988decode_code_page_stateful(int code_page, 6989 const char *s, Py_ssize_t size, 6990 const char *errors, Py_ssize_t *consumed) 6991{ 6992 PyObject *v = NULL; 6993 int chunk_size, final, converted, done; 6994 6995 if (code_page < 0) { 6996 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 6997 return NULL; 6998 } 6999 7000 if (consumed) 7001 *consumed = 0; 7002 7003 do 7004 { 7005#ifdef NEED_RETRY 7006 if (size > INT_MAX) { 7007 chunk_size = INT_MAX; 7008 final = 0; 7009 done = 0; 7010 } 7011 else 7012#endif 7013 { 7014 chunk_size = (int)size; 7015 final = (consumed == NULL); 7016 done = 1; 7017 } 7018 7019 if (chunk_size == 0 && done) { 7020 if (v != NULL) 7021 break; 7022 _Py_RETURN_UNICODE_EMPTY(); 7023 } 7024 7025 converted = decode_code_page_strict(code_page, &v, 7026 s, chunk_size); 7027 if (converted == -2) 7028 converted = decode_code_page_errors(code_page, &v, 7029 s, chunk_size, 7030 errors, final); 7031 
assert(converted != 0 || done); 7032 7033 if (converted < 0) { 7034 Py_XDECREF(v); 7035 return NULL; 7036 } 7037 7038 if (consumed) 7039 *consumed += converted; 7040 7041 s += converted; 7042 size -= converted; 7043 } while (!done); 7044 7045 return unicode_result(v); 7046} 7047 7048PyObject * 7049PyUnicode_DecodeCodePageStateful(int code_page, 7050 const char *s, 7051 Py_ssize_t size, 7052 const char *errors, 7053 Py_ssize_t *consumed) 7054{ 7055 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7056} 7057 7058PyObject * 7059PyUnicode_DecodeMBCSStateful(const char *s, 7060 Py_ssize_t size, 7061 const char *errors, 7062 Py_ssize_t *consumed) 7063{ 7064 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7065} 7066 7067PyObject * 7068PyUnicode_DecodeMBCS(const char *s, 7069 Py_ssize_t size, 7070 const char *errors) 7071{ 7072 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7073} 7074 7075static DWORD 7076encode_code_page_flags(UINT code_page, const char *errors) 7077{ 7078 if (code_page == CP_UTF8) { 7079 return WC_ERR_INVALID_CHARS; 7080 } 7081 else if (code_page == CP_UTF7) { 7082 /* CP_UTF7 only supports flags=0 */ 7083 return 0; 7084 } 7085 else { 7086 if (errors != NULL && strcmp(errors, "replace") == 0) 7087 return 0; 7088 else 7089 return WC_NO_BEST_FIT_CHARS; 7090 } 7091} 7092 7093/* 7094 * Encode a Unicode string to a Windows code page into a byte string in strict 7095 * mode. 7096 * 7097 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7098 * an OSError and returns -1 on other error. 
7099 */ 7100static int 7101encode_code_page_strict(UINT code_page, PyObject **outbytes, 7102 PyObject *unicode, Py_ssize_t offset, int len, 7103 const char* errors) 7104{ 7105 BOOL usedDefaultChar = FALSE; 7106 BOOL *pusedDefaultChar = &usedDefaultChar; 7107 int outsize; 7108 PyObject *exc = NULL; 7109 wchar_t *p; 7110 Py_ssize_t size; 7111 const DWORD flags = encode_code_page_flags(code_page, NULL); 7112 char *out; 7113 /* Create a substring so that we can get the UTF-16 representation 7114 of just the slice under consideration. */ 7115 PyObject *substring; 7116 7117 assert(len > 0); 7118 7119 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7120 pusedDefaultChar = &usedDefaultChar; 7121 else 7122 pusedDefaultChar = NULL; 7123 7124 substring = PyUnicode_Substring(unicode, offset, offset+len); 7125 if (substring == NULL) 7126 return -1; 7127 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7128 if (p == NULL) { 7129 Py_DECREF(substring); 7130 return -1; 7131 } 7132 assert(size <= INT_MAX); 7133 7134 /* First get the size of the result */ 7135 outsize = WideCharToMultiByte(code_page, flags, 7136 p, (int)size, 7137 NULL, 0, 7138 NULL, pusedDefaultChar); 7139 if (outsize <= 0) 7140 goto error; 7141 /* If we used a default char, then we failed! 
*/ 7142 if (pusedDefaultChar && *pusedDefaultChar) { 7143 Py_DECREF(substring); 7144 return -2; 7145 } 7146 7147 if (*outbytes == NULL) { 7148 /* Create string object */ 7149 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7150 if (*outbytes == NULL) { 7151 Py_DECREF(substring); 7152 return -1; 7153 } 7154 out = PyBytes_AS_STRING(*outbytes); 7155 } 7156 else { 7157 /* Extend string object */ 7158 const Py_ssize_t n = PyBytes_Size(*outbytes); 7159 if (outsize > PY_SSIZE_T_MAX - n) { 7160 PyErr_NoMemory(); 7161 Py_DECREF(substring); 7162 return -1; 7163 } 7164 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7165 Py_DECREF(substring); 7166 return -1; 7167 } 7168 out = PyBytes_AS_STRING(*outbytes) + n; 7169 } 7170 7171 /* Do the conversion */ 7172 outsize = WideCharToMultiByte(code_page, flags, 7173 p, (int)size, 7174 out, outsize, 7175 NULL, pusedDefaultChar); 7176 Py_CLEAR(substring); 7177 if (outsize <= 0) 7178 goto error; 7179 if (pusedDefaultChar && *pusedDefaultChar) 7180 return -2; 7181 return 0; 7182 7183error: 7184 Py_XDECREF(substring); 7185 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7186 return -2; 7187 PyErr_SetFromWindowsErr(0); 7188 return -1; 7189} 7190 7191/* 7192 * Encode a Unicode string to a Windows code page into a byte string using a 7193 * error handler. 7194 * 7195 * Returns consumed characters if succeed, or raise an OSError and returns 7196 * -1 on other error. 7197 */ 7198static int 7199encode_code_page_errors(UINT code_page, PyObject **outbytes, 7200 PyObject *unicode, Py_ssize_t unicode_offset, 7201 Py_ssize_t insize, const char* errors) 7202{ 7203 const DWORD flags = encode_code_page_flags(code_page, errors); 7204 Py_ssize_t pos = unicode_offset; 7205 Py_ssize_t endin = unicode_offset + insize; 7206 /* Ideally, we should get reason from FormatMessage. This is the Windows 7207 2000 English version of the message. 
*/ 7208 const char *reason = "invalid character"; 7209 /* 4=maximum length of a UTF-8 sequence */ 7210 char buffer[4]; 7211 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7212 Py_ssize_t outsize; 7213 char *out; 7214 PyObject *errorHandler = NULL; 7215 PyObject *exc = NULL; 7216 PyObject *encoding_obj = NULL; 7217 char *encoding; 7218 Py_ssize_t newpos, newoutsize; 7219 PyObject *rep; 7220 int ret = -1; 7221 7222 assert(insize > 0); 7223 7224 encoding = code_page_name(code_page, &encoding_obj); 7225 if (encoding == NULL) 7226 return -1; 7227 7228 if (errors == NULL || strcmp(errors, "strict") == 0) { 7229 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7230 then we raise a UnicodeEncodeError. */ 7231 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7232 if (exc != NULL) { 7233 PyCodec_StrictErrors(exc); 7234 Py_DECREF(exc); 7235 } 7236 Py_XDECREF(encoding_obj); 7237 return -1; 7238 } 7239 7240 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7241 pusedDefaultChar = &usedDefaultChar; 7242 else 7243 pusedDefaultChar = NULL; 7244 7245 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7246 PyErr_NoMemory(); 7247 goto error; 7248 } 7249 outsize = insize * Py_ARRAY_LENGTH(buffer); 7250 7251 if (*outbytes == NULL) { 7252 /* Create string object */ 7253 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7254 if (*outbytes == NULL) 7255 goto error; 7256 out = PyBytes_AS_STRING(*outbytes); 7257 } 7258 else { 7259 /* Extend string object */ 7260 Py_ssize_t n = PyBytes_Size(*outbytes); 7261 if (n > PY_SSIZE_T_MAX - outsize) { 7262 PyErr_NoMemory(); 7263 goto error; 7264 } 7265 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7266 goto error; 7267 out = PyBytes_AS_STRING(*outbytes) + n; 7268 } 7269 7270 /* Encode the string character per character */ 7271 while (pos < endin) 7272 { 7273 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7274 wchar_t chars[2]; 7275 int charsize; 7276 if (ch < 0x10000) { 7277 chars[0] = (wchar_t)ch; 7278 charsize = 1; 
7279 } 7280 else { 7281 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7282 chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7283 charsize = 2; 7284 } 7285 7286 outsize = WideCharToMultiByte(code_page, flags, 7287 chars, charsize, 7288 buffer, Py_ARRAY_LENGTH(buffer), 7289 NULL, pusedDefaultChar); 7290 if (outsize > 0) { 7291 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7292 { 7293 pos++; 7294 memcpy(out, buffer, outsize); 7295 out += outsize; 7296 continue; 7297 } 7298 } 7299 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7300 PyErr_SetFromWindowsErr(0); 7301 goto error; 7302 } 7303 7304 rep = unicode_encode_call_errorhandler( 7305 errors, &errorHandler, encoding, reason, 7306 unicode, &exc, 7307 pos, pos + 1, &newpos); 7308 if (rep == NULL) 7309 goto error; 7310 pos = newpos; 7311 7312 if (PyBytes_Check(rep)) { 7313 outsize = PyBytes_GET_SIZE(rep); 7314 if (outsize != 1) { 7315 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7316 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7317 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7318 Py_DECREF(rep); 7319 goto error; 7320 } 7321 out = PyBytes_AS_STRING(*outbytes) + offset; 7322 } 7323 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7324 out += outsize; 7325 } 7326 else { 7327 Py_ssize_t i; 7328 enum PyUnicode_Kind kind; 7329 void *data; 7330 7331 if (PyUnicode_READY(rep) == -1) { 7332 Py_DECREF(rep); 7333 goto error; 7334 } 7335 7336 outsize = PyUnicode_GET_LENGTH(rep); 7337 if (outsize != 1) { 7338 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7339 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7340 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7341 Py_DECREF(rep); 7342 goto error; 7343 } 7344 out = PyBytes_AS_STRING(*outbytes) + offset; 7345 } 7346 kind = PyUnicode_KIND(rep); 7347 data = PyUnicode_DATA(rep); 7348 for (i=0; i < outsize; i++) { 7349 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7350 if (ch > 127) { 7351 raise_encode_exception(&exc, 7352 encoding, 
unicode, 7353 pos, pos + 1, 7354 "unable to encode error handler result to ASCII"); 7355 Py_DECREF(rep); 7356 goto error; 7357 } 7358 *out = (unsigned char)ch; 7359 out++; 7360 } 7361 } 7362 Py_DECREF(rep); 7363 } 7364 /* write a NUL byte */ 7365 *out = 0; 7366 outsize = out - PyBytes_AS_STRING(*outbytes); 7367 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7368 if (_PyBytes_Resize(outbytes, outsize) < 0) 7369 goto error; 7370 ret = 0; 7371 7372error: 7373 Py_XDECREF(encoding_obj); 7374 Py_XDECREF(errorHandler); 7375 Py_XDECREF(exc); 7376 return ret; 7377} 7378 7379static PyObject * 7380encode_code_page(int code_page, 7381 PyObject *unicode, 7382 const char *errors) 7383{ 7384 Py_ssize_t len; 7385 PyObject *outbytes = NULL; 7386 Py_ssize_t offset; 7387 int chunk_len, ret, done; 7388 7389 if (!PyUnicode_Check(unicode)) { 7390 PyErr_BadArgument(); 7391 return NULL; 7392 } 7393 7394 if (PyUnicode_READY(unicode) == -1) 7395 return NULL; 7396 len = PyUnicode_GET_LENGTH(unicode); 7397 7398 if (code_page < 0) { 7399 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7400 return NULL; 7401 } 7402 7403 if (len == 0) 7404 return PyBytes_FromStringAndSize(NULL, 0); 7405 7406 offset = 0; 7407 do 7408 { 7409#ifdef NEED_RETRY 7410 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7411 chunks. 
*/ 7412 if (len > INT_MAX/2) { 7413 chunk_len = INT_MAX/2; 7414 done = 0; 7415 } 7416 else 7417#endif 7418 { 7419 chunk_len = (int)len; 7420 done = 1; 7421 } 7422 7423 ret = encode_code_page_strict(code_page, &outbytes, 7424 unicode, offset, chunk_len, 7425 errors); 7426 if (ret == -2) 7427 ret = encode_code_page_errors(code_page, &outbytes, 7428 unicode, offset, 7429 chunk_len, errors); 7430 if (ret < 0) { 7431 Py_XDECREF(outbytes); 7432 return NULL; 7433 } 7434 7435 offset += chunk_len; 7436 len -= chunk_len; 7437 } while (!done); 7438 7439 return outbytes; 7440} 7441 7442PyObject * 7443PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7444 Py_ssize_t size, 7445 const char *errors) 7446{ 7447 PyObject *unicode, *res; 7448 unicode = PyUnicode_FromUnicode(p, size); 7449 if (unicode == NULL) 7450 return NULL; 7451 res = encode_code_page(CP_ACP, unicode, errors); 7452 Py_DECREF(unicode); 7453 return res; 7454} 7455 7456PyObject * 7457PyUnicode_EncodeCodePage(int code_page, 7458 PyObject *unicode, 7459 const char *errors) 7460{ 7461 return encode_code_page(code_page, unicode, errors); 7462} 7463 7464PyObject * 7465PyUnicode_AsMBCSString(PyObject *unicode) 7466{ 7467 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7468} 7469 7470#undef NEED_RETRY 7471 7472#endif /* HAVE_MBCS */ 7473 7474/* --- Character Mapping Codec -------------------------------------------- */ 7475 7476static int 7477charmap_decode_string(const char *s, 7478 Py_ssize_t size, 7479 PyObject *mapping, 7480 const char *errors, 7481 _PyUnicodeWriter *writer) 7482{ 7483 const char *starts = s; 7484 const char *e; 7485 Py_ssize_t startinpos, endinpos; 7486 PyObject *errorHandler = NULL, *exc = NULL; 7487 Py_ssize_t maplen; 7488 enum PyUnicode_Kind mapkind; 7489 void *mapdata; 7490 Py_UCS4 x; 7491 unsigned char ch; 7492 7493 if (PyUnicode_READY(mapping) == -1) 7494 return -1; 7495 7496 maplen = PyUnicode_GET_LENGTH(mapping); 7497 mapdata = PyUnicode_DATA(mapping); 7498 mapkind = PyUnicode_KIND(mapping); 
7499 7500 e = s + size; 7501 7502 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { 7503 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 7504 * is disabled in encoding aliases, latin1 is preferred because 7505 * its implementation is faster. */ 7506 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata; 7507 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7508 Py_UCS4 maxchar = writer->maxchar; 7509 7510 assert (writer->kind == PyUnicode_1BYTE_KIND); 7511 while (s < e) { 7512 ch = *s; 7513 x = mapdata_ucs1[ch]; 7514 if (x > maxchar) { 7515 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) 7516 goto onError; 7517 maxchar = writer->maxchar; 7518 outdata = (Py_UCS1 *)writer->data; 7519 } 7520 outdata[writer->pos] = x; 7521 writer->pos++; 7522 ++s; 7523 } 7524 return 0; 7525 } 7526 7527 while (s < e) { 7528 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7529 enum PyUnicode_Kind outkind = writer->kind; 7530 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata; 7531 if (outkind == PyUnicode_1BYTE_KIND) { 7532 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7533 Py_UCS4 maxchar = writer->maxchar; 7534 while (s < e) { 7535 ch = *s; 7536 x = mapdata_ucs2[ch]; 7537 if (x > maxchar) 7538 goto Error; 7539 outdata[writer->pos] = x; 7540 writer->pos++; 7541 ++s; 7542 } 7543 break; 7544 } 7545 else if (outkind == PyUnicode_2BYTE_KIND) { 7546 Py_UCS2 *outdata = (Py_UCS2 *)writer->data; 7547 while (s < e) { 7548 ch = *s; 7549 x = mapdata_ucs2[ch]; 7550 if (x == 0xFFFE) 7551 goto Error; 7552 outdata[writer->pos] = x; 7553 writer->pos++; 7554 ++s; 7555 } 7556 break; 7557 } 7558 } 7559 ch = *s; 7560 7561 if (ch < maplen) 7562 x = PyUnicode_READ(mapkind, mapdata, ch); 7563 else 7564 x = 0xfffe; /* invalid value */ 7565Error: 7566 if (x == 0xfffe) 7567 { 7568 /* undefined mapping */ 7569 startinpos = s-starts; 7570 endinpos = startinpos+1; 7571 if (unicode_decode_call_errorhandler_writer( 7572 errors, &errorHandler, 7573 "charmap", "character maps to <undefined>", 7574 &starts, 
&e, &startinpos, &endinpos, &exc, &s, 7575 writer)) { 7576 goto onError; 7577 } 7578 continue; 7579 } 7580 7581 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0) 7582 goto onError; 7583 ++s; 7584 } 7585 Py_XDECREF(errorHandler); 7586 Py_XDECREF(exc); 7587 return 0; 7588 7589onError: 7590 Py_XDECREF(errorHandler); 7591 Py_XDECREF(exc); 7592 return -1; 7593} 7594 7595static int 7596charmap_decode_mapping(const char *s, 7597 Py_ssize_t size, 7598 PyObject *mapping, 7599 const char *errors, 7600 _PyUnicodeWriter *writer) 7601{ 7602 const char *starts = s; 7603 const char *e; 7604 Py_ssize_t startinpos, endinpos; 7605 PyObject *errorHandler = NULL, *exc = NULL; 7606 unsigned char ch; 7607 PyObject *key, *item = NULL; 7608 7609 e = s + size; 7610 7611 while (s < e) { 7612 ch = *s; 7613 7614 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7615 key = PyLong_FromLong((long)ch); 7616 if (key == NULL) 7617 goto onError; 7618 7619 item = PyObject_GetItem(mapping, key); 7620 Py_DECREF(key); 7621 if (item == NULL) { 7622 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7623 /* No mapping found means: mapping is undefined. 
*/ 7624 PyErr_Clear(); 7625 goto Undefined; 7626 } else 7627 goto onError; 7628 } 7629 7630 /* Apply mapping */ 7631 if (item == Py_None) 7632 goto Undefined; 7633 if (PyLong_Check(item)) { 7634 long value = PyLong_AS_LONG(item); 7635 if (value == 0xFFFE) 7636 goto Undefined; 7637 if (value < 0 || value > MAX_UNICODE) { 7638 PyErr_Format(PyExc_TypeError, 7639 "character mapping must be in range(0x%lx)", 7640 (unsigned long)MAX_UNICODE + 1); 7641 goto onError; 7642 } 7643 7644 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7645 goto onError; 7646 } 7647 else if (PyUnicode_Check(item)) { 7648 if (PyUnicode_READY(item) == -1) 7649 goto onError; 7650 if (PyUnicode_GET_LENGTH(item) == 1) { 7651 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 7652 if (value == 0xFFFE) 7653 goto Undefined; 7654 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7655 goto onError; 7656 } 7657 else { 7658 writer->overallocate = 1; 7659 if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 7660 goto onError; 7661 } 7662 } 7663 else { 7664 /* wrong return value */ 7665 PyErr_SetString(PyExc_TypeError, 7666 "character mapping must return integer, None or str"); 7667 goto onError; 7668 } 7669 Py_CLEAR(item); 7670 ++s; 7671 continue; 7672 7673Undefined: 7674 /* undefined mapping */ 7675 Py_CLEAR(item); 7676 startinpos = s-starts; 7677 endinpos = startinpos+1; 7678 if (unicode_decode_call_errorhandler_writer( 7679 errors, &errorHandler, 7680 "charmap", "character maps to <undefined>", 7681 &starts, &e, &startinpos, &endinpos, &exc, &s, 7682 writer)) { 7683 goto onError; 7684 } 7685 } 7686 Py_XDECREF(errorHandler); 7687 Py_XDECREF(exc); 7688 return 0; 7689 7690onError: 7691 Py_XDECREF(item); 7692 Py_XDECREF(errorHandler); 7693 Py_XDECREF(exc); 7694 return -1; 7695} 7696 7697PyObject * 7698PyUnicode_DecodeCharmap(const char *s, 7699 Py_ssize_t size, 7700 PyObject *mapping, 7701 const char *errors) 7702{ 7703 _PyUnicodeWriter writer; 7704 7705 /* Default to Latin-1 */ 7706 if (mapping == 
NULL) 7707 return PyUnicode_DecodeLatin1(s, size, errors); 7708 7709 if (size == 0) 7710 _Py_RETURN_UNICODE_EMPTY(); 7711 _PyUnicodeWriter_Init(&writer); 7712 writer.min_length = size; 7713 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 7714 goto onError; 7715 7716 if (PyUnicode_CheckExact(mapping)) { 7717 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 7718 goto onError; 7719 } 7720 else { 7721 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 7722 goto onError; 7723 } 7724 return _PyUnicodeWriter_Finish(&writer); 7725 7726 onError: 7727 _PyUnicodeWriter_Dealloc(&writer); 7728 return NULL; 7729} 7730 7731/* Charmap encoding: the lookup table */ 7732 7733struct encoding_map { 7734 PyObject_HEAD 7735 unsigned char level1[32]; 7736 int count2, count3; 7737 unsigned char level23[1]; 7738}; 7739 7740static PyObject* 7741encoding_map_size(PyObject *obj, PyObject* args) 7742{ 7743 struct encoding_map *map = (struct encoding_map*)obj; 7744 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7745 128*map->count3); 7746} 7747 7748static PyMethodDef encoding_map_methods[] = { 7749 {"size", encoding_map_size, METH_NOARGS, 7750 PyDoc_STR("Return the size (in bytes) of this object") }, 7751 { 0 } 7752}; 7753 7754static void 7755encoding_map_dealloc(PyObject* o) 7756{ 7757 PyObject_FREE(o); 7758} 7759 7760static PyTypeObject EncodingMapType = { 7761 PyVarObject_HEAD_INIT(NULL, 0) 7762 "EncodingMap", /*tp_name*/ 7763 sizeof(struct encoding_map), /*tp_basicsize*/ 7764 0, /*tp_itemsize*/ 7765 /* methods */ 7766 encoding_map_dealloc, /*tp_dealloc*/ 7767 0, /*tp_print*/ 7768 0, /*tp_getattr*/ 7769 0, /*tp_setattr*/ 7770 0, /*tp_reserved*/ 7771 0, /*tp_repr*/ 7772 0, /*tp_as_number*/ 7773 0, /*tp_as_sequence*/ 7774 0, /*tp_as_mapping*/ 7775 0, /*tp_hash*/ 7776 0, /*tp_call*/ 7777 0, /*tp_str*/ 7778 0, /*tp_getattro*/ 7779 0, /*tp_setattro*/ 7780 0, /*tp_as_buffer*/ 7781 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7782 0, /*tp_doc*/ 
7783 0, /*tp_traverse*/ 7784 0, /*tp_clear*/ 7785 0, /*tp_richcompare*/ 7786 0, /*tp_weaklistoffset*/ 7787 0, /*tp_iter*/ 7788 0, /*tp_iternext*/ 7789 encoding_map_methods, /*tp_methods*/ 7790 0, /*tp_members*/ 7791 0, /*tp_getset*/ 7792 0, /*tp_base*/ 7793 0, /*tp_dict*/ 7794 0, /*tp_descr_get*/ 7795 0, /*tp_descr_set*/ 7796 0, /*tp_dictoffset*/ 7797 0, /*tp_init*/ 7798 0, /*tp_alloc*/ 7799 0, /*tp_new*/ 7800 0, /*tp_free*/ 7801 0, /*tp_is_gc*/ 7802}; 7803 7804PyObject* 7805PyUnicode_BuildEncodingMap(PyObject* string) 7806{ 7807 PyObject *result; 7808 struct encoding_map *mresult; 7809 int i; 7810 int need_dict = 0; 7811 unsigned char level1[32]; 7812 unsigned char level2[512]; 7813 unsigned char *mlevel1, *mlevel2, *mlevel3; 7814 int count2 = 0, count3 = 0; 7815 int kind; 7816 void *data; 7817 Py_ssize_t length; 7818 Py_UCS4 ch; 7819 7820 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 7821 PyErr_BadArgument(); 7822 return NULL; 7823 } 7824 kind = PyUnicode_KIND(string); 7825 data = PyUnicode_DATA(string); 7826 length = PyUnicode_GET_LENGTH(string); 7827 length = Py_MIN(length, 256); 7828 memset(level1, 0xFF, sizeof level1); 7829 memset(level2, 0xFF, sizeof level2); 7830 7831 /* If there isn't a one-to-one mapping of NULL to \0, 7832 or if there are non-BMP characters, we need to use 7833 a mapping dictionary. 
*/ 7834 if (PyUnicode_READ(kind, data, 0) != 0) 7835 need_dict = 1; 7836 for (i = 1; i < length; i++) { 7837 int l1, l2; 7838 ch = PyUnicode_READ(kind, data, i); 7839 if (ch == 0 || ch > 0xFFFF) { 7840 need_dict = 1; 7841 break; 7842 } 7843 if (ch == 0xFFFE) 7844 /* unmapped character */ 7845 continue; 7846 l1 = ch >> 11; 7847 l2 = ch >> 7; 7848 if (level1[l1] == 0xFF) 7849 level1[l1] = count2++; 7850 if (level2[l2] == 0xFF) 7851 level2[l2] = count3++; 7852 } 7853 7854 if (count2 >= 0xFF || count3 >= 0xFF) 7855 need_dict = 1; 7856 7857 if (need_dict) { 7858 PyObject *result = PyDict_New(); 7859 PyObject *key, *value; 7860 if (!result) 7861 return NULL; 7862 for (i = 0; i < length; i++) { 7863 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7864 value = PyLong_FromLong(i); 7865 if (!key || !value) 7866 goto failed1; 7867 if (PyDict_SetItem(result, key, value) == -1) 7868 goto failed1; 7869 Py_DECREF(key); 7870 Py_DECREF(value); 7871 } 7872 return result; 7873 failed1: 7874 Py_XDECREF(key); 7875 Py_XDECREF(value); 7876 Py_DECREF(result); 7877 return NULL; 7878 } 7879 7880 /* Create a three-level trie */ 7881 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7882 16*count2 + 128*count3 - 1); 7883 if (!result) 7884 return PyErr_NoMemory(); 7885 PyObject_Init(result, &EncodingMapType); 7886 mresult = (struct encoding_map*)result; 7887 mresult->count2 = count2; 7888 mresult->count3 = count3; 7889 mlevel1 = mresult->level1; 7890 mlevel2 = mresult->level23; 7891 mlevel3 = mresult->level23 + 16*count2; 7892 memcpy(mlevel1, level1, 32); 7893 memset(mlevel2, 0xFF, 16*count2); 7894 memset(mlevel3, 0, 128*count3); 7895 count3 = 0; 7896 for (i = 1; i < length; i++) { 7897 int o1, o2, o3, i2, i3; 7898 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7899 if (ch == 0xFFFE) 7900 /* unmapped character */ 7901 continue; 7902 o1 = ch>>11; 7903 o2 = (ch>>7) & 0xF; 7904 i2 = 16*mlevel1[o1] + o2; 7905 if (mlevel2[i2] == 0xFF) 7906 mlevel2[i2] = count3++; 7907 o3 = ch & 0x7F; 7908 
i3 = 128*mlevel2[i2] + o3; 7909 mlevel3[i3] = i; 7910 } 7911 return result; 7912} 7913 7914static int 7915encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7916{ 7917 struct encoding_map *map = (struct encoding_map*)mapping; 7918 int l1 = c>>11; 7919 int l2 = (c>>7) & 0xF; 7920 int l3 = c & 0x7F; 7921 int i; 7922 7923 if (c > 0xFFFF) 7924 return -1; 7925 if (c == 0) 7926 return 0; 7927 /* level 1*/ 7928 i = map->level1[l1]; 7929 if (i == 0xFF) { 7930 return -1; 7931 } 7932 /* level 2*/ 7933 i = map->level23[16*i+l2]; 7934 if (i == 0xFF) { 7935 return -1; 7936 } 7937 /* level 3 */ 7938 i = map->level23[16*map->count2 + 128*i + l3]; 7939 if (i == 0) { 7940 return -1; 7941 } 7942 return i; 7943} 7944 7945/* Lookup the character ch in the mapping. If the character 7946 can't be found, Py_None is returned (or NULL, if another 7947 error occurred). */ 7948static PyObject * 7949charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 7950{ 7951 PyObject *w = PyLong_FromLong((long)c); 7952 PyObject *x; 7953 7954 if (w == NULL) 7955 return NULL; 7956 x = PyObject_GetItem(mapping, w); 7957 Py_DECREF(w); 7958 if (x == NULL) { 7959 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7960 /* No mapping found means: mapping is undefined. 
*/ 7961 PyErr_Clear(); 7962 x = Py_None; 7963 Py_INCREF(x); 7964 return x; 7965 } else 7966 return NULL; 7967 } 7968 else if (x == Py_None) 7969 return x; 7970 else if (PyLong_Check(x)) { 7971 long value = PyLong_AS_LONG(x); 7972 if (value < 0 || value > 255) { 7973 PyErr_SetString(PyExc_TypeError, 7974 "character mapping must be in range(256)"); 7975 Py_DECREF(x); 7976 return NULL; 7977 } 7978 return x; 7979 } 7980 else if (PyBytes_Check(x)) 7981 return x; 7982 else { 7983 /* wrong return value */ 7984 PyErr_Format(PyExc_TypeError, 7985 "character mapping must return integer, bytes or None, not %.400s", 7986 x->ob_type->tp_name); 7987 Py_DECREF(x); 7988 return NULL; 7989 } 7990} 7991 7992static int 7993charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7994{ 7995 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7996 /* exponentially overallocate to minimize reallocations */ 7997 if (requiredsize < 2*outsize) 7998 requiredsize = 2*outsize; 7999 if (_PyBytes_Resize(outobj, requiredsize)) 8000 return -1; 8001 return 0; 8002} 8003 8004typedef enum charmapencode_result { 8005 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8006} charmapencode_result; 8007/* lookup the character, put the result in the output string and adjust 8008 various state variables. Resize the output bytes object if not enough 8009 space is available. Return a new reference to the object that 8010 was put in the output buffer, or Py_None, if the mapping was undefined 8011 (in which case no character was written) or NULL, if a 8012 reallocation error occurred. 
The caller must decref the result */ 8013static charmapencode_result 8014charmapencode_output(Py_UCS4 c, PyObject *mapping, 8015 PyObject **outobj, Py_ssize_t *outpos) 8016{ 8017 PyObject *rep; 8018 char *outstart; 8019 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8020 8021 if (Py_TYPE(mapping) == &EncodingMapType) { 8022 int res = encoding_map_lookup(c, mapping); 8023 Py_ssize_t requiredsize = *outpos+1; 8024 if (res == -1) 8025 return enc_FAILED; 8026 if (outsize<requiredsize) 8027 if (charmapencode_resize(outobj, outpos, requiredsize)) 8028 return enc_EXCEPTION; 8029 outstart = PyBytes_AS_STRING(*outobj); 8030 outstart[(*outpos)++] = (char)res; 8031 return enc_SUCCESS; 8032 } 8033 8034 rep = charmapencode_lookup(c, mapping); 8035 if (rep==NULL) 8036 return enc_EXCEPTION; 8037 else if (rep==Py_None) { 8038 Py_DECREF(rep); 8039 return enc_FAILED; 8040 } else { 8041 if (PyLong_Check(rep)) { 8042 Py_ssize_t requiredsize = *outpos+1; 8043 if (outsize<requiredsize) 8044 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8045 Py_DECREF(rep); 8046 return enc_EXCEPTION; 8047 } 8048 outstart = PyBytes_AS_STRING(*outobj); 8049 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8050 } 8051 else { 8052 const char *repchars = PyBytes_AS_STRING(rep); 8053 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8054 Py_ssize_t requiredsize = *outpos+repsize; 8055 if (outsize<requiredsize) 8056 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8057 Py_DECREF(rep); 8058 return enc_EXCEPTION; 8059 } 8060 outstart = PyBytes_AS_STRING(*outobj); 8061 memcpy(outstart + *outpos, repchars, repsize); 8062 *outpos += repsize; 8063 } 8064 } 8065 Py_DECREF(rep); 8066 return enc_SUCCESS; 8067} 8068 8069/* handle an error in PyUnicode_EncodeCharmap 8070 Return 0 on success, -1 on error */ 8071static int 8072charmap_encoding_error( 8073 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8074 PyObject **exceptionObject, 8075 int *known_errorHandler, PyObject **errorHandler, const 
char *errors, 8076 PyObject **res, Py_ssize_t *respos) 8077{ 8078 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8079 Py_ssize_t size, repsize; 8080 Py_ssize_t newpos; 8081 enum PyUnicode_Kind kind; 8082 void *data; 8083 Py_ssize_t index; 8084 /* startpos for collecting unencodable chars */ 8085 Py_ssize_t collstartpos = *inpos; 8086 Py_ssize_t collendpos = *inpos+1; 8087 Py_ssize_t collpos; 8088 char *encoding = "charmap"; 8089 char *reason = "character maps to <undefined>"; 8090 charmapencode_result x; 8091 Py_UCS4 ch; 8092 int val; 8093 8094 if (PyUnicode_READY(unicode) == -1) 8095 return -1; 8096 size = PyUnicode_GET_LENGTH(unicode); 8097 /* find all unencodable characters */ 8098 while (collendpos < size) { 8099 PyObject *rep; 8100 if (Py_TYPE(mapping) == &EncodingMapType) { 8101 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8102 val = encoding_map_lookup(ch, mapping); 8103 if (val != -1) 8104 break; 8105 ++collendpos; 8106 continue; 8107 } 8108 8109 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8110 rep = charmapencode_lookup(ch, mapping); 8111 if (rep==NULL) 8112 return -1; 8113 else if (rep!=Py_None) { 8114 Py_DECREF(rep); 8115 break; 8116 } 8117 Py_DECREF(rep); 8118 ++collendpos; 8119 } 8120 /* cache callback name lookup 8121 * (if not done yet, i.e. 
it's the first error) */ 8122 if (*known_errorHandler==-1) { 8123 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8124 *known_errorHandler = 1; 8125 else if (!strcmp(errors, "replace")) 8126 *known_errorHandler = 2; 8127 else if (!strcmp(errors, "ignore")) 8128 *known_errorHandler = 3; 8129 else if (!strcmp(errors, "xmlcharrefreplace")) 8130 *known_errorHandler = 4; 8131 else 8132 *known_errorHandler = 0; 8133 } 8134 switch (*known_errorHandler) { 8135 case 1: /* strict */ 8136 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8137 return -1; 8138 case 2: /* replace */ 8139 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8140 x = charmapencode_output('?', mapping, res, respos); 8141 if (x==enc_EXCEPTION) { 8142 return -1; 8143 } 8144 else if (x==enc_FAILED) { 8145 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8146 return -1; 8147 } 8148 } 8149 /* fall through */ 8150 case 3: /* ignore */ 8151 *inpos = collendpos; 8152 break; 8153 case 4: /* xmlcharrefreplace */ 8154 /* generate replacement (temporarily (mis)uses p) */ 8155 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8156 char buffer[2+29+1+1]; 8157 char *cp; 8158 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8159 for (cp = buffer; *cp; ++cp) { 8160 x = charmapencode_output(*cp, mapping, res, respos); 8161 if (x==enc_EXCEPTION) 8162 return -1; 8163 else if (x==enc_FAILED) { 8164 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8165 return -1; 8166 } 8167 } 8168 } 8169 *inpos = collendpos; 8170 break; 8171 default: 8172 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 8173 encoding, reason, unicode, exceptionObject, 8174 collstartpos, collendpos, &newpos); 8175 if (repunicode == NULL) 8176 return -1; 8177 if (PyBytes_Check(repunicode)) { 8178 /* Directly copy bytes result to output. 
*/ 8179 Py_ssize_t outsize = PyBytes_Size(*res); 8180 Py_ssize_t requiredsize; 8181 repsize = PyBytes_Size(repunicode); 8182 requiredsize = *respos + repsize; 8183 if (requiredsize > outsize) 8184 /* Make room for all additional bytes. */ 8185 if (charmapencode_resize(res, respos, requiredsize)) { 8186 Py_DECREF(repunicode); 8187 return -1; 8188 } 8189 memcpy(PyBytes_AsString(*res) + *respos, 8190 PyBytes_AsString(repunicode), repsize); 8191 *respos += repsize; 8192 *inpos = newpos; 8193 Py_DECREF(repunicode); 8194 break; 8195 } 8196 /* generate replacement */ 8197 if (PyUnicode_READY(repunicode) == -1) { 8198 Py_DECREF(repunicode); 8199 return -1; 8200 } 8201 repsize = PyUnicode_GET_LENGTH(repunicode); 8202 data = PyUnicode_DATA(repunicode); 8203 kind = PyUnicode_KIND(repunicode); 8204 for (index = 0; index < repsize; index++) { 8205 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8206 x = charmapencode_output(repch, mapping, res, respos); 8207 if (x==enc_EXCEPTION) { 8208 Py_DECREF(repunicode); 8209 return -1; 8210 } 8211 else if (x==enc_FAILED) { 8212 Py_DECREF(repunicode); 8213 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8214 return -1; 8215 } 8216 } 8217 *inpos = newpos; 8218 Py_DECREF(repunicode); 8219 } 8220 return 0; 8221} 8222 8223PyObject * 8224_PyUnicode_EncodeCharmap(PyObject *unicode, 8225 PyObject *mapping, 8226 const char *errors) 8227{ 8228 /* output object */ 8229 PyObject *res = NULL; 8230 /* current input position */ 8231 Py_ssize_t inpos = 0; 8232 Py_ssize_t size; 8233 /* current output position */ 8234 Py_ssize_t respos = 0; 8235 PyObject *errorHandler = NULL; 8236 PyObject *exc = NULL; 8237 /* the following variable is used for caching string comparisons 8238 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8239 * 3=ignore, 4=xmlcharrefreplace */ 8240 int known_errorHandler = -1; 8241 void *data; 8242 int kind; 8243 8244 if (PyUnicode_READY(unicode) == -1) 8245 return NULL; 8246 size = 
PyUnicode_GET_LENGTH(unicode); 8247 data = PyUnicode_DATA(unicode); 8248 kind = PyUnicode_KIND(unicode); 8249 8250 /* Default to Latin-1 */ 8251 if (mapping == NULL) 8252 return unicode_encode_ucs1(unicode, errors, 256); 8253 8254 /* allocate enough for a simple encoding without 8255 replacements, if we need more, we'll resize */ 8256 res = PyBytes_FromStringAndSize(NULL, size); 8257 if (res == NULL) 8258 goto onError; 8259 if (size == 0) 8260 return res; 8261 8262 while (inpos<size) { 8263 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8264 /* try to encode it */ 8265 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8266 if (x==enc_EXCEPTION) /* error */ 8267 goto onError; 8268 if (x==enc_FAILED) { /* unencodable character */ 8269 if (charmap_encoding_error(unicode, &inpos, mapping, 8270 &exc, 8271 &known_errorHandler, &errorHandler, errors, 8272 &res, &respos)) { 8273 goto onError; 8274 } 8275 } 8276 else 8277 /* done with this character => adjust input position */ 8278 ++inpos; 8279 } 8280 8281 /* Resize if we allocated to much */ 8282 if (respos<PyBytes_GET_SIZE(res)) 8283 if (_PyBytes_Resize(&res, respos) < 0) 8284 goto onError; 8285 8286 Py_XDECREF(exc); 8287 Py_XDECREF(errorHandler); 8288 return res; 8289 8290 onError: 8291 Py_XDECREF(res); 8292 Py_XDECREF(exc); 8293 Py_XDECREF(errorHandler); 8294 return NULL; 8295} 8296 8297/* Deprecated */ 8298PyObject * 8299PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8300 Py_ssize_t size, 8301 PyObject *mapping, 8302 const char *errors) 8303{ 8304 PyObject *result; 8305 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8306 if (unicode == NULL) 8307 return NULL; 8308 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8309 Py_DECREF(unicode); 8310 return result; 8311} 8312 8313PyObject * 8314PyUnicode_AsCharmapString(PyObject *unicode, 8315 PyObject *mapping) 8316{ 8317 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8318 PyErr_BadArgument(); 8319 return NULL; 8320 } 8321 return 
_PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8322} 8323 8324/* create or adjust a UnicodeTranslateError */ 8325static void 8326make_translate_exception(PyObject **exceptionObject, 8327 PyObject *unicode, 8328 Py_ssize_t startpos, Py_ssize_t endpos, 8329 const char *reason) 8330{ 8331 if (*exceptionObject == NULL) { 8332 *exceptionObject = _PyUnicodeTranslateError_Create( 8333 unicode, startpos, endpos, reason); 8334 } 8335 else { 8336 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8337 goto onError; 8338 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8339 goto onError; 8340 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8341 goto onError; 8342 return; 8343 onError: 8344 Py_CLEAR(*exceptionObject); 8345 } 8346} 8347 8348/* error handling callback helper: 8349 build arguments, call the callback and check the arguments, 8350 put the result into newpos and return the replacement string, which 8351 has to be freed by the caller */ 8352static PyObject * 8353unicode_translate_call_errorhandler(const char *errors, 8354 PyObject **errorHandler, 8355 const char *reason, 8356 PyObject *unicode, PyObject **exceptionObject, 8357 Py_ssize_t startpos, Py_ssize_t endpos, 8358 Py_ssize_t *newpos) 8359{ 8360 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8361 8362 Py_ssize_t i_newpos; 8363 PyObject *restuple; 8364 PyObject *resunicode; 8365 8366 if (*errorHandler == NULL) { 8367 *errorHandler = PyCodec_LookupError(errors); 8368 if (*errorHandler == NULL) 8369 return NULL; 8370 } 8371 8372 make_translate_exception(exceptionObject, 8373 unicode, startpos, endpos, reason); 8374 if (*exceptionObject == NULL) 8375 return NULL; 8376 8377 restuple = PyObject_CallFunctionObjArgs( 8378 *errorHandler, *exceptionObject, NULL); 8379 if (restuple == NULL) 8380 return NULL; 8381 if (!PyTuple_Check(restuple)) { 8382 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8383 Py_DECREF(restuple); 8384 return 
NULL; 8385 } 8386 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8387 &resunicode, &i_newpos)) { 8388 Py_DECREF(restuple); 8389 return NULL; 8390 } 8391 if (i_newpos<0) 8392 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8393 else 8394 *newpos = i_newpos; 8395 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8396 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8397 Py_DECREF(restuple); 8398 return NULL; 8399 } 8400 Py_INCREF(resunicode); 8401 Py_DECREF(restuple); 8402 return resunicode; 8403} 8404 8405/* Lookup the character ch in the mapping and put the result in result, 8406 which must be decrefed by the caller. 8407 Return 0 on success, -1 on error */ 8408static int 8409charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8410{ 8411 PyObject *w = PyLong_FromLong((long)c); 8412 PyObject *x; 8413 8414 if (w == NULL) 8415 return -1; 8416 x = PyObject_GetItem(mapping, w); 8417 Py_DECREF(w); 8418 if (x == NULL) { 8419 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8420 /* No mapping found means: use 1:1 mapping. */ 8421 PyErr_Clear(); 8422 *result = NULL; 8423 return 0; 8424 } else 8425 return -1; 8426 } 8427 else if (x == Py_None) { 8428 *result = x; 8429 return 0; 8430 } 8431 else if (PyLong_Check(x)) { 8432 long value = PyLong_AS_LONG(x); 8433 if (value < 0 || value > MAX_UNICODE) { 8434 PyErr_Format(PyExc_ValueError, 8435 "character mapping must be in range(0x%x)", 8436 MAX_UNICODE+1); 8437 Py_DECREF(x); 8438 return -1; 8439 } 8440 *result = x; 8441 return 0; 8442 } 8443 else if (PyUnicode_Check(x)) { 8444 *result = x; 8445 return 0; 8446 } 8447 else { 8448 /* wrong return value */ 8449 PyErr_SetString(PyExc_TypeError, 8450 "character mapping must return integer, None or str"); 8451 Py_DECREF(x); 8452 return -1; 8453 } 8454} 8455 8456/* lookup the character, write the result into the writer. 
8457 Return 1 if the result was written into the writer, return 0 if the mapping 8458 was undefined, raise an exception return -1 on error. */ 8459static int 8460charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, 8461 _PyUnicodeWriter *writer) 8462{ 8463 PyObject *item; 8464 8465 if (charmaptranslate_lookup(ch, mapping, &item)) 8466 return -1; 8467 8468 if (item == NULL) { 8469 /* not found => default to 1:1 mapping */ 8470 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8471 return -1; 8472 } 8473 return 1; 8474 } 8475 8476 if (item == Py_None) { 8477 Py_DECREF(item); 8478 return 0; 8479 } 8480 8481 if (PyLong_Check(item)) { 8482 long ch = (Py_UCS4)PyLong_AS_LONG(item); 8483 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8484 used it */ 8485 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8486 Py_DECREF(item); 8487 return -1; 8488 } 8489 Py_DECREF(item); 8490 return 1; 8491 } 8492 8493 if (!PyUnicode_Check(item)) { 8494 Py_DECREF(item); 8495 return -1; 8496 } 8497 8498 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { 8499 Py_DECREF(item); 8500 return -1; 8501 } 8502 8503 Py_DECREF(item); 8504 return 1; 8505} 8506 8507static int 8508unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, 8509 Py_UCS1 *translate) 8510{ 8511 PyObject *item = NULL; 8512 int ret = 0; 8513 8514 if (charmaptranslate_lookup(ch, mapping, &item)) { 8515 return -1; 8516 } 8517 8518 if (item == Py_None) { 8519 /* deletion */ 8520 translate[ch] = 0xfe; 8521 } 8522 else if (item == NULL) { 8523 /* not found => default to 1:1 mapping */ 8524 translate[ch] = ch; 8525 return 1; 8526 } 8527 else if (PyLong_Check(item)) { 8528 long replace = PyLong_AS_LONG(item); 8529 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8530 used it */ 8531 if (127 < replace) { 8532 /* invalid character or character outside ASCII: 8533 skip the fast translate */ 8534 goto exit; 8535 } 8536 translate[ch] = (Py_UCS1)replace; 8537 } 8538 else if 
(PyUnicode_Check(item)) { 8539 Py_UCS4 replace; 8540 8541 if (PyUnicode_READY(item) == -1) { 8542 Py_DECREF(item); 8543 return -1; 8544 } 8545 if (PyUnicode_GET_LENGTH(item) != 1) 8546 goto exit; 8547 8548 replace = PyUnicode_READ_CHAR(item, 0); 8549 if (replace > 127) 8550 goto exit; 8551 translate[ch] = (Py_UCS1)replace; 8552 } 8553 else { 8554 /* not None, NULL, long or unicode */ 8555 goto exit; 8556 } 8557 ret = 1; 8558 8559 exit: 8560 Py_DECREF(item); 8561 return ret; 8562} 8563 8564/* Fast path for ascii => ascii translation. Return 1 if the whole string 8565 was translated into writer, return 0 if the input string was partially 8566 translated into writer, raise an exception and return -1 on error. */ 8567static int 8568unicode_fast_translate(PyObject *input, PyObject *mapping, 8569 _PyUnicodeWriter *writer, int ignore) 8570{ 8571 Py_UCS1 ascii_table[128], ch, ch2; 8572 Py_ssize_t len; 8573 Py_UCS1 *in, *end, *out; 8574 int res = 0; 8575 8576 if (PyUnicode_READY(input) == -1) 8577 return -1; 8578 if (!PyUnicode_IS_ASCII(input)) 8579 return 0; 8580 len = PyUnicode_GET_LENGTH(input); 8581 8582 memset(ascii_table, 0xff, 128); 8583 8584 in = PyUnicode_1BYTE_DATA(input); 8585 end = in + len; 8586 8587 assert(PyUnicode_IS_ASCII(writer->buffer)); 8588 assert(PyUnicode_GET_LENGTH(writer->buffer) == len); 8589 out = PyUnicode_1BYTE_DATA(writer->buffer); 8590 8591 for (; in < end; in++) { 8592 ch = *in; 8593 ch2 = ascii_table[ch]; 8594 if (ch2 == 0xff) { 8595 int translate = unicode_fast_translate_lookup(mapping, ch, 8596 ascii_table); 8597 if (translate < 0) 8598 return -1; 8599 if (translate == 0) 8600 goto exit; 8601 ch2 = ascii_table[ch]; 8602 } 8603 if (ch2 == 0xfe) { 8604 if (ignore) 8605 continue; 8606 goto exit; 8607 } 8608 assert(ch2 < 128); 8609 *out = ch2; 8610 out++; 8611 } 8612 res = 1; 8613 8614exit: 8615 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer); 8616 return res; 8617} 8618 8619PyObject * 8620_PyUnicode_TranslateCharmap(PyObject *input, 
8621 PyObject *mapping, 8622 const char *errors) 8623{ 8624 /* input object */ 8625 char *data; 8626 Py_ssize_t size, i; 8627 int kind; 8628 /* output buffer */ 8629 _PyUnicodeWriter writer; 8630 /* error handler */ 8631 char *reason = "character maps to <undefined>"; 8632 PyObject *errorHandler = NULL; 8633 PyObject *exc = NULL; 8634 int ignore; 8635 int res; 8636 8637 if (mapping == NULL) { 8638 PyErr_BadArgument(); 8639 return NULL; 8640 } 8641 8642 if (PyUnicode_READY(input) == -1) 8643 return NULL; 8644 data = (char*)PyUnicode_DATA(input); 8645 kind = PyUnicode_KIND(input); 8646 size = PyUnicode_GET_LENGTH(input); 8647 8648 if (size == 0) { 8649 Py_INCREF(input); 8650 return input; 8651 } 8652 8653 /* allocate enough for a simple 1:1 translation without 8654 replacements, if we need more, we'll resize */ 8655 _PyUnicodeWriter_Init(&writer); 8656 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 8657 goto onError; 8658 8659 ignore = (errors != NULL && strcmp(errors, "ignore") == 0); 8660 8661 res = unicode_fast_translate(input, mapping, &writer, ignore); 8662 if (res < 0) { 8663 _PyUnicodeWriter_Dealloc(&writer); 8664 return NULL; 8665 } 8666 if (res == 1) 8667 return _PyUnicodeWriter_Finish(&writer); 8668 8669 i = writer.pos; 8670 while (i<size) { 8671 /* try to encode it */ 8672 int translate; 8673 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8674 Py_ssize_t newpos; 8675 /* startpos for collecting untranslatable chars */ 8676 Py_ssize_t collstart; 8677 Py_ssize_t collend; 8678 Py_UCS4 ch; 8679 8680 ch = PyUnicode_READ(kind, data, i); 8681 translate = charmaptranslate_output(ch, mapping, &writer); 8682 if (translate < 0) 8683 goto onError; 8684 8685 if (translate != 0) { 8686 /* it worked => adjust input pointer */ 8687 ++i; 8688 continue; 8689 } 8690 8691 /* untranslatable character */ 8692 collstart = i; 8693 collend = i+1; 8694 8695 /* find all untranslatable characters */ 8696 while (collend < size) { 8697 PyObject *x; 8698 ch 
= PyUnicode_READ(kind, data, collend); 8699 if (charmaptranslate_lookup(ch, mapping, &x)) 8700 goto onError; 8701 Py_XDECREF(x); 8702 if (x != Py_None) 8703 break; 8704 ++collend; 8705 } 8706 8707 if (ignore) { 8708 i = collend; 8709 } 8710 else { 8711 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8712 reason, input, &exc, 8713 collstart, collend, &newpos); 8714 if (repunicode == NULL) 8715 goto onError; 8716 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { 8717 Py_DECREF(repunicode); 8718 goto onError; 8719 } 8720 Py_DECREF(repunicode); 8721 i = newpos; 8722 } 8723 } 8724 Py_XDECREF(exc); 8725 Py_XDECREF(errorHandler); 8726 return _PyUnicodeWriter_Finish(&writer); 8727 8728 onError: 8729 _PyUnicodeWriter_Dealloc(&writer); 8730 Py_XDECREF(exc); 8731 Py_XDECREF(errorHandler); 8732 return NULL; 8733} 8734 8735/* Deprecated. Use PyUnicode_Translate instead. */ 8736PyObject * 8737PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8738 Py_ssize_t size, 8739 PyObject *mapping, 8740 const char *errors) 8741{ 8742 PyObject *result; 8743 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8744 if (!unicode) 8745 return NULL; 8746 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8747 Py_DECREF(unicode); 8748 return result; 8749} 8750 8751PyObject * 8752PyUnicode_Translate(PyObject *str, 8753 PyObject *mapping, 8754 const char *errors) 8755{ 8756 PyObject *result; 8757 8758 str = PyUnicode_FromObject(str); 8759 if (str == NULL) 8760 return NULL; 8761 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8762 Py_DECREF(str); 8763 return result; 8764} 8765 8766static Py_UCS4 8767fix_decimal_and_space_to_ascii(PyObject *self) 8768{ 8769 /* No need to call PyUnicode_READY(self) because this function is only 8770 called as a callback from fixup() which does it already. 
*/ 8771 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8772 const int kind = PyUnicode_KIND(self); 8773 void *data = PyUnicode_DATA(self); 8774 Py_UCS4 maxchar = 127, ch, fixed; 8775 int modified = 0; 8776 Py_ssize_t i; 8777 8778 for (i = 0; i < len; ++i) { 8779 ch = PyUnicode_READ(kind, data, i); 8780 fixed = 0; 8781 if (ch > 127) { 8782 if (Py_UNICODE_ISSPACE(ch)) 8783 fixed = ' '; 8784 else { 8785 const int decimal = Py_UNICODE_TODECIMAL(ch); 8786 if (decimal >= 0) 8787 fixed = '0' + decimal; 8788 } 8789 if (fixed != 0) { 8790 modified = 1; 8791 maxchar = Py_MAX(maxchar, fixed); 8792 PyUnicode_WRITE(kind, data, i, fixed); 8793 } 8794 else 8795 maxchar = Py_MAX(maxchar, ch); 8796 } 8797 } 8798 8799 return (modified) ? maxchar : 0; 8800} 8801 8802PyObject * 8803_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8804{ 8805 if (!PyUnicode_Check(unicode)) { 8806 PyErr_BadInternalCall(); 8807 return NULL; 8808 } 8809 if (PyUnicode_READY(unicode) == -1) 8810 return NULL; 8811 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8812 /* If the string is already ASCII, just return the same string */ 8813 Py_INCREF(unicode); 8814 return unicode; 8815 } 8816 return fixup(unicode, fix_decimal_and_space_to_ascii); 8817} 8818 8819PyObject * 8820PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8821 Py_ssize_t length) 8822{ 8823 PyObject *decimal; 8824 Py_ssize_t i; 8825 Py_UCS4 maxchar; 8826 enum PyUnicode_Kind kind; 8827 void *data; 8828 8829 maxchar = 127; 8830 for (i = 0; i < length; i++) { 8831 Py_UCS4 ch = s[i]; 8832 if (ch > 127) { 8833 int decimal = Py_UNICODE_TODECIMAL(ch); 8834 if (decimal >= 0) 8835 ch = '0' + decimal; 8836 maxchar = Py_MAX(maxchar, ch); 8837 } 8838 } 8839 8840 /* Copy to a new string */ 8841 decimal = PyUnicode_New(length, maxchar); 8842 if (decimal == NULL) 8843 return decimal; 8844 kind = PyUnicode_KIND(decimal); 8845 data = PyUnicode_DATA(decimal); 8846 /* Iterate over code points */ 8847 for (i = 0; i < length; i++) { 8848 Py_UCS4 ch = 
s[i]; 8849 if (ch > 127) { 8850 int decimal = Py_UNICODE_TODECIMAL(ch); 8851 if (decimal >= 0) 8852 ch = '0' + decimal; 8853 } 8854 PyUnicode_WRITE(kind, data, i, ch); 8855 } 8856 return unicode_result(decimal); 8857} 8858/* --- Decimal Encoder ---------------------------------------------------- */ 8859 8860int 8861PyUnicode_EncodeDecimal(Py_UNICODE *s, 8862 Py_ssize_t length, 8863 char *output, 8864 const char *errors) 8865{ 8866 PyObject *unicode; 8867 Py_ssize_t i; 8868 enum PyUnicode_Kind kind; 8869 void *data; 8870 8871 if (output == NULL) { 8872 PyErr_BadArgument(); 8873 return -1; 8874 } 8875 8876 unicode = PyUnicode_FromUnicode(s, length); 8877 if (unicode == NULL) 8878 return -1; 8879 8880 if (PyUnicode_READY(unicode) == -1) { 8881 Py_DECREF(unicode); 8882 return -1; 8883 } 8884 kind = PyUnicode_KIND(unicode); 8885 data = PyUnicode_DATA(unicode); 8886 8887 for (i=0; i < length; ) { 8888 PyObject *exc; 8889 Py_UCS4 ch; 8890 int decimal; 8891 Py_ssize_t startpos; 8892 8893 ch = PyUnicode_READ(kind, data, i); 8894 8895 if (Py_UNICODE_ISSPACE(ch)) { 8896 *output++ = ' '; 8897 i++; 8898 continue; 8899 } 8900 decimal = Py_UNICODE_TODECIMAL(ch); 8901 if (decimal >= 0) { 8902 *output++ = '0' + decimal; 8903 i++; 8904 continue; 8905 } 8906 if (0 < ch && ch < 256) { 8907 *output++ = (char)ch; 8908 i++; 8909 continue; 8910 } 8911 8912 startpos = i; 8913 exc = NULL; 8914 raise_encode_exception(&exc, "decimal", unicode, 8915 startpos, startpos+1, 8916 "invalid decimal Unicode string"); 8917 Py_XDECREF(exc); 8918 Py_DECREF(unicode); 8919 return -1; 8920 } 8921 /* 0-terminate the output string */ 8922 *output++ = '\0'; 8923 Py_DECREF(unicode); 8924 return 0; 8925} 8926 8927/* --- Helpers ------------------------------------------------------------ */ 8928 8929/* helper macro to fixup start/end slice values */ 8930#define ADJUST_INDICES(start, end, len) \ 8931 if (end > len) \ 8932 end = len; \ 8933 else if (end < 0) { \ 8934 end += len; \ 8935 if (end < 0) \ 8936 end = 
0; \ 8937 } \ 8938 if (start < 0) { \ 8939 start += len; \ 8940 if (start < 0) \ 8941 start = 0; \ 8942 } 8943 8944static Py_ssize_t 8945any_find_slice(int direction, PyObject* s1, PyObject* s2, 8946 Py_ssize_t start, 8947 Py_ssize_t end) 8948{ 8949 int kind1, kind2; 8950 void *buf1, *buf2; 8951 Py_ssize_t len1, len2, result; 8952 8953 kind1 = PyUnicode_KIND(s1); 8954 kind2 = PyUnicode_KIND(s2); 8955 if (kind1 < kind2) 8956 return -1; 8957 8958 len1 = PyUnicode_GET_LENGTH(s1); 8959 len2 = PyUnicode_GET_LENGTH(s2); 8960 ADJUST_INDICES(start, end, len1); 8961 if (end - start < len2) 8962 return -1; 8963 8964 buf1 = PyUnicode_DATA(s1); 8965 buf2 = PyUnicode_DATA(s2); 8966 if (len2 == 1) { 8967 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 8968 result = findchar((const char *)buf1 + kind1*start, 8969 kind1, end - start, ch, direction); 8970 if (result == -1) 8971 return -1; 8972 else 8973 return start + result; 8974 } 8975 8976 if (kind2 != kind1) { 8977 buf2 = _PyUnicode_AsKind(s2, kind1); 8978 if (!buf2) 8979 return -2; 8980 } 8981 8982 if (direction > 0) { 8983 switch (kind1) { 8984 case PyUnicode_1BYTE_KIND: 8985 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8986 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 8987 else 8988 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 8989 break; 8990 case PyUnicode_2BYTE_KIND: 8991 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 8992 break; 8993 case PyUnicode_4BYTE_KIND: 8994 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 8995 break; 8996 default: 8997 assert(0); result = -2; 8998 } 8999 } 9000 else { 9001 switch (kind1) { 9002 case PyUnicode_1BYTE_KIND: 9003 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9004 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 9005 else 9006 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9007 break; 9008 case PyUnicode_2BYTE_KIND: 9009 result = ucs2lib_rfind_slice(buf1, len1, 
buf2, len2, start, end); 9010 break; 9011 case PyUnicode_4BYTE_KIND: 9012 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9013 break; 9014 default: 9015 assert(0); result = -2; 9016 } 9017 } 9018 9019 if (kind2 != kind1) 9020 PyMem_Free(buf2); 9021 9022 return result; 9023} 9024 9025Py_ssize_t 9026_PyUnicode_InsertThousandsGrouping( 9027 PyObject *unicode, Py_ssize_t index, 9028 Py_ssize_t n_buffer, 9029 void *digits, Py_ssize_t n_digits, 9030 Py_ssize_t min_width, 9031 const char *grouping, PyObject *thousands_sep, 9032 Py_UCS4 *maxchar) 9033{ 9034 unsigned int kind, thousands_sep_kind; 9035 char *data, *thousands_sep_data; 9036 Py_ssize_t thousands_sep_len; 9037 Py_ssize_t len; 9038 9039 if (unicode != NULL) { 9040 kind = PyUnicode_KIND(unicode); 9041 data = (char *) PyUnicode_DATA(unicode) + index * kind; 9042 } 9043 else { 9044 kind = PyUnicode_1BYTE_KIND; 9045 data = NULL; 9046 } 9047 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 9048 thousands_sep_data = PyUnicode_DATA(thousands_sep); 9049 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 9050 if (unicode != NULL && thousands_sep_kind != kind) { 9051 if (thousands_sep_kind < kind) { 9052 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 9053 if (!thousands_sep_data) 9054 return -1; 9055 } 9056 else { 9057 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 9058 if (!data) 9059 return -1; 9060 } 9061 } 9062 9063 switch (kind) { 9064 case PyUnicode_1BYTE_KIND: 9065 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 9066 len = asciilib_InsertThousandsGrouping( 9067 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 9068 min_width, grouping, 9069 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9070 else 9071 len = ucs1lib_InsertThousandsGrouping( 9072 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 9073 min_width, grouping, 9074 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9075 break; 9076 case PyUnicode_2BYTE_KIND: 9077 len = 
ucs2lib_InsertThousandsGrouping( 9078 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 9079 min_width, grouping, 9080 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 9081 break; 9082 case PyUnicode_4BYTE_KIND: 9083 len = ucs4lib_InsertThousandsGrouping( 9084 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 9085 min_width, grouping, 9086 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 9087 break; 9088 default: 9089 assert(0); 9090 return -1; 9091 } 9092 if (unicode != NULL && thousands_sep_kind != kind) { 9093 if (thousands_sep_kind < kind) 9094 PyMem_Free(thousands_sep_data); 9095 else 9096 PyMem_Free(data); 9097 } 9098 if (unicode == NULL) { 9099 *maxchar = 127; 9100 if (len != n_digits) { 9101 *maxchar = Py_MAX(*maxchar, 9102 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 9103 } 9104 } 9105 return len; 9106} 9107 9108 9109Py_ssize_t 9110PyUnicode_Count(PyObject *str, 9111 PyObject *substr, 9112 Py_ssize_t start, 9113 Py_ssize_t end) 9114{ 9115 Py_ssize_t result; 9116 PyObject* str_obj; 9117 PyObject* sub_obj; 9118 int kind1, kind2; 9119 void *buf1 = NULL, *buf2 = NULL; 9120 Py_ssize_t len1, len2; 9121 9122 str_obj = PyUnicode_FromObject(str); 9123 if (!str_obj) 9124 return -1; 9125 sub_obj = PyUnicode_FromObject(substr); 9126 if (!sub_obj) { 9127 Py_DECREF(str_obj); 9128 return -1; 9129 } 9130 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 9131 Py_DECREF(sub_obj); 9132 Py_DECREF(str_obj); 9133 return -1; 9134 } 9135 9136 kind1 = PyUnicode_KIND(str_obj); 9137 kind2 = PyUnicode_KIND(sub_obj); 9138 if (kind1 < kind2) { 9139 Py_DECREF(sub_obj); 9140 Py_DECREF(str_obj); 9141 return 0; 9142 } 9143 9144 len1 = PyUnicode_GET_LENGTH(str_obj); 9145 len2 = PyUnicode_GET_LENGTH(sub_obj); 9146 ADJUST_INDICES(start, end, len1); 9147 if (end - start < len2) { 9148 Py_DECREF(sub_obj); 9149 Py_DECREF(str_obj); 9150 return 0; 9151 } 9152 9153 buf1 = PyUnicode_DATA(str_obj); 9154 buf2 = PyUnicode_DATA(sub_obj); 9155 if (kind2 != kind1) 
{ 9156 buf2 = _PyUnicode_AsKind(sub_obj, kind1); 9157 if (!buf2) 9158 goto onError; 9159 } 9160 9161 switch (kind1) { 9162 case PyUnicode_1BYTE_KIND: 9163 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 9164 result = asciilib_count( 9165 ((Py_UCS1*)buf1) + start, end - start, 9166 buf2, len2, PY_SSIZE_T_MAX 9167 ); 9168 else 9169 result = ucs1lib_count( 9170 ((Py_UCS1*)buf1) + start, end - start, 9171 buf2, len2, PY_SSIZE_T_MAX 9172 ); 9173 break; 9174 case PyUnicode_2BYTE_KIND: 9175 result = ucs2lib_count( 9176 ((Py_UCS2*)buf1) + start, end - start, 9177 buf2, len2, PY_SSIZE_T_MAX 9178 ); 9179 break; 9180 case PyUnicode_4BYTE_KIND: 9181 result = ucs4lib_count( 9182 ((Py_UCS4*)buf1) + start, end - start, 9183 buf2, len2, PY_SSIZE_T_MAX 9184 ); 9185 break; 9186 default: 9187 assert(0); result = 0; 9188 } 9189 9190 Py_DECREF(sub_obj); 9191 Py_DECREF(str_obj); 9192 9193 if (kind2 != kind1) 9194 PyMem_Free(buf2); 9195 9196 return result; 9197 onError: 9198 Py_DECREF(sub_obj); 9199 Py_DECREF(str_obj); 9200 if (kind2 != kind1 && buf2) 9201 PyMem_Free(buf2); 9202 return -1; 9203} 9204 9205Py_ssize_t 9206PyUnicode_Find(PyObject *str, 9207 PyObject *sub, 9208 Py_ssize_t start, 9209 Py_ssize_t end, 9210 int direction) 9211{ 9212 Py_ssize_t result; 9213 9214 str = PyUnicode_FromObject(str); 9215 if (!str) 9216 return -2; 9217 sub = PyUnicode_FromObject(sub); 9218 if (!sub) { 9219 Py_DECREF(str); 9220 return -2; 9221 } 9222 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 9223 Py_DECREF(sub); 9224 Py_DECREF(str); 9225 return -2; 9226 } 9227 9228 result = any_find_slice(direction, 9229 str, sub, start, end 9230 ); 9231 9232 Py_DECREF(str); 9233 Py_DECREF(sub); 9234 9235 return result; 9236} 9237 9238Py_ssize_t 9239PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9240 Py_ssize_t start, Py_ssize_t end, 9241 int direction) 9242{ 9243 int kind; 9244 Py_ssize_t result; 9245 if (PyUnicode_READY(str) == -1) 9246 return -2; 9247 if (start < 0 || end < 0) { 
9248 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9249 return -2; 9250 } 9251 if (end > PyUnicode_GET_LENGTH(str)) 9252 end = PyUnicode_GET_LENGTH(str); 9253 if (start >= end) 9254 return -1; 9255 kind = PyUnicode_KIND(str); 9256 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9257 kind, end-start, ch, direction); 9258 if (result == -1) 9259 return -1; 9260 else 9261 return start + result; 9262} 9263 9264static int 9265tailmatch(PyObject *self, 9266 PyObject *substring, 9267 Py_ssize_t start, 9268 Py_ssize_t end, 9269 int direction) 9270{ 9271 int kind_self; 9272 int kind_sub; 9273 void *data_self; 9274 void *data_sub; 9275 Py_ssize_t offset; 9276 Py_ssize_t i; 9277 Py_ssize_t end_sub; 9278 9279 if (PyUnicode_READY(self) == -1 || 9280 PyUnicode_READY(substring) == -1) 9281 return -1; 9282 9283 if (PyUnicode_GET_LENGTH(substring) == 0) 9284 return 1; 9285 9286 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9287 end -= PyUnicode_GET_LENGTH(substring); 9288 if (end < start) 9289 return 0; 9290 9291 kind_self = PyUnicode_KIND(self); 9292 data_self = PyUnicode_DATA(self); 9293 kind_sub = PyUnicode_KIND(substring); 9294 data_sub = PyUnicode_DATA(substring); 9295 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9296 9297 if (direction > 0) 9298 offset = end; 9299 else 9300 offset = start; 9301 9302 if (PyUnicode_READ(kind_self, data_self, offset) == 9303 PyUnicode_READ(kind_sub, data_sub, 0) && 9304 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9305 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9306 /* If both are of the same kind, memcmp is sufficient */ 9307 if (kind_self == kind_sub) { 9308 return ! 
memcmp((char *)data_self + 9309 (offset * PyUnicode_KIND(substring)), 9310 data_sub, 9311 PyUnicode_GET_LENGTH(substring) * 9312 PyUnicode_KIND(substring)); 9313 } 9314 /* otherwise we have to compare each character by first accesing it */ 9315 else { 9316 /* We do not need to compare 0 and len(substring)-1 because 9317 the if statement above ensured already that they are equal 9318 when we end up here. */ 9319 for (i = 1; i < end_sub; ++i) { 9320 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9321 PyUnicode_READ(kind_sub, data_sub, i)) 9322 return 0; 9323 } 9324 return 1; 9325 } 9326 } 9327 9328 return 0; 9329} 9330 9331Py_ssize_t 9332PyUnicode_Tailmatch(PyObject *str, 9333 PyObject *substr, 9334 Py_ssize_t start, 9335 Py_ssize_t end, 9336 int direction) 9337{ 9338 Py_ssize_t result; 9339 9340 str = PyUnicode_FromObject(str); 9341 if (str == NULL) 9342 return -1; 9343 substr = PyUnicode_FromObject(substr); 9344 if (substr == NULL) { 9345 Py_DECREF(str); 9346 return -1; 9347 } 9348 9349 result = tailmatch(str, substr, 9350 start, end, direction); 9351 Py_DECREF(str); 9352 Py_DECREF(substr); 9353 return result; 9354} 9355 9356/* Apply fixfct filter to the Unicode object self and return a 9357 reference to the modified object */ 9358 9359static PyObject * 9360fixup(PyObject *self, 9361 Py_UCS4 (*fixfct)(PyObject *s)) 9362{ 9363 PyObject *u; 9364 Py_UCS4 maxchar_old, maxchar_new = 0; 9365 PyObject *v; 9366 9367 u = _PyUnicode_Copy(self); 9368 if (u == NULL) 9369 return NULL; 9370 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9371 9372 /* fix functions return the new maximum character in a string, 9373 if the kind of the resulting unicode object does not change, 9374 everything is fine. Otherwise we need to change the string kind 9375 and re-run the fix function. 
*/ 9376 maxchar_new = fixfct(u); 9377 9378 if (maxchar_new == 0) { 9379 /* no changes */; 9380 if (PyUnicode_CheckExact(self)) { 9381 Py_DECREF(u); 9382 Py_INCREF(self); 9383 return self; 9384 } 9385 else 9386 return u; 9387 } 9388 9389 maxchar_new = align_maxchar(maxchar_new); 9390 9391 if (maxchar_new == maxchar_old) 9392 return u; 9393 9394 /* In case the maximum character changed, we need to 9395 convert the string to the new category. */ 9396 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9397 if (v == NULL) { 9398 Py_DECREF(u); 9399 return NULL; 9400 } 9401 if (maxchar_new > maxchar_old) { 9402 /* If the maxchar increased so that the kind changed, not all 9403 characters are representable anymore and we need to fix the 9404 string again. This only happens in very few cases. */ 9405 _PyUnicode_FastCopyCharacters(v, 0, 9406 self, 0, PyUnicode_GET_LENGTH(self)); 9407 maxchar_old = fixfct(v); 9408 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9409 } 9410 else { 9411 _PyUnicode_FastCopyCharacters(v, 0, 9412 u, 0, PyUnicode_GET_LENGTH(self)); 9413 } 9414 Py_DECREF(u); 9415 assert(_PyUnicode_CheckConsistency(v, 1)); 9416 return v; 9417} 9418 9419static PyObject * 9420ascii_upper_or_lower(PyObject *self, int lower) 9421{ 9422 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9423 char *resdata, *data = PyUnicode_DATA(self); 9424 PyObject *res; 9425 9426 res = PyUnicode_New(len, 127); 9427 if (res == NULL) 9428 return NULL; 9429 resdata = PyUnicode_DATA(res); 9430 if (lower) 9431 _Py_bytes_lower(resdata, data, len); 9432 else 9433 _Py_bytes_upper(resdata, data, len); 9434 return res; 9435} 9436 9437static Py_UCS4 9438handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9439{ 9440 Py_ssize_t j; 9441 int final_sigma; 9442 Py_UCS4 c = 0; /* initialize to prevent gcc warning */ 9443 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9444 9445 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9446 
9447 where ! is a negation and \p{xxx} is a character with property xxx. 9448 */ 9449 for (j = i - 1; j >= 0; j--) { 9450 c = PyUnicode_READ(kind, data, j); 9451 if (!_PyUnicode_IsCaseIgnorable(c)) 9452 break; 9453 } 9454 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9455 if (final_sigma) { 9456 for (j = i + 1; j < length; j++) { 9457 c = PyUnicode_READ(kind, data, j); 9458 if (!_PyUnicode_IsCaseIgnorable(c)) 9459 break; 9460 } 9461 final_sigma = j == length || !_PyUnicode_IsCased(c); 9462 } 9463 return (final_sigma) ? 0x3C2 : 0x3C3; 9464} 9465 9466static int 9467lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9468 Py_UCS4 c, Py_UCS4 *mapped) 9469{ 9470 /* Obscure special case. */ 9471 if (c == 0x3A3) { 9472 mapped[0] = handle_capital_sigma(kind, data, length, i); 9473 return 1; 9474 } 9475 return _PyUnicode_ToLowerFull(c, mapped); 9476} 9477 9478static Py_ssize_t 9479do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9480{ 9481 Py_ssize_t i, k = 0; 9482 int n_res, j; 9483 Py_UCS4 c, mapped[3]; 9484 9485 c = PyUnicode_READ(kind, data, 0); 9486 n_res = _PyUnicode_ToUpperFull(c, mapped); 9487 for (j = 0; j < n_res; j++) { 9488 *maxchar = Py_MAX(*maxchar, mapped[j]); 9489 res[k++] = mapped[j]; 9490 } 9491 for (i = 1; i < length; i++) { 9492 c = PyUnicode_READ(kind, data, i); 9493 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9494 for (j = 0; j < n_res; j++) { 9495 *maxchar = Py_MAX(*maxchar, mapped[j]); 9496 res[k++] = mapped[j]; 9497 } 9498 } 9499 return k; 9500} 9501 9502static Py_ssize_t 9503do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9504 Py_ssize_t i, k = 0; 9505 9506 for (i = 0; i < length; i++) { 9507 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9508 int n_res, j; 9509 if (Py_UNICODE_ISUPPER(c)) { 9510 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9511 } 9512 else if (Py_UNICODE_ISLOWER(c)) { 9513 n_res = _PyUnicode_ToUpperFull(c, mapped); 
9514 } 9515 else { 9516 n_res = 1; 9517 mapped[0] = c; 9518 } 9519 for (j = 0; j < n_res; j++) { 9520 *maxchar = Py_MAX(*maxchar, mapped[j]); 9521 res[k++] = mapped[j]; 9522 } 9523 } 9524 return k; 9525} 9526 9527static Py_ssize_t 9528do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9529 Py_UCS4 *maxchar, int lower) 9530{ 9531 Py_ssize_t i, k = 0; 9532 9533 for (i = 0; i < length; i++) { 9534 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9535 int n_res, j; 9536 if (lower) 9537 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9538 else 9539 n_res = _PyUnicode_ToUpperFull(c, mapped); 9540 for (j = 0; j < n_res; j++) { 9541 *maxchar = Py_MAX(*maxchar, mapped[j]); 9542 res[k++] = mapped[j]; 9543 } 9544 } 9545 return k; 9546} 9547 9548static Py_ssize_t 9549do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9550{ 9551 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9552} 9553 9554static Py_ssize_t 9555do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9556{ 9557 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9558} 9559 9560static Py_ssize_t 9561do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9562{ 9563 Py_ssize_t i, k = 0; 9564 9565 for (i = 0; i < length; i++) { 9566 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9567 Py_UCS4 mapped[3]; 9568 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9569 for (j = 0; j < n_res; j++) { 9570 *maxchar = Py_MAX(*maxchar, mapped[j]); 9571 res[k++] = mapped[j]; 9572 } 9573 } 9574 return k; 9575} 9576 9577static Py_ssize_t 9578do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9579{ 9580 Py_ssize_t i, k = 0; 9581 int previous_is_cased; 9582 9583 previous_is_cased = 0; 9584 for (i = 0; i < length; i++) { 9585 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9586 Py_UCS4 mapped[3]; 9587 int n_res, j; 9588 9589 if (previous_is_cased) 9590 n_res = 
lower_ucs4(kind, data, length, i, c, mapped); 9591 else 9592 n_res = _PyUnicode_ToTitleFull(c, mapped); 9593 9594 for (j = 0; j < n_res; j++) { 9595 *maxchar = Py_MAX(*maxchar, mapped[j]); 9596 res[k++] = mapped[j]; 9597 } 9598 9599 previous_is_cased = _PyUnicode_IsCased(c); 9600 } 9601 return k; 9602} 9603 9604static PyObject * 9605case_operation(PyObject *self, 9606 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9607{ 9608 PyObject *res = NULL; 9609 Py_ssize_t length, newlength = 0; 9610 int kind, outkind; 9611 void *data, *outdata; 9612 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9613 9614 assert(PyUnicode_IS_READY(self)); 9615 9616 kind = PyUnicode_KIND(self); 9617 data = PyUnicode_DATA(self); 9618 length = PyUnicode_GET_LENGTH(self); 9619 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) { 9620 PyErr_SetString(PyExc_OverflowError, "string is too long"); 9621 return NULL; 9622 } 9623 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9624 if (tmp == NULL) 9625 return PyErr_NoMemory(); 9626 newlength = perform(kind, data, length, tmp, &maxchar); 9627 res = PyUnicode_New(newlength, maxchar); 9628 if (res == NULL) 9629 goto leave; 9630 tmpend = tmp + newlength; 9631 outdata = PyUnicode_DATA(res); 9632 outkind = PyUnicode_KIND(res); 9633 switch (outkind) { 9634 case PyUnicode_1BYTE_KIND: 9635 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9636 break; 9637 case PyUnicode_2BYTE_KIND: 9638 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9639 break; 9640 case PyUnicode_4BYTE_KIND: 9641 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9642 break; 9643 default: 9644 assert(0); 9645 break; 9646 } 9647 leave: 9648 PyMem_FREE(tmp); 9649 return res; 9650} 9651 9652PyObject * 9653PyUnicode_Join(PyObject *separator, PyObject *seq) 9654{ 9655 PyObject *sep = NULL; 9656 Py_ssize_t seplen; 9657 PyObject *res = NULL; /* the result */ 9658 PyObject *fseq; /* PySequence_Fast(seq) */ 9659 Py_ssize_t seqlen; /* len(fseq) 
-- number of items in sequence */ 9660 PyObject **items; 9661 PyObject *item; 9662 Py_ssize_t sz, i, res_offset; 9663 Py_UCS4 maxchar; 9664 Py_UCS4 item_maxchar; 9665 int use_memcpy; 9666 unsigned char *res_data = NULL, *sep_data = NULL; 9667 PyObject *last_obj; 9668 unsigned int kind = 0; 9669 9670 fseq = PySequence_Fast(seq, "can only join an iterable"); 9671 if (fseq == NULL) { 9672 return NULL; 9673 } 9674 9675 /* NOTE: the following code can't call back into Python code, 9676 * so we are sure that fseq won't be mutated. 9677 */ 9678 9679 seqlen = PySequence_Fast_GET_SIZE(fseq); 9680 /* If empty sequence, return u"". */ 9681 if (seqlen == 0) { 9682 Py_DECREF(fseq); 9683 _Py_RETURN_UNICODE_EMPTY(); 9684 } 9685 9686 /* If singleton sequence with an exact Unicode, return that. */ 9687 last_obj = NULL; 9688 items = PySequence_Fast_ITEMS(fseq); 9689 if (seqlen == 1) { 9690 if (PyUnicode_CheckExact(items[0])) { 9691 res = items[0]; 9692 Py_INCREF(res); 9693 Py_DECREF(fseq); 9694 return res; 9695 } 9696 seplen = 0; 9697 maxchar = 0; 9698 } 9699 else { 9700 /* Set up sep and seplen */ 9701 if (separator == NULL) { 9702 /* fall back to a blank space separator */ 9703 sep = PyUnicode_FromOrdinal(' '); 9704 if (!sep) 9705 goto onError; 9706 seplen = 1; 9707 maxchar = 32; 9708 } 9709 else { 9710 if (!PyUnicode_Check(separator)) { 9711 PyErr_Format(PyExc_TypeError, 9712 "separator: expected str instance," 9713 " %.80s found", 9714 Py_TYPE(separator)->tp_name); 9715 goto onError; 9716 } 9717 if (PyUnicode_READY(separator)) 9718 goto onError; 9719 sep = separator; 9720 seplen = PyUnicode_GET_LENGTH(separator); 9721 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9722 /* inc refcount to keep this code path symmetric with the 9723 above case of a blank separator */ 9724 Py_INCREF(sep); 9725 } 9726 last_obj = sep; 9727 } 9728 9729 /* There are at least two things to join, or else we have a subclass 9730 * of str in the sequence. 
9731 * Do a pre-pass to figure out the total amount of space we'll 9732 * need (sz), and see whether all argument are strings. 9733 */ 9734 sz = 0; 9735#ifdef Py_DEBUG 9736 use_memcpy = 0; 9737#else 9738 use_memcpy = 1; 9739#endif 9740 for (i = 0; i < seqlen; i++) { 9741 const Py_ssize_t old_sz = sz; 9742 item = items[i]; 9743 if (!PyUnicode_Check(item)) { 9744 PyErr_Format(PyExc_TypeError, 9745 "sequence item %zd: expected str instance," 9746 " %.80s found", 9747 i, Py_TYPE(item)->tp_name); 9748 goto onError; 9749 } 9750 if (PyUnicode_READY(item) == -1) 9751 goto onError; 9752 sz += PyUnicode_GET_LENGTH(item); 9753 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9754 maxchar = Py_MAX(maxchar, item_maxchar); 9755 if (i != 0) 9756 sz += seplen; 9757 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9758 PyErr_SetString(PyExc_OverflowError, 9759 "join() result is too long for a Python string"); 9760 goto onError; 9761 } 9762 if (use_memcpy && last_obj != NULL) { 9763 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9764 use_memcpy = 0; 9765 } 9766 last_obj = item; 9767 } 9768 9769 res = PyUnicode_New(sz, maxchar); 9770 if (res == NULL) 9771 goto onError; 9772 9773 /* Catenate everything. */ 9774#ifdef Py_DEBUG 9775 use_memcpy = 0; 9776#else 9777 if (use_memcpy) { 9778 res_data = PyUnicode_1BYTE_DATA(res); 9779 kind = PyUnicode_KIND(res); 9780 if (seplen != 0) 9781 sep_data = PyUnicode_1BYTE_DATA(sep); 9782 } 9783#endif 9784 if (use_memcpy) { 9785 for (i = 0; i < seqlen; ++i) { 9786 Py_ssize_t itemlen; 9787 item = items[i]; 9788 9789 /* Copy item, and maybe the separator. 
*/ 9790 if (i && seplen != 0) { 9791 Py_MEMCPY(res_data, 9792 sep_data, 9793 kind * seplen); 9794 res_data += kind * seplen; 9795 } 9796 9797 itemlen = PyUnicode_GET_LENGTH(item); 9798 if (itemlen != 0) { 9799 Py_MEMCPY(res_data, 9800 PyUnicode_DATA(item), 9801 kind * itemlen); 9802 res_data += kind * itemlen; 9803 } 9804 } 9805 assert(res_data == PyUnicode_1BYTE_DATA(res) 9806 + kind * PyUnicode_GET_LENGTH(res)); 9807 } 9808 else { 9809 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9810 Py_ssize_t itemlen; 9811 item = items[i]; 9812 9813 /* Copy item, and maybe the separator. */ 9814 if (i && seplen != 0) { 9815 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9816 res_offset += seplen; 9817 } 9818 9819 itemlen = PyUnicode_GET_LENGTH(item); 9820 if (itemlen != 0) { 9821 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9822 res_offset += itemlen; 9823 } 9824 } 9825 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9826 } 9827 9828 Py_DECREF(fseq); 9829 Py_XDECREF(sep); 9830 assert(_PyUnicode_CheckConsistency(res, 1)); 9831 return res; 9832 9833 onError: 9834 Py_DECREF(fseq); 9835 Py_XDECREF(sep); 9836 Py_XDECREF(res); 9837 return NULL; 9838} 9839 9840#define FILL(kind, data, value, start, length) \ 9841 do { \ 9842 Py_ssize_t i_ = 0; \ 9843 assert(kind != PyUnicode_WCHAR_KIND); \ 9844 switch ((kind)) { \ 9845 case PyUnicode_1BYTE_KIND: { \ 9846 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9847 memset(to_, (unsigned char)value, (length)); \ 9848 break; \ 9849 } \ 9850 case PyUnicode_2BYTE_KIND: { \ 9851 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9852 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9853 break; \ 9854 } \ 9855 case PyUnicode_4BYTE_KIND: { \ 9856 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9857 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9858 break; \ 9859 } \ 9860 default: assert(0); \ 9861 } \ 9862 } while (0) 9863 9864void 9865_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t 
start, Py_ssize_t length, 9866 Py_UCS4 fill_char) 9867{ 9868 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9869 const void *data = PyUnicode_DATA(unicode); 9870 assert(PyUnicode_IS_READY(unicode)); 9871 assert(unicode_modifiable(unicode)); 9872 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9873 assert(start >= 0); 9874 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9875 FILL(kind, data, fill_char, start, length); 9876} 9877 9878Py_ssize_t 9879PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9880 Py_UCS4 fill_char) 9881{ 9882 Py_ssize_t maxlen; 9883 9884 if (!PyUnicode_Check(unicode)) { 9885 PyErr_BadInternalCall(); 9886 return -1; 9887 } 9888 if (PyUnicode_READY(unicode) == -1) 9889 return -1; 9890 if (unicode_check_modifiable(unicode)) 9891 return -1; 9892 9893 if (start < 0) { 9894 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9895 return -1; 9896 } 9897 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9898 PyErr_SetString(PyExc_ValueError, 9899 "fill character is bigger than " 9900 "the string maximum character"); 9901 return -1; 9902 } 9903 9904 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9905 length = Py_MIN(maxlen, length); 9906 if (length <= 0) 9907 return 0; 9908 9909 _PyUnicode_FastFill(unicode, start, length, fill_char); 9910 return length; 9911} 9912 9913static PyObject * 9914pad(PyObject *self, 9915 Py_ssize_t left, 9916 Py_ssize_t right, 9917 Py_UCS4 fill) 9918{ 9919 PyObject *u; 9920 Py_UCS4 maxchar; 9921 int kind; 9922 void *data; 9923 9924 if (left < 0) 9925 left = 0; 9926 if (right < 0) 9927 right = 0; 9928 9929 if (left == 0 && right == 0) 9930 return unicode_result_unchanged(self); 9931 9932 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9933 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9934 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9935 return NULL; 9936 } 9937 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9938 maxchar = 
Py_MAX(maxchar, fill); 9939 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9940 if (!u) 9941 return NULL; 9942 9943 kind = PyUnicode_KIND(u); 9944 data = PyUnicode_DATA(u); 9945 if (left) 9946 FILL(kind, data, fill, 0, left); 9947 if (right) 9948 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9949 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9950 assert(_PyUnicode_CheckConsistency(u, 1)); 9951 return u; 9952} 9953 9954PyObject * 9955PyUnicode_Splitlines(PyObject *string, int keepends) 9956{ 9957 PyObject *list; 9958 9959 string = PyUnicode_FromObject(string); 9960 if (string == NULL) 9961 return NULL; 9962 if (PyUnicode_READY(string) == -1) { 9963 Py_DECREF(string); 9964 return NULL; 9965 } 9966 9967 switch (PyUnicode_KIND(string)) { 9968 case PyUnicode_1BYTE_KIND: 9969 if (PyUnicode_IS_ASCII(string)) 9970 list = asciilib_splitlines( 9971 string, PyUnicode_1BYTE_DATA(string), 9972 PyUnicode_GET_LENGTH(string), keepends); 9973 else 9974 list = ucs1lib_splitlines( 9975 string, PyUnicode_1BYTE_DATA(string), 9976 PyUnicode_GET_LENGTH(string), keepends); 9977 break; 9978 case PyUnicode_2BYTE_KIND: 9979 list = ucs2lib_splitlines( 9980 string, PyUnicode_2BYTE_DATA(string), 9981 PyUnicode_GET_LENGTH(string), keepends); 9982 break; 9983 case PyUnicode_4BYTE_KIND: 9984 list = ucs4lib_splitlines( 9985 string, PyUnicode_4BYTE_DATA(string), 9986 PyUnicode_GET_LENGTH(string), keepends); 9987 break; 9988 default: 9989 assert(0); 9990 list = 0; 9991 } 9992 Py_DECREF(string); 9993 return list; 9994} 9995 9996static PyObject * 9997split(PyObject *self, 9998 PyObject *substring, 9999 Py_ssize_t maxcount) 10000{ 10001 int kind1, kind2; 10002 void *buf1, *buf2; 10003 Py_ssize_t len1, len2; 10004 PyObject* out; 10005 10006 if (maxcount < 0) 10007 maxcount = PY_SSIZE_T_MAX; 10008 10009 if (PyUnicode_READY(self) == -1) 10010 return NULL; 10011 10012 if (substring == NULL) 10013 switch (PyUnicode_KIND(self)) { 10014 case 
PyUnicode_1BYTE_KIND: 10015 if (PyUnicode_IS_ASCII(self)) 10016 return asciilib_split_whitespace( 10017 self, PyUnicode_1BYTE_DATA(self), 10018 PyUnicode_GET_LENGTH(self), maxcount 10019 ); 10020 else 10021 return ucs1lib_split_whitespace( 10022 self, PyUnicode_1BYTE_DATA(self), 10023 PyUnicode_GET_LENGTH(self), maxcount 10024 ); 10025 case PyUnicode_2BYTE_KIND: 10026 return ucs2lib_split_whitespace( 10027 self, PyUnicode_2BYTE_DATA(self), 10028 PyUnicode_GET_LENGTH(self), maxcount 10029 ); 10030 case PyUnicode_4BYTE_KIND: 10031 return ucs4lib_split_whitespace( 10032 self, PyUnicode_4BYTE_DATA(self), 10033 PyUnicode_GET_LENGTH(self), maxcount 10034 ); 10035 default: 10036 assert(0); 10037 return NULL; 10038 } 10039 10040 if (PyUnicode_READY(substring) == -1) 10041 return NULL; 10042 10043 kind1 = PyUnicode_KIND(self); 10044 kind2 = PyUnicode_KIND(substring); 10045 len1 = PyUnicode_GET_LENGTH(self); 10046 len2 = PyUnicode_GET_LENGTH(substring); 10047 if (kind1 < kind2 || len1 < len2) { 10048 out = PyList_New(1); 10049 if (out == NULL) 10050 return NULL; 10051 Py_INCREF(self); 10052 PyList_SET_ITEM(out, 0, self); 10053 return out; 10054 } 10055 buf1 = PyUnicode_DATA(self); 10056 buf2 = PyUnicode_DATA(substring); 10057 if (kind2 != kind1) { 10058 buf2 = _PyUnicode_AsKind(substring, kind1); 10059 if (!buf2) 10060 return NULL; 10061 } 10062 10063 switch (kind1) { 10064 case PyUnicode_1BYTE_KIND: 10065 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10066 out = asciilib_split( 10067 self, buf1, len1, buf2, len2, maxcount); 10068 else 10069 out = ucs1lib_split( 10070 self, buf1, len1, buf2, len2, maxcount); 10071 break; 10072 case PyUnicode_2BYTE_KIND: 10073 out = ucs2lib_split( 10074 self, buf1, len1, buf2, len2, maxcount); 10075 break; 10076 case PyUnicode_4BYTE_KIND: 10077 out = ucs4lib_split( 10078 self, buf1, len1, buf2, len2, maxcount); 10079 break; 10080 default: 10081 out = NULL; 10082 } 10083 if (kind2 != kind1) 10084 PyMem_Free(buf2); 10085 return 
out; 10086} 10087 10088static PyObject * 10089rsplit(PyObject *self, 10090 PyObject *substring, 10091 Py_ssize_t maxcount) 10092{ 10093 int kind1, kind2; 10094 void *buf1, *buf2; 10095 Py_ssize_t len1, len2; 10096 PyObject* out; 10097 10098 if (maxcount < 0) 10099 maxcount = PY_SSIZE_T_MAX; 10100 10101 if (PyUnicode_READY(self) == -1) 10102 return NULL; 10103 10104 if (substring == NULL) 10105 switch (PyUnicode_KIND(self)) { 10106 case PyUnicode_1BYTE_KIND: 10107 if (PyUnicode_IS_ASCII(self)) 10108 return asciilib_rsplit_whitespace( 10109 self, PyUnicode_1BYTE_DATA(self), 10110 PyUnicode_GET_LENGTH(self), maxcount 10111 ); 10112 else 10113 return ucs1lib_rsplit_whitespace( 10114 self, PyUnicode_1BYTE_DATA(self), 10115 PyUnicode_GET_LENGTH(self), maxcount 10116 ); 10117 case PyUnicode_2BYTE_KIND: 10118 return ucs2lib_rsplit_whitespace( 10119 self, PyUnicode_2BYTE_DATA(self), 10120 PyUnicode_GET_LENGTH(self), maxcount 10121 ); 10122 case PyUnicode_4BYTE_KIND: 10123 return ucs4lib_rsplit_whitespace( 10124 self, PyUnicode_4BYTE_DATA(self), 10125 PyUnicode_GET_LENGTH(self), maxcount 10126 ); 10127 default: 10128 assert(0); 10129 return NULL; 10130 } 10131 10132 if (PyUnicode_READY(substring) == -1) 10133 return NULL; 10134 10135 kind1 = PyUnicode_KIND(self); 10136 kind2 = PyUnicode_KIND(substring); 10137 len1 = PyUnicode_GET_LENGTH(self); 10138 len2 = PyUnicode_GET_LENGTH(substring); 10139 if (kind1 < kind2 || len1 < len2) { 10140 out = PyList_New(1); 10141 if (out == NULL) 10142 return NULL; 10143 Py_INCREF(self); 10144 PyList_SET_ITEM(out, 0, self); 10145 return out; 10146 } 10147 buf1 = PyUnicode_DATA(self); 10148 buf2 = PyUnicode_DATA(substring); 10149 if (kind2 != kind1) { 10150 buf2 = _PyUnicode_AsKind(substring, kind1); 10151 if (!buf2) 10152 return NULL; 10153 } 10154 10155 switch (kind1) { 10156 case PyUnicode_1BYTE_KIND: 10157 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10158 out = asciilib_rsplit( 10159 self, buf1, len1, buf2, len2, 
maxcount); 10160 else 10161 out = ucs1lib_rsplit( 10162 self, buf1, len1, buf2, len2, maxcount); 10163 break; 10164 case PyUnicode_2BYTE_KIND: 10165 out = ucs2lib_rsplit( 10166 self, buf1, len1, buf2, len2, maxcount); 10167 break; 10168 case PyUnicode_4BYTE_KIND: 10169 out = ucs4lib_rsplit( 10170 self, buf1, len1, buf2, len2, maxcount); 10171 break; 10172 default: 10173 out = NULL; 10174 } 10175 if (kind2 != kind1) 10176 PyMem_Free(buf2); 10177 return out; 10178} 10179 10180static Py_ssize_t 10181anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10182 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10183{ 10184 switch (kind) { 10185 case PyUnicode_1BYTE_KIND: 10186 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10187 return asciilib_find(buf1, len1, buf2, len2, offset); 10188 else 10189 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10190 case PyUnicode_2BYTE_KIND: 10191 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10192 case PyUnicode_4BYTE_KIND: 10193 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10194 } 10195 assert(0); 10196 return -1; 10197} 10198 10199static Py_ssize_t 10200anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10201 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10202{ 10203 switch (kind) { 10204 case PyUnicode_1BYTE_KIND: 10205 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10206 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10207 else 10208 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10209 case PyUnicode_2BYTE_KIND: 10210 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10211 case PyUnicode_4BYTE_KIND: 10212 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10213 } 10214 assert(0); 10215 return 0; 10216} 10217 10218static void 10219replace_1char_inplace(PyObject *u, Py_ssize_t pos, 10220 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) 10221{ 10222 int kind = PyUnicode_KIND(u); 10223 void *data = 
PyUnicode_DATA(u); 10224 Py_ssize_t len = PyUnicode_GET_LENGTH(u); 10225 if (kind == PyUnicode_1BYTE_KIND) { 10226 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos, 10227 (Py_UCS1 *)data + len, 10228 u1, u2, maxcount); 10229 } 10230 else if (kind == PyUnicode_2BYTE_KIND) { 10231 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos, 10232 (Py_UCS2 *)data + len, 10233 u1, u2, maxcount); 10234 } 10235 else { 10236 assert(kind == PyUnicode_4BYTE_KIND); 10237 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos, 10238 (Py_UCS4 *)data + len, 10239 u1, u2, maxcount); 10240 } 10241} 10242 10243static PyObject * 10244replace(PyObject *self, PyObject *str1, 10245 PyObject *str2, Py_ssize_t maxcount) 10246{ 10247 PyObject *u; 10248 char *sbuf = PyUnicode_DATA(self); 10249 char *buf1 = PyUnicode_DATA(str1); 10250 char *buf2 = PyUnicode_DATA(str2); 10251 int srelease = 0, release1 = 0, release2 = 0; 10252 int skind = PyUnicode_KIND(self); 10253 int kind1 = PyUnicode_KIND(str1); 10254 int kind2 = PyUnicode_KIND(str2); 10255 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10256 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10257 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10258 int mayshrink; 10259 Py_UCS4 maxchar, maxchar_str1, maxchar_str2; 10260 10261 if (maxcount < 0) 10262 maxcount = PY_SSIZE_T_MAX; 10263 else if (maxcount == 0 || slen == 0) 10264 goto nothing; 10265 10266 if (str1 == str2) 10267 goto nothing; 10268 10269 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10270 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1); 10271 if (maxchar < maxchar_str1) 10272 /* substring too wide to be present */ 10273 goto nothing; 10274 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10275 /* Replacing str1 with str2 may cause a maxchar reduction in the 10276 result string. 
*/ 10277 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1); 10278 maxchar = Py_MAX(maxchar, maxchar_str2); 10279 10280 if (len1 == len2) { 10281 /* same length */ 10282 if (len1 == 0) 10283 goto nothing; 10284 if (len1 == 1) { 10285 /* replace characters */ 10286 Py_UCS4 u1, u2; 10287 Py_ssize_t pos; 10288 10289 u1 = PyUnicode_READ(kind1, buf1, 0); 10290 pos = findchar(sbuf, skind, slen, u1, 1); 10291 if (pos < 0) 10292 goto nothing; 10293 u2 = PyUnicode_READ(kind2, buf2, 0); 10294 u = PyUnicode_New(slen, maxchar); 10295 if (!u) 10296 goto error; 10297 10298 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10299 replace_1char_inplace(u, pos, u1, u2, maxcount); 10300 } 10301 else { 10302 int rkind = skind; 10303 char *res; 10304 Py_ssize_t i; 10305 10306 if (kind1 < rkind) { 10307 /* widen substring */ 10308 buf1 = _PyUnicode_AsKind(str1, rkind); 10309 if (!buf1) goto error; 10310 release1 = 1; 10311 } 10312 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10313 if (i < 0) 10314 goto nothing; 10315 if (rkind > kind2) { 10316 /* widen replacement */ 10317 buf2 = _PyUnicode_AsKind(str2, rkind); 10318 if (!buf2) goto error; 10319 release2 = 1; 10320 } 10321 else if (rkind < kind2) { 10322 /* widen self and buf1 */ 10323 rkind = kind2; 10324 if (release1) PyMem_Free(buf1); 10325 release1 = 0; 10326 sbuf = _PyUnicode_AsKind(self, rkind); 10327 if (!sbuf) goto error; 10328 srelease = 1; 10329 buf1 = _PyUnicode_AsKind(str1, rkind); 10330 if (!buf1) goto error; 10331 release1 = 1; 10332 } 10333 u = PyUnicode_New(slen, maxchar); 10334 if (!u) 10335 goto error; 10336 assert(PyUnicode_KIND(u) == rkind); 10337 res = PyUnicode_DATA(u); 10338 10339 memcpy(res, sbuf, rkind * slen); 10340 /* change everything in-place, starting with this one */ 10341 memcpy(res + rkind * i, 10342 buf2, 10343 rkind * len2); 10344 i += len1; 10345 10346 while ( --maxcount > 0) { 10347 i = anylib_find(rkind, self, 10348 sbuf+rkind*i, slen-i, 10349 str1, buf1, 
len1, i); 10350 if (i == -1) 10351 break; 10352 memcpy(res + rkind * i, 10353 buf2, 10354 rkind * len2); 10355 i += len1; 10356 } 10357 } 10358 } 10359 else { 10360 Py_ssize_t n, i, j, ires; 10361 Py_ssize_t new_size; 10362 int rkind = skind; 10363 char *res; 10364 10365 if (kind1 < rkind) { 10366 /* widen substring */ 10367 buf1 = _PyUnicode_AsKind(str1, rkind); 10368 if (!buf1) goto error; 10369 release1 = 1; 10370 } 10371 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10372 if (n == 0) 10373 goto nothing; 10374 if (kind2 < rkind) { 10375 /* widen replacement */ 10376 buf2 = _PyUnicode_AsKind(str2, rkind); 10377 if (!buf2) goto error; 10378 release2 = 1; 10379 } 10380 else if (kind2 > rkind) { 10381 /* widen self and buf1 */ 10382 rkind = kind2; 10383 sbuf = _PyUnicode_AsKind(self, rkind); 10384 if (!sbuf) goto error; 10385 srelease = 1; 10386 if (release1) PyMem_Free(buf1); 10387 release1 = 0; 10388 buf1 = _PyUnicode_AsKind(str1, rkind); 10389 if (!buf1) goto error; 10390 release1 = 1; 10391 } 10392 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10393 PyUnicode_GET_LENGTH(str1))); */ 10394 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10395 PyErr_SetString(PyExc_OverflowError, 10396 "replace string is too long"); 10397 goto error; 10398 } 10399 new_size = slen + n * (len2 - len1); 10400 if (new_size == 0) { 10401 _Py_INCREF_UNICODE_EMPTY(); 10402 if (!unicode_empty) 10403 goto error; 10404 u = unicode_empty; 10405 goto done; 10406 } 10407 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10408 PyErr_SetString(PyExc_OverflowError, 10409 "replace string is too long"); 10410 goto error; 10411 } 10412 u = PyUnicode_New(new_size, maxchar); 10413 if (!u) 10414 goto error; 10415 assert(PyUnicode_KIND(u) == rkind); 10416 res = PyUnicode_DATA(u); 10417 ires = i = 0; 10418 if (len1 > 0) { 10419 while (n-- > 0) { 10420 /* look for next match */ 10421 j = anylib_find(rkind, self, 10422 sbuf + rkind * i, 
slen-i, 10423 str1, buf1, len1, i); 10424 if (j == -1) 10425 break; 10426 else if (j > i) { 10427 /* copy unchanged part [i:j] */ 10428 memcpy(res + rkind * ires, 10429 sbuf + rkind * i, 10430 rkind * (j-i)); 10431 ires += j - i; 10432 } 10433 /* copy substitution string */ 10434 if (len2 > 0) { 10435 memcpy(res + rkind * ires, 10436 buf2, 10437 rkind * len2); 10438 ires += len2; 10439 } 10440 i = j + len1; 10441 } 10442 if (i < slen) 10443 /* copy tail [i:] */ 10444 memcpy(res + rkind * ires, 10445 sbuf + rkind * i, 10446 rkind * (slen-i)); 10447 } 10448 else { 10449 /* interleave */ 10450 while (n > 0) { 10451 memcpy(res + rkind * ires, 10452 buf2, 10453 rkind * len2); 10454 ires += len2; 10455 if (--n <= 0) 10456 break; 10457 memcpy(res + rkind * ires, 10458 sbuf + rkind * i, 10459 rkind); 10460 ires++; 10461 i++; 10462 } 10463 memcpy(res + rkind * ires, 10464 sbuf + rkind * i, 10465 rkind * (slen-i)); 10466 } 10467 } 10468 10469 if (mayshrink) { 10470 unicode_adjust_maxchar(&u); 10471 if (u == NULL) 10472 goto error; 10473 } 10474 10475 done: 10476 if (srelease) 10477 PyMem_FREE(sbuf); 10478 if (release1) 10479 PyMem_FREE(buf1); 10480 if (release2) 10481 PyMem_FREE(buf2); 10482 assert(_PyUnicode_CheckConsistency(u, 1)); 10483 return u; 10484 10485 nothing: 10486 /* nothing to replace; return original string (when possible) */ 10487 if (srelease) 10488 PyMem_FREE(sbuf); 10489 if (release1) 10490 PyMem_FREE(buf1); 10491 if (release2) 10492 PyMem_FREE(buf2); 10493 return unicode_result_unchanged(self); 10494 10495 error: 10496 if (srelease && sbuf) 10497 PyMem_FREE(sbuf); 10498 if (release1 && buf1) 10499 PyMem_FREE(buf1); 10500 if (release2 && buf2) 10501 PyMem_FREE(buf2); 10502 return NULL; 10503} 10504 10505/* --- Unicode Object Methods --------------------------------------------- */ 10506 10507PyDoc_STRVAR(title__doc__, 10508 "S.title() -> str\n\ 10509\n\ 10510Return a titlecased version of S, i.e. 
words start with title case\n\ 10511characters, all remaining cased characters have lower case."); 10512 10513static PyObject* 10514unicode_title(PyObject *self) 10515{ 10516 if (PyUnicode_READY(self) == -1) 10517 return NULL; 10518 return case_operation(self, do_title); 10519} 10520 10521PyDoc_STRVAR(capitalize__doc__, 10522 "S.capitalize() -> str\n\ 10523\n\ 10524Return a capitalized version of S, i.e. make the first character\n\ 10525have upper case and the rest lower case."); 10526 10527static PyObject* 10528unicode_capitalize(PyObject *self) 10529{ 10530 if (PyUnicode_READY(self) == -1) 10531 return NULL; 10532 if (PyUnicode_GET_LENGTH(self) == 0) 10533 return unicode_result_unchanged(self); 10534 return case_operation(self, do_capitalize); 10535} 10536 10537PyDoc_STRVAR(casefold__doc__, 10538 "S.casefold() -> str\n\ 10539\n\ 10540Return a version of S suitable for caseless comparisons."); 10541 10542static PyObject * 10543unicode_casefold(PyObject *self) 10544{ 10545 if (PyUnicode_READY(self) == -1) 10546 return NULL; 10547 if (PyUnicode_IS_ASCII(self)) 10548 return ascii_upper_or_lower(self, 1); 10549 return case_operation(self, do_casefold); 10550} 10551 10552 10553/* Argument converter. 
Coerces to a single unicode character */ 10554 10555static int 10556convert_uc(PyObject *obj, void *addr) 10557{ 10558 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10559 PyObject *uniobj; 10560 10561 uniobj = PyUnicode_FromObject(obj); 10562 if (uniobj == NULL) { 10563 PyErr_SetString(PyExc_TypeError, 10564 "The fill character cannot be converted to Unicode"); 10565 return 0; 10566 } 10567 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10568 PyErr_SetString(PyExc_TypeError, 10569 "The fill character must be exactly one character long"); 10570 Py_DECREF(uniobj); 10571 return 0; 10572 } 10573 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10574 Py_DECREF(uniobj); 10575 return 1; 10576} 10577 10578PyDoc_STRVAR(center__doc__, 10579 "S.center(width[, fillchar]) -> str\n\ 10580\n\ 10581Return S centered in a string of length width. Padding is\n\ 10582done using the specified fill character (default is a space)"); 10583 10584static PyObject * 10585unicode_center(PyObject *self, PyObject *args) 10586{ 10587 Py_ssize_t marg, left; 10588 Py_ssize_t width; 10589 Py_UCS4 fillchar = ' '; 10590 10591 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10592 return NULL; 10593 10594 if (PyUnicode_READY(self) == -1) 10595 return NULL; 10596 10597 if (PyUnicode_GET_LENGTH(self) >= width) 10598 return unicode_result_unchanged(self); 10599 10600 marg = width - PyUnicode_GET_LENGTH(self); 10601 left = marg / 2 + (marg & width & 1); 10602 10603 return pad(self, left, marg - left, fillchar); 10604} 10605 10606/* This function assumes that str1 and str2 are readied by the caller. */ 10607 10608static int 10609unicode_compare(PyObject *str1, PyObject *str2) 10610{ 10611#define COMPARE(TYPE1, TYPE2) \ 10612 do { \ 10613 TYPE1* p1 = (TYPE1 *)data1; \ 10614 TYPE2* p2 = (TYPE2 *)data2; \ 10615 TYPE1* end = p1 + len; \ 10616 Py_UCS4 c1, c2; \ 10617 for (; p1 != end; p1++, p2++) { \ 10618 c1 = *p1; \ 10619 c2 = *p2; \ 10620 if (c1 != c2) \ 10621 return (c1 < c2) ? 
-1 : 1; \ 10622 } \ 10623 } \ 10624 while (0) 10625 10626 int kind1, kind2; 10627 void *data1, *data2; 10628 Py_ssize_t len1, len2, len; 10629 10630 kind1 = PyUnicode_KIND(str1); 10631 kind2 = PyUnicode_KIND(str2); 10632 data1 = PyUnicode_DATA(str1); 10633 data2 = PyUnicode_DATA(str2); 10634 len1 = PyUnicode_GET_LENGTH(str1); 10635 len2 = PyUnicode_GET_LENGTH(str2); 10636 len = Py_MIN(len1, len2); 10637 10638 switch(kind1) { 10639 case PyUnicode_1BYTE_KIND: 10640 { 10641 switch(kind2) { 10642 case PyUnicode_1BYTE_KIND: 10643 { 10644 int cmp = memcmp(data1, data2, len); 10645 /* normalize result of memcmp() into the range [-1; 1] */ 10646 if (cmp < 0) 10647 return -1; 10648 if (cmp > 0) 10649 return 1; 10650 break; 10651 } 10652 case PyUnicode_2BYTE_KIND: 10653 COMPARE(Py_UCS1, Py_UCS2); 10654 break; 10655 case PyUnicode_4BYTE_KIND: 10656 COMPARE(Py_UCS1, Py_UCS4); 10657 break; 10658 default: 10659 assert(0); 10660 } 10661 break; 10662 } 10663 case PyUnicode_2BYTE_KIND: 10664 { 10665 switch(kind2) { 10666 case PyUnicode_1BYTE_KIND: 10667 COMPARE(Py_UCS2, Py_UCS1); 10668 break; 10669 case PyUnicode_2BYTE_KIND: 10670 { 10671 COMPARE(Py_UCS2, Py_UCS2); 10672 break; 10673 } 10674 case PyUnicode_4BYTE_KIND: 10675 COMPARE(Py_UCS2, Py_UCS4); 10676 break; 10677 default: 10678 assert(0); 10679 } 10680 break; 10681 } 10682 case PyUnicode_4BYTE_KIND: 10683 { 10684 switch(kind2) { 10685 case PyUnicode_1BYTE_KIND: 10686 COMPARE(Py_UCS4, Py_UCS1); 10687 break; 10688 case PyUnicode_2BYTE_KIND: 10689 COMPARE(Py_UCS4, Py_UCS2); 10690 break; 10691 case PyUnicode_4BYTE_KIND: 10692 { 10693#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4 10694 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len); 10695 /* normalize result of wmemcmp() into the range [-1; 1] */ 10696 if (cmp < 0) 10697 return -1; 10698 if (cmp > 0) 10699 return 1; 10700#else 10701 COMPARE(Py_UCS4, Py_UCS4); 10702#endif 10703 break; 10704 } 10705 default: 10706 assert(0); 10707 } 10708 break; 10709 } 10710 default: 
10711 assert(0); 10712 } 10713 10714 if (len1 == len2) 10715 return 0; 10716 if (len1 < len2) 10717 return -1; 10718 else 10719 return 1; 10720 10721#undef COMPARE 10722} 10723 10724Py_LOCAL(int) 10725unicode_compare_eq(PyObject *str1, PyObject *str2) 10726{ 10727 int kind; 10728 void *data1, *data2; 10729 Py_ssize_t len; 10730 int cmp; 10731 10732 len = PyUnicode_GET_LENGTH(str1); 10733 if (PyUnicode_GET_LENGTH(str2) != len) 10734 return 0; 10735 kind = PyUnicode_KIND(str1); 10736 if (PyUnicode_KIND(str2) != kind) 10737 return 0; 10738 data1 = PyUnicode_DATA(str1); 10739 data2 = PyUnicode_DATA(str2); 10740 10741 cmp = memcmp(data1, data2, len * kind); 10742 return (cmp == 0); 10743} 10744 10745 10746int 10747PyUnicode_Compare(PyObject *left, PyObject *right) 10748{ 10749 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10750 if (PyUnicode_READY(left) == -1 || 10751 PyUnicode_READY(right) == -1) 10752 return -1; 10753 10754 /* a string is equal to itself */ 10755 if (left == right) 10756 return 0; 10757 10758 return unicode_compare(left, right); 10759 } 10760 PyErr_Format(PyExc_TypeError, 10761 "Can't compare %.100s and %.100s", 10762 left->ob_type->tp_name, 10763 right->ob_type->tp_name); 10764 return -1; 10765} 10766 10767int 10768_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right) 10769{ 10770 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */ 10771 if (right_str == NULL) 10772 return -1; 10773 return PyUnicode_Compare(left, right_str); 10774} 10775 10776int 10777PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10778{ 10779 Py_ssize_t i; 10780 int kind; 10781 Py_UCS4 chr; 10782 10783 assert(_PyUnicode_CHECK(uni)); 10784 if (PyUnicode_READY(uni) == -1) 10785 return -1; 10786 kind = PyUnicode_KIND(uni); 10787 if (kind == PyUnicode_1BYTE_KIND) { 10788 const void *data = PyUnicode_1BYTE_DATA(uni); 10789 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni); 10790 size_t len, len2 = strlen(str); 10791 int cmp; 10792 10793 len = 
Py_MIN(len1, len2); 10794 cmp = memcmp(data, str, len); 10795 if (cmp != 0) { 10796 if (cmp < 0) 10797 return -1; 10798 else 10799 return 1; 10800 } 10801 if (len1 > len2) 10802 return 1; /* uni is longer */ 10803 if (len1 < len2) 10804 return -1; /* str is longer */ 10805 return 0; 10806 } 10807 else { 10808 void *data = PyUnicode_DATA(uni); 10809 /* Compare Unicode string and source character set string */ 10810 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10811 if (chr != (unsigned char)str[i]) 10812 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10813 /* This check keeps Python strings that end in '\0' from comparing equal 10814 to C strings identical up to that point. */ 10815 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10816 return 1; /* uni is longer */ 10817 if (str[i]) 10818 return -1; /* str is longer */ 10819 return 0; 10820 } 10821} 10822 10823 10824#define TEST_COND(cond) \ 10825 ((cond) ? Py_True : Py_False) 10826 10827PyObject * 10828PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10829{ 10830 int result; 10831 PyObject *v; 10832 10833 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 10834 Py_RETURN_NOTIMPLEMENTED; 10835 10836 if (PyUnicode_READY(left) == -1 || 10837 PyUnicode_READY(right) == -1) 10838 return NULL; 10839 10840 if (left == right) { 10841 switch (op) { 10842 case Py_EQ: 10843 case Py_LE: 10844 case Py_GE: 10845 /* a string is equal to itself */ 10846 v = Py_True; 10847 break; 10848 case Py_NE: 10849 case Py_LT: 10850 case Py_GT: 10851 v = Py_False; 10852 break; 10853 default: 10854 PyErr_BadArgument(); 10855 return NULL; 10856 } 10857 } 10858 else if (op == Py_EQ || op == Py_NE) { 10859 result = unicode_compare_eq(left, right); 10860 result ^= (op == Py_NE); 10861 v = TEST_COND(result); 10862 } 10863 else { 10864 result = unicode_compare(left, right); 10865 10866 /* Convert the return value to a Boolean */ 10867 switch (op) { 10868 case Py_LE: 10869 v = TEST_COND(result <= 0); 10870 break; 
10871 case Py_GE: 10872 v = TEST_COND(result >= 0); 10873 break; 10874 case Py_LT: 10875 v = TEST_COND(result == -1); 10876 break; 10877 case Py_GT: 10878 v = TEST_COND(result == 1); 10879 break; 10880 default: 10881 PyErr_BadArgument(); 10882 return NULL; 10883 } 10884 } 10885 Py_INCREF(v); 10886 return v; 10887} 10888 10889int 10890PyUnicode_Contains(PyObject *container, PyObject *element) 10891{ 10892 PyObject *str, *sub; 10893 int kind1, kind2; 10894 void *buf1, *buf2; 10895 Py_ssize_t len1, len2; 10896 int result; 10897 10898 /* Coerce the two arguments */ 10899 sub = PyUnicode_FromObject(element); 10900 if (!sub) { 10901 PyErr_Format(PyExc_TypeError, 10902 "'in <string>' requires string as left operand, not %s", 10903 element->ob_type->tp_name); 10904 return -1; 10905 } 10906 10907 str = PyUnicode_FromObject(container); 10908 if (!str) { 10909 Py_DECREF(sub); 10910 return -1; 10911 } 10912 10913 kind1 = PyUnicode_KIND(str); 10914 kind2 = PyUnicode_KIND(sub); 10915 if (kind1 < kind2) { 10916 Py_DECREF(sub); 10917 Py_DECREF(str); 10918 return 0; 10919 } 10920 len1 = PyUnicode_GET_LENGTH(str); 10921 len2 = PyUnicode_GET_LENGTH(sub); 10922 if (len1 < len2) { 10923 Py_DECREF(sub); 10924 Py_DECREF(str); 10925 return 0; 10926 } 10927 buf1 = PyUnicode_DATA(str); 10928 buf2 = PyUnicode_DATA(sub); 10929 if (len2 == 1) { 10930 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 10931 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1; 10932 Py_DECREF(sub); 10933 Py_DECREF(str); 10934 return result; 10935 } 10936 if (kind2 != kind1) { 10937 buf2 = _PyUnicode_AsKind(sub, kind1); 10938 if (!buf2) { 10939 Py_DECREF(sub); 10940 Py_DECREF(str); 10941 return -1; 10942 } 10943 } 10944 10945 switch (kind1) { 10946 case PyUnicode_1BYTE_KIND: 10947 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10948 break; 10949 case PyUnicode_2BYTE_KIND: 10950 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10951 break; 10952 case PyUnicode_4BYTE_KIND: 10953 result = 
ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10954 break; 10955 default: 10956 result = -1; 10957 assert(0); 10958 } 10959 10960 Py_DECREF(str); 10961 Py_DECREF(sub); 10962 10963 if (kind2 != kind1) 10964 PyMem_Free(buf2); 10965 10966 return result; 10967} 10968 10969/* Concat to string or Unicode object giving a new Unicode object. */ 10970 10971PyObject * 10972PyUnicode_Concat(PyObject *left, PyObject *right) 10973{ 10974 PyObject *u = NULL, *v = NULL, *w; 10975 Py_UCS4 maxchar, maxchar2; 10976 Py_ssize_t u_len, v_len, new_len; 10977 10978 /* Coerce the two arguments */ 10979 u = PyUnicode_FromObject(left); 10980 if (u == NULL) 10981 goto onError; 10982 v = PyUnicode_FromObject(right); 10983 if (v == NULL) 10984 goto onError; 10985 10986 /* Shortcuts */ 10987 if (v == unicode_empty) { 10988 Py_DECREF(v); 10989 return u; 10990 } 10991 if (u == unicode_empty) { 10992 Py_DECREF(u); 10993 return v; 10994 } 10995 10996 u_len = PyUnicode_GET_LENGTH(u); 10997 v_len = PyUnicode_GET_LENGTH(v); 10998 if (u_len > PY_SSIZE_T_MAX - v_len) { 10999 PyErr_SetString(PyExc_OverflowError, 11000 "strings are too large to concat"); 11001 goto onError; 11002 } 11003 new_len = u_len + v_len; 11004 11005 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 11006 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 11007 maxchar = Py_MAX(maxchar, maxchar2); 11008 11009 /* Concat the two Unicode strings */ 11010 w = PyUnicode_New(new_len, maxchar); 11011 if (w == NULL) 11012 goto onError; 11013 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 11014 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 11015 Py_DECREF(u); 11016 Py_DECREF(v); 11017 assert(_PyUnicode_CheckConsistency(w, 1)); 11018 return w; 11019 11020 onError: 11021 Py_XDECREF(u); 11022 Py_XDECREF(v); 11023 return NULL; 11024} 11025 11026void 11027PyUnicode_Append(PyObject **p_left, PyObject *right) 11028{ 11029 PyObject *left, *res; 11030 Py_UCS4 maxchar, maxchar2; 11031 Py_ssize_t left_len, right_len, new_len; 11032 11033 if (p_left == NULL) { 
11034 if (!PyErr_Occurred()) 11035 PyErr_BadInternalCall(); 11036 return; 11037 } 11038 left = *p_left; 11039 if (right == NULL || left == NULL 11040 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 11041 if (!PyErr_Occurred()) 11042 PyErr_BadInternalCall(); 11043 goto error; 11044 } 11045 11046 if (PyUnicode_READY(left) == -1) 11047 goto error; 11048 if (PyUnicode_READY(right) == -1) 11049 goto error; 11050 11051 /* Shortcuts */ 11052 if (left == unicode_empty) { 11053 Py_DECREF(left); 11054 Py_INCREF(right); 11055 *p_left = right; 11056 return; 11057 } 11058 if (right == unicode_empty) 11059 return; 11060 11061 left_len = PyUnicode_GET_LENGTH(left); 11062 right_len = PyUnicode_GET_LENGTH(right); 11063 if (left_len > PY_SSIZE_T_MAX - right_len) { 11064 PyErr_SetString(PyExc_OverflowError, 11065 "strings are too large to concat"); 11066 goto error; 11067 } 11068 new_len = left_len + right_len; 11069 11070 if (unicode_modifiable(left) 11071 && PyUnicode_CheckExact(right) 11072 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 11073 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 11074 to change the structure size, but characters are stored just after 11075 the structure, and so it requires to move all characters which is 11076 not so different than duplicating the string. 
*/ 11077 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 11078 { 11079 /* append inplace */ 11080 if (unicode_resize(p_left, new_len) != 0) 11081 goto error; 11082 11083 /* copy 'right' into the newly allocated area of 'left' */ 11084 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 11085 } 11086 else { 11087 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11088 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11089 maxchar = Py_MAX(maxchar, maxchar2); 11090 11091 /* Concat the two Unicode strings */ 11092 res = PyUnicode_New(new_len, maxchar); 11093 if (res == NULL) 11094 goto error; 11095 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 11096 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 11097 Py_DECREF(left); 11098 *p_left = res; 11099 } 11100 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 11101 return; 11102 11103error: 11104 Py_CLEAR(*p_left); 11105} 11106 11107void 11108PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 11109{ 11110 PyUnicode_Append(pleft, right); 11111 Py_XDECREF(right); 11112} 11113 11114PyDoc_STRVAR(count__doc__, 11115 "S.count(sub[, start[, end]]) -> int\n\ 11116\n\ 11117Return the number of non-overlapping occurrences of substring sub in\n\ 11118string S[start:end]. 
Optional arguments start and end are\n\ 11119interpreted as in slice notation."); 11120 11121static PyObject * 11122unicode_count(PyObject *self, PyObject *args) 11123{ 11124 PyObject *substring = NULL; /* initialize to fix a compiler warning */ 11125 Py_ssize_t start = 0; 11126 Py_ssize_t end = PY_SSIZE_T_MAX; 11127 PyObject *result; 11128 int kind1, kind2; 11129 void *buf1, *buf2; 11130 Py_ssize_t len1, len2, iresult; 11131 11132 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 11133 &start, &end)) 11134 return NULL; 11135 11136 kind1 = PyUnicode_KIND(self); 11137 kind2 = PyUnicode_KIND(substring); 11138 if (kind1 < kind2) { 11139 Py_DECREF(substring); 11140 return PyLong_FromLong(0); 11141 } 11142 len1 = PyUnicode_GET_LENGTH(self); 11143 len2 = PyUnicode_GET_LENGTH(substring); 11144 ADJUST_INDICES(start, end, len1); 11145 if (end - start < len2) { 11146 Py_DECREF(substring); 11147 return PyLong_FromLong(0); 11148 } 11149 buf1 = PyUnicode_DATA(self); 11150 buf2 = PyUnicode_DATA(substring); 11151 if (kind2 != kind1) { 11152 buf2 = _PyUnicode_AsKind(substring, kind1); 11153 if (!buf2) { 11154 Py_DECREF(substring); 11155 return NULL; 11156 } 11157 } 11158 switch (kind1) { 11159 case PyUnicode_1BYTE_KIND: 11160 iresult = ucs1lib_count( 11161 ((Py_UCS1*)buf1) + start, end - start, 11162 buf2, len2, PY_SSIZE_T_MAX 11163 ); 11164 break; 11165 case PyUnicode_2BYTE_KIND: 11166 iresult = ucs2lib_count( 11167 ((Py_UCS2*)buf1) + start, end - start, 11168 buf2, len2, PY_SSIZE_T_MAX 11169 ); 11170 break; 11171 case PyUnicode_4BYTE_KIND: 11172 iresult = ucs4lib_count( 11173 ((Py_UCS4*)buf1) + start, end - start, 11174 buf2, len2, PY_SSIZE_T_MAX 11175 ); 11176 break; 11177 default: 11178 assert(0); iresult = 0; 11179 } 11180 11181 result = PyLong_FromSsize_t(iresult); 11182 11183 if (kind2 != kind1) 11184 PyMem_Free(buf2); 11185 11186 Py_DECREF(substring); 11187 11188 return result; 11189} 11190 11191PyDoc_STRVAR(encode__doc__, 11192 "S.encode(encoding='utf-8', 
errors='strict') -> bytes\n\ 11193\n\ 11194Encode S using the codec registered for encoding. Default encoding\n\ 11195is 'utf-8'. errors may be given to set a different error\n\ 11196handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 11197a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 11198'xmlcharrefreplace' as well as any other name registered with\n\ 11199codecs.register_error that can handle UnicodeEncodeErrors."); 11200 11201static PyObject * 11202unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 11203{ 11204 static char *kwlist[] = {"encoding", "errors", 0}; 11205 char *encoding = NULL; 11206 char *errors = NULL; 11207 11208 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 11209 kwlist, &encoding, &errors)) 11210 return NULL; 11211 return PyUnicode_AsEncodedString(self, encoding, errors); 11212} 11213 11214PyDoc_STRVAR(expandtabs__doc__, 11215 "S.expandtabs(tabsize=8) -> str\n\ 11216\n\ 11217Return a copy of S where all tab characters are expanded using spaces.\n\ 11218If tabsize is not given, a tab size of 8 characters is assumed."); 11219 11220static PyObject* 11221unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds) 11222{ 11223 Py_ssize_t i, j, line_pos, src_len, incr; 11224 Py_UCS4 ch; 11225 PyObject *u; 11226 void *src_data, *dest_data; 11227 static char *kwlist[] = {"tabsize", 0}; 11228 int tabsize = 8; 11229 int kind; 11230 int found; 11231 11232 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs", 11233 kwlist, &tabsize)) 11234 return NULL; 11235 11236 if (PyUnicode_READY(self) == -1) 11237 return NULL; 11238 11239 /* First pass: determine size of output string */ 11240 src_len = PyUnicode_GET_LENGTH(self); 11241 i = j = line_pos = 0; 11242 kind = PyUnicode_KIND(self); 11243 src_data = PyUnicode_DATA(self); 11244 found = 0; 11245 for (; i < src_len; i++) { 11246 ch = PyUnicode_READ(kind, src_data, i); 11247 if (ch == '\t') { 11248 found = 1; 11249 if 
(tabsize > 0) { 11250 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11251 if (j > PY_SSIZE_T_MAX - incr) 11252 goto overflow; 11253 line_pos += incr; 11254 j += incr; 11255 } 11256 } 11257 else { 11258 if (j > PY_SSIZE_T_MAX - 1) 11259 goto overflow; 11260 line_pos++; 11261 j++; 11262 if (ch == '\n' || ch == '\r') 11263 line_pos = 0; 11264 } 11265 } 11266 if (!found) 11267 return unicode_result_unchanged(self); 11268 11269 /* Second pass: create output string and fill it */ 11270 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11271 if (!u) 11272 return NULL; 11273 dest_data = PyUnicode_DATA(u); 11274 11275 i = j = line_pos = 0; 11276 11277 for (; i < src_len; i++) { 11278 ch = PyUnicode_READ(kind, src_data, i); 11279 if (ch == '\t') { 11280 if (tabsize > 0) { 11281 incr = tabsize - (line_pos % tabsize); 11282 line_pos += incr; 11283 FILL(kind, dest_data, ' ', j, incr); 11284 j += incr; 11285 } 11286 } 11287 else { 11288 line_pos++; 11289 PyUnicode_WRITE(kind, dest_data, j, ch); 11290 j++; 11291 if (ch == '\n' || ch == '\r') 11292 line_pos = 0; 11293 } 11294 } 11295 assert (j == PyUnicode_GET_LENGTH(u)); 11296 return unicode_result(u); 11297 11298 overflow: 11299 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11300 return NULL; 11301} 11302 11303PyDoc_STRVAR(find__doc__, 11304 "S.find(sub[, start[, end]]) -> int\n\ 11305\n\ 11306Return the lowest index in S where substring sub is found,\n\ 11307such that sub is contained within S[start:end]. 
Optional\n\ 11308arguments start and end are interpreted as in slice notation.\n\ 11309\n\ 11310Return -1 on failure."); 11311 11312static PyObject * 11313unicode_find(PyObject *self, PyObject *args) 11314{ 11315 /* initialize variables to prevent gcc warning */ 11316 PyObject *substring = NULL; 11317 Py_ssize_t start = 0; 11318 Py_ssize_t end = 0; 11319 Py_ssize_t result; 11320 11321 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 11322 &start, &end)) 11323 return NULL; 11324 11325 if (PyUnicode_READY(self) == -1) { 11326 Py_DECREF(substring); 11327 return NULL; 11328 } 11329 if (PyUnicode_READY(substring) == -1) { 11330 Py_DECREF(substring); 11331 return NULL; 11332 } 11333 11334 result = any_find_slice(1, self, substring, start, end); 11335 11336 Py_DECREF(substring); 11337 11338 if (result == -2) 11339 return NULL; 11340 11341 return PyLong_FromSsize_t(result); 11342} 11343 11344static PyObject * 11345unicode_getitem(PyObject *self, Py_ssize_t index) 11346{ 11347 void *data; 11348 enum PyUnicode_Kind kind; 11349 Py_UCS4 ch; 11350 11351 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 11352 PyErr_BadArgument(); 11353 return NULL; 11354 } 11355 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11356 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11357 return NULL; 11358 } 11359 kind = PyUnicode_KIND(self); 11360 data = PyUnicode_DATA(self); 11361 ch = PyUnicode_READ(kind, data, index); 11362 return unicode_char(ch); 11363} 11364 11365/* Believe it or not, this produces the same value for ASCII strings 11366 as bytes_hash(). */ 11367static Py_hash_t 11368unicode_hash(PyObject *self) 11369{ 11370 Py_ssize_t len; 11371 Py_uhash_t x; /* Unsigned for defined overflow behavior. 
*/ 11372 11373#ifdef Py_DEBUG 11374 assert(_Py_HashSecret_Initialized); 11375#endif 11376 if (_PyUnicode_HASH(self) != -1) 11377 return _PyUnicode_HASH(self); 11378 if (PyUnicode_READY(self) == -1) 11379 return -1; 11380 len = PyUnicode_GET_LENGTH(self); 11381 /* 11382 We make the hash of the empty string be 0, rather than using 11383 (prefix ^ suffix), since this slightly obfuscates the hash secret 11384 */ 11385 if (len == 0) { 11386 _PyUnicode_HASH(self) = 0; 11387 return 0; 11388 } 11389 x = _Py_HashBytes(PyUnicode_DATA(self), 11390 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); 11391 _PyUnicode_HASH(self) = x; 11392 return x; 11393} 11394 11395PyDoc_STRVAR(index__doc__, 11396 "S.index(sub[, start[, end]]) -> int\n\ 11397\n\ 11398Like S.find() but raise ValueError when the substring is not found."); 11399 11400static PyObject * 11401unicode_index(PyObject *self, PyObject *args) 11402{ 11403 /* initialize variables to prevent gcc warning */ 11404 Py_ssize_t result; 11405 PyObject *substring = NULL; 11406 Py_ssize_t start = 0; 11407 Py_ssize_t end = 0; 11408 11409 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11410 &start, &end)) 11411 return NULL; 11412 11413 if (PyUnicode_READY(self) == -1) { 11414 Py_DECREF(substring); 11415 return NULL; 11416 } 11417 if (PyUnicode_READY(substring) == -1) { 11418 Py_DECREF(substring); 11419 return NULL; 11420 } 11421 11422 result = any_find_slice(1, self, substring, start, end); 11423 11424 Py_DECREF(substring); 11425 11426 if (result == -2) 11427 return NULL; 11428 11429 if (result < 0) { 11430 PyErr_SetString(PyExc_ValueError, "substring not found"); 11431 return NULL; 11432 } 11433 11434 return PyLong_FromSsize_t(result); 11435} 11436 11437PyDoc_STRVAR(islower__doc__, 11438 "S.islower() -> bool\n\ 11439\n\ 11440Return True if all cased characters in S are lowercase and there is\n\ 11441at least one cased character in S, False otherwise."); 11442 11443static PyObject* 11444unicode_islower(PyObject 
*self) 11445{ 11446 Py_ssize_t i, length; 11447 int kind; 11448 void *data; 11449 int cased; 11450 11451 if (PyUnicode_READY(self) == -1) 11452 return NULL; 11453 length = PyUnicode_GET_LENGTH(self); 11454 kind = PyUnicode_KIND(self); 11455 data = PyUnicode_DATA(self); 11456 11457 /* Shortcut for single character strings */ 11458 if (length == 1) 11459 return PyBool_FromLong( 11460 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11461 11462 /* Special case for empty strings */ 11463 if (length == 0) 11464 return PyBool_FromLong(0); 11465 11466 cased = 0; 11467 for (i = 0; i < length; i++) { 11468 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11469 11470 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11471 return PyBool_FromLong(0); 11472 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11473 cased = 1; 11474 } 11475 return PyBool_FromLong(cased); 11476} 11477 11478PyDoc_STRVAR(isupper__doc__, 11479 "S.isupper() -> bool\n\ 11480\n\ 11481Return True if all cased characters in S are uppercase and there is\n\ 11482at least one cased character in S, False otherwise."); 11483 11484static PyObject* 11485unicode_isupper(PyObject *self) 11486{ 11487 Py_ssize_t i, length; 11488 int kind; 11489 void *data; 11490 int cased; 11491 11492 if (PyUnicode_READY(self) == -1) 11493 return NULL; 11494 length = PyUnicode_GET_LENGTH(self); 11495 kind = PyUnicode_KIND(self); 11496 data = PyUnicode_DATA(self); 11497 11498 /* Shortcut for single character strings */ 11499 if (length == 1) 11500 return PyBool_FromLong( 11501 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11502 11503 /* Special case for empty strings */ 11504 if (length == 0) 11505 return PyBool_FromLong(0); 11506 11507 cased = 0; 11508 for (i = 0; i < length; i++) { 11509 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11510 11511 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11512 return PyBool_FromLong(0); 11513 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11514 cased = 1; 11515 } 11516 return 
PyBool_FromLong(cased); 11517} 11518 11519PyDoc_STRVAR(istitle__doc__, 11520 "S.istitle() -> bool\n\ 11521\n\ 11522Return True if S is a titlecased string and there is at least one\n\ 11523character in S, i.e. upper- and titlecase characters may only\n\ 11524follow uncased characters and lowercase characters only cased ones.\n\ 11525Return False otherwise."); 11526 11527static PyObject* 11528unicode_istitle(PyObject *self) 11529{ 11530 Py_ssize_t i, length; 11531 int kind; 11532 void *data; 11533 int cased, previous_is_cased; 11534 11535 if (PyUnicode_READY(self) == -1) 11536 return NULL; 11537 length = PyUnicode_GET_LENGTH(self); 11538 kind = PyUnicode_KIND(self); 11539 data = PyUnicode_DATA(self); 11540 11541 /* Shortcut for single character strings */ 11542 if (length == 1) { 11543 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11544 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11545 (Py_UNICODE_ISUPPER(ch) != 0)); 11546 } 11547 11548 /* Special case for empty strings */ 11549 if (length == 0) 11550 return PyBool_FromLong(0); 11551 11552 cased = 0; 11553 previous_is_cased = 0; 11554 for (i = 0; i < length; i++) { 11555 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11556 11557 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11558 if (previous_is_cased) 11559 return PyBool_FromLong(0); 11560 previous_is_cased = 1; 11561 cased = 1; 11562 } 11563 else if (Py_UNICODE_ISLOWER(ch)) { 11564 if (!previous_is_cased) 11565 return PyBool_FromLong(0); 11566 previous_is_cased = 1; 11567 cased = 1; 11568 } 11569 else 11570 previous_is_cased = 0; 11571 } 11572 return PyBool_FromLong(cased); 11573} 11574 11575PyDoc_STRVAR(isspace__doc__, 11576 "S.isspace() -> bool\n\ 11577\n\ 11578Return True if all characters in S are whitespace\n\ 11579and there is at least one character in S, False otherwise."); 11580 11581static PyObject* 11582unicode_isspace(PyObject *self) 11583{ 11584 Py_ssize_t i, length; 11585 int kind; 11586 void *data; 11587 11588 if 
(PyUnicode_READY(self) == -1) 11589 return NULL; 11590 length = PyUnicode_GET_LENGTH(self); 11591 kind = PyUnicode_KIND(self); 11592 data = PyUnicode_DATA(self); 11593 11594 /* Shortcut for single character strings */ 11595 if (length == 1) 11596 return PyBool_FromLong( 11597 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11598 11599 /* Special case for empty strings */ 11600 if (length == 0) 11601 return PyBool_FromLong(0); 11602 11603 for (i = 0; i < length; i++) { 11604 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11605 if (!Py_UNICODE_ISSPACE(ch)) 11606 return PyBool_FromLong(0); 11607 } 11608 return PyBool_FromLong(1); 11609} 11610 11611PyDoc_STRVAR(isalpha__doc__, 11612 "S.isalpha() -> bool\n\ 11613\n\ 11614Return True if all characters in S are alphabetic\n\ 11615and there is at least one character in S, False otherwise."); 11616 11617static PyObject* 11618unicode_isalpha(PyObject *self) 11619{ 11620 Py_ssize_t i, length; 11621 int kind; 11622 void *data; 11623 11624 if (PyUnicode_READY(self) == -1) 11625 return NULL; 11626 length = PyUnicode_GET_LENGTH(self); 11627 kind = PyUnicode_KIND(self); 11628 data = PyUnicode_DATA(self); 11629 11630 /* Shortcut for single character strings */ 11631 if (length == 1) 11632 return PyBool_FromLong( 11633 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11634 11635 /* Special case for empty strings */ 11636 if (length == 0) 11637 return PyBool_FromLong(0); 11638 11639 for (i = 0; i < length; i++) { 11640 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11641 return PyBool_FromLong(0); 11642 } 11643 return PyBool_FromLong(1); 11644} 11645 11646PyDoc_STRVAR(isalnum__doc__, 11647 "S.isalnum() -> bool\n\ 11648\n\ 11649Return True if all characters in S are alphanumeric\n\ 11650and there is at least one character in S, False otherwise."); 11651 11652static PyObject* 11653unicode_isalnum(PyObject *self) 11654{ 11655 int kind; 11656 void *data; 11657 Py_ssize_t len, i; 11658 11659 if (PyUnicode_READY(self) == 
-1) 11660 return NULL; 11661 11662 kind = PyUnicode_KIND(self); 11663 data = PyUnicode_DATA(self); 11664 len = PyUnicode_GET_LENGTH(self); 11665 11666 /* Shortcut for single character strings */ 11667 if (len == 1) { 11668 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11669 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11670 } 11671 11672 /* Special case for empty strings */ 11673 if (len == 0) 11674 return PyBool_FromLong(0); 11675 11676 for (i = 0; i < len; i++) { 11677 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11678 if (!Py_UNICODE_ISALNUM(ch)) 11679 return PyBool_FromLong(0); 11680 } 11681 return PyBool_FromLong(1); 11682} 11683 11684PyDoc_STRVAR(isdecimal__doc__, 11685 "S.isdecimal() -> bool\n\ 11686\n\ 11687Return True if there are only decimal characters in S,\n\ 11688False otherwise."); 11689 11690static PyObject* 11691unicode_isdecimal(PyObject *self) 11692{ 11693 Py_ssize_t i, length; 11694 int kind; 11695 void *data; 11696 11697 if (PyUnicode_READY(self) == -1) 11698 return NULL; 11699 length = PyUnicode_GET_LENGTH(self); 11700 kind = PyUnicode_KIND(self); 11701 data = PyUnicode_DATA(self); 11702 11703 /* Shortcut for single character strings */ 11704 if (length == 1) 11705 return PyBool_FromLong( 11706 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11707 11708 /* Special case for empty strings */ 11709 if (length == 0) 11710 return PyBool_FromLong(0); 11711 11712 for (i = 0; i < length; i++) { 11713 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11714 return PyBool_FromLong(0); 11715 } 11716 return PyBool_FromLong(1); 11717} 11718 11719PyDoc_STRVAR(isdigit__doc__, 11720 "S.isdigit() -> bool\n\ 11721\n\ 11722Return True if all characters in S are digits\n\ 11723and there is at least one character in S, False otherwise."); 11724 11725static PyObject* 11726unicode_isdigit(PyObject *self) 11727{ 11728 Py_ssize_t i, length; 11729 int kind; 11730 void *data; 11731 11732 if (PyUnicode_READY(self) == -1) 11733 return NULL; 11734 
length = PyUnicode_GET_LENGTH(self); 11735 kind = PyUnicode_KIND(self); 11736 data = PyUnicode_DATA(self); 11737 11738 /* Shortcut for single character strings */ 11739 if (length == 1) { 11740 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11741 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11742 } 11743 11744 /* Special case for empty strings */ 11745 if (length == 0) 11746 return PyBool_FromLong(0); 11747 11748 for (i = 0; i < length; i++) { 11749 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11750 return PyBool_FromLong(0); 11751 } 11752 return PyBool_FromLong(1); 11753} 11754 11755PyDoc_STRVAR(isnumeric__doc__, 11756 "S.isnumeric() -> bool\n\ 11757\n\ 11758Return True if there are only numeric characters in S,\n\ 11759False otherwise."); 11760 11761static PyObject* 11762unicode_isnumeric(PyObject *self) 11763{ 11764 Py_ssize_t i, length; 11765 int kind; 11766 void *data; 11767 11768 if (PyUnicode_READY(self) == -1) 11769 return NULL; 11770 length = PyUnicode_GET_LENGTH(self); 11771 kind = PyUnicode_KIND(self); 11772 data = PyUnicode_DATA(self); 11773 11774 /* Shortcut for single character strings */ 11775 if (length == 1) 11776 return PyBool_FromLong( 11777 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11778 11779 /* Special case for empty strings */ 11780 if (length == 0) 11781 return PyBool_FromLong(0); 11782 11783 for (i = 0; i < length; i++) { 11784 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11785 return PyBool_FromLong(0); 11786 } 11787 return PyBool_FromLong(1); 11788} 11789 11790int 11791PyUnicode_IsIdentifier(PyObject *self) 11792{ 11793 int kind; 11794 void *data; 11795 Py_ssize_t i; 11796 Py_UCS4 first; 11797 11798 if (PyUnicode_READY(self) == -1) { 11799 Py_FatalError("identifier not ready"); 11800 return 0; 11801 } 11802 11803 /* Special case for empty strings */ 11804 if (PyUnicode_GET_LENGTH(self) == 0) 11805 return 0; 11806 kind = PyUnicode_KIND(self); 11807 data = PyUnicode_DATA(self); 11808 11809 /* PEP 
3131 says that the first character must be in 11810 XID_Start and subsequent characters in XID_Continue, 11811 and for the ASCII range, the 2.x rules apply (i.e 11812 start with letters and underscore, continue with 11813 letters, digits, underscore). However, given the current 11814 definition of XID_Start and XID_Continue, it is sufficient 11815 to check just for these, except that _ must be allowed 11816 as starting an identifier. */ 11817 first = PyUnicode_READ(kind, data, 0); 11818 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11819 return 0; 11820 11821 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11822 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11823 return 0; 11824 return 1; 11825} 11826 11827PyDoc_STRVAR(isidentifier__doc__, 11828 "S.isidentifier() -> bool\n\ 11829\n\ 11830Return True if S is a valid identifier according\n\ 11831to the language definition.\n\ 11832\n\ 11833Use keyword.iskeyword() to test for reserved identifiers\n\ 11834such as \"def\" and \"class\".\n"); 11835 11836static PyObject* 11837unicode_isidentifier(PyObject *self) 11838{ 11839 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11840} 11841 11842PyDoc_STRVAR(isprintable__doc__, 11843 "S.isprintable() -> bool\n\ 11844\n\ 11845Return True if all characters in S are considered\n\ 11846printable in repr() or S is empty, False otherwise."); 11847 11848static PyObject* 11849unicode_isprintable(PyObject *self) 11850{ 11851 Py_ssize_t i, length; 11852 int kind; 11853 void *data; 11854 11855 if (PyUnicode_READY(self) == -1) 11856 return NULL; 11857 length = PyUnicode_GET_LENGTH(self); 11858 kind = PyUnicode_KIND(self); 11859 data = PyUnicode_DATA(self); 11860 11861 /* Shortcut for single character strings */ 11862 if (length == 1) 11863 return PyBool_FromLong( 11864 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11865 11866 for (i = 0; i < length; i++) { 11867 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11868 
Py_RETURN_FALSE; 11869 } 11870 } 11871 Py_RETURN_TRUE; 11872} 11873 11874PyDoc_STRVAR(join__doc__, 11875 "S.join(iterable) -> str\n\ 11876\n\ 11877Return a string which is the concatenation of the strings in the\n\ 11878iterable. The separator between elements is S."); 11879 11880static PyObject* 11881unicode_join(PyObject *self, PyObject *data) 11882{ 11883 return PyUnicode_Join(self, data); 11884} 11885 11886static Py_ssize_t 11887unicode_length(PyObject *self) 11888{ 11889 if (PyUnicode_READY(self) == -1) 11890 return -1; 11891 return PyUnicode_GET_LENGTH(self); 11892} 11893 11894PyDoc_STRVAR(ljust__doc__, 11895 "S.ljust(width[, fillchar]) -> str\n\ 11896\n\ 11897Return S left-justified in a Unicode string of length width. Padding is\n\ 11898done using the specified fill character (default is a space)."); 11899 11900static PyObject * 11901unicode_ljust(PyObject *self, PyObject *args) 11902{ 11903 Py_ssize_t width; 11904 Py_UCS4 fillchar = ' '; 11905 11906 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11907 return NULL; 11908 11909 if (PyUnicode_READY(self) == -1) 11910 return NULL; 11911 11912 if (PyUnicode_GET_LENGTH(self) >= width) 11913 return unicode_result_unchanged(self); 11914 11915 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11916} 11917 11918PyDoc_STRVAR(lower__doc__, 11919 "S.lower() -> str\n\ 11920\n\ 11921Return a copy of the string S converted to lowercase."); 11922 11923static PyObject* 11924unicode_lower(PyObject *self) 11925{ 11926 if (PyUnicode_READY(self) == -1) 11927 return NULL; 11928 if (PyUnicode_IS_ASCII(self)) 11929 return ascii_upper_or_lower(self, 1); 11930 return case_operation(self, do_lower); 11931} 11932 11933#define LEFTSTRIP 0 11934#define RIGHTSTRIP 1 11935#define BOTHSTRIP 2 11936 11937/* Arrays indexed by above */ 11938static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11939 11940#define STRIPNAME(i) (stripformat[i]+3) 11941 11942/* externally visible 
for str.strip(unicode) */ 11943PyObject * 11944_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11945{ 11946 void *data; 11947 int kind; 11948 Py_ssize_t i, j, len; 11949 BLOOM_MASK sepmask; 11950 Py_ssize_t seplen; 11951 11952 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11953 return NULL; 11954 11955 kind = PyUnicode_KIND(self); 11956 data = PyUnicode_DATA(self); 11957 len = PyUnicode_GET_LENGTH(self); 11958 seplen = PyUnicode_GET_LENGTH(sepobj); 11959 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11960 PyUnicode_DATA(sepobj), 11961 seplen); 11962 11963 i = 0; 11964 if (striptype != RIGHTSTRIP) { 11965 while (i < len) { 11966 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11967 if (!BLOOM(sepmask, ch)) 11968 break; 11969 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 11970 break; 11971 i++; 11972 } 11973 } 11974 11975 j = len; 11976 if (striptype != LEFTSTRIP) { 11977 j--; 11978 while (j >= i) { 11979 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 11980 if (!BLOOM(sepmask, ch)) 11981 break; 11982 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 11983 break; 11984 j--; 11985 } 11986 11987 j++; 11988 } 11989 11990 return PyUnicode_Substring(self, i, j); 11991} 11992 11993PyObject* 11994PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11995{ 11996 unsigned char *data; 11997 int kind; 11998 Py_ssize_t length; 11999 12000 if (PyUnicode_READY(self) == -1) 12001 return NULL; 12002 12003 length = PyUnicode_GET_LENGTH(self); 12004 end = Py_MIN(end, length); 12005 12006 if (start == 0 && end == length) 12007 return unicode_result_unchanged(self); 12008 12009 if (start < 0 || end < 0) { 12010 PyErr_SetString(PyExc_IndexError, "string index out of range"); 12011 return NULL; 12012 } 12013 if (start >= length || end < start) 12014 _Py_RETURN_UNICODE_EMPTY(); 12015 12016 length = end - start; 12017 if (PyUnicode_IS_ASCII(self)) { 12018 data = PyUnicode_1BYTE_DATA(self); 12019 return 
_PyUnicode_FromASCII((char*)(data + start), length); 12020 } 12021 else { 12022 kind = PyUnicode_KIND(self); 12023 data = PyUnicode_1BYTE_DATA(self); 12024 return PyUnicode_FromKindAndData(kind, 12025 data + kind * start, 12026 length); 12027 } 12028} 12029 12030static PyObject * 12031do_strip(PyObject *self, int striptype) 12032{ 12033 Py_ssize_t len, i, j; 12034 12035 if (PyUnicode_READY(self) == -1) 12036 return NULL; 12037 12038 len = PyUnicode_GET_LENGTH(self); 12039 12040 if (PyUnicode_IS_ASCII(self)) { 12041 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 12042 12043 i = 0; 12044 if (striptype != RIGHTSTRIP) { 12045 while (i < len) { 12046 Py_UCS1 ch = data[i]; 12047 if (!_Py_ascii_whitespace[ch]) 12048 break; 12049 i++; 12050 } 12051 } 12052 12053 j = len; 12054 if (striptype != LEFTSTRIP) { 12055 j--; 12056 while (j >= i) { 12057 Py_UCS1 ch = data[j]; 12058 if (!_Py_ascii_whitespace[ch]) 12059 break; 12060 j--; 12061 } 12062 j++; 12063 } 12064 } 12065 else { 12066 int kind = PyUnicode_KIND(self); 12067 void *data = PyUnicode_DATA(self); 12068 12069 i = 0; 12070 if (striptype != RIGHTSTRIP) { 12071 while (i < len) { 12072 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12073 if (!Py_UNICODE_ISSPACE(ch)) 12074 break; 12075 i++; 12076 } 12077 } 12078 12079 j = len; 12080 if (striptype != LEFTSTRIP) { 12081 j--; 12082 while (j >= i) { 12083 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12084 if (!Py_UNICODE_ISSPACE(ch)) 12085 break; 12086 j--; 12087 } 12088 j++; 12089 } 12090 } 12091 12092 return PyUnicode_Substring(self, i, j); 12093} 12094 12095 12096static PyObject * 12097do_argstrip(PyObject *self, int striptype, PyObject *args) 12098{ 12099 PyObject *sep = NULL; 12100 12101 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep)) 12102 return NULL; 12103 12104 if (sep != NULL && sep != Py_None) { 12105 if (PyUnicode_Check(sep)) 12106 return _PyUnicode_XStrip(self, striptype, sep); 12107 else { 12108 PyErr_Format(PyExc_TypeError, 12109 "%s arg must be None or 
str", 12110 STRIPNAME(striptype)); 12111 return NULL; 12112 } 12113 } 12114 12115 return do_strip(self, striptype); 12116} 12117 12118 12119PyDoc_STRVAR(strip__doc__, 12120 "S.strip([chars]) -> str\n\ 12121\n\ 12122Return a copy of the string S with leading and trailing\n\ 12123whitespace removed.\n\ 12124If chars is given and not None, remove characters in chars instead."); 12125 12126static PyObject * 12127unicode_strip(PyObject *self, PyObject *args) 12128{ 12129 if (PyTuple_GET_SIZE(args) == 0) 12130 return do_strip(self, BOTHSTRIP); /* Common case */ 12131 else 12132 return do_argstrip(self, BOTHSTRIP, args); 12133} 12134 12135 12136PyDoc_STRVAR(lstrip__doc__, 12137 "S.lstrip([chars]) -> str\n\ 12138\n\ 12139Return a copy of the string S with leading whitespace removed.\n\ 12140If chars is given and not None, remove characters in chars instead."); 12141 12142static PyObject * 12143unicode_lstrip(PyObject *self, PyObject *args) 12144{ 12145 if (PyTuple_GET_SIZE(args) == 0) 12146 return do_strip(self, LEFTSTRIP); /* Common case */ 12147 else 12148 return do_argstrip(self, LEFTSTRIP, args); 12149} 12150 12151 12152PyDoc_STRVAR(rstrip__doc__, 12153 "S.rstrip([chars]) -> str\n\ 12154\n\ 12155Return a copy of the string S with trailing whitespace removed.\n\ 12156If chars is given and not None, remove characters in chars instead."); 12157 12158static PyObject * 12159unicode_rstrip(PyObject *self, PyObject *args) 12160{ 12161 if (PyTuple_GET_SIZE(args) == 0) 12162 return do_strip(self, RIGHTSTRIP); /* Common case */ 12163 else 12164 return do_argstrip(self, RIGHTSTRIP, args); 12165} 12166 12167 12168static PyObject* 12169unicode_repeat(PyObject *str, Py_ssize_t len) 12170{ 12171 PyObject *u; 12172 Py_ssize_t nchars, n; 12173 12174 if (len < 1) 12175 _Py_RETURN_UNICODE_EMPTY(); 12176 12177 /* no repeat, return original string */ 12178 if (len == 1) 12179 return unicode_result_unchanged(str); 12180 12181 if (PyUnicode_READY(str) == -1) 12182 return NULL; 12183 12184 if 
(PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 12185 PyErr_SetString(PyExc_OverflowError, 12186 "repeated string is too long"); 12187 return NULL; 12188 } 12189 nchars = len * PyUnicode_GET_LENGTH(str); 12190 12191 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 12192 if (!u) 12193 return NULL; 12194 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 12195 12196 if (PyUnicode_GET_LENGTH(str) == 1) { 12197 const int kind = PyUnicode_KIND(str); 12198 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 12199 if (kind == PyUnicode_1BYTE_KIND) { 12200 void *to = PyUnicode_DATA(u); 12201 memset(to, (unsigned char)fill_char, len); 12202 } 12203 else if (kind == PyUnicode_2BYTE_KIND) { 12204 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 12205 for (n = 0; n < len; ++n) 12206 ucs2[n] = fill_char; 12207 } else { 12208 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 12209 assert(kind == PyUnicode_4BYTE_KIND); 12210 for (n = 0; n < len; ++n) 12211 ucs4[n] = fill_char; 12212 } 12213 } 12214 else { 12215 /* number of characters copied this far */ 12216 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 12217 const Py_ssize_t char_size = PyUnicode_KIND(str); 12218 char *to = (char *) PyUnicode_DATA(u); 12219 Py_MEMCPY(to, PyUnicode_DATA(str), 12220 PyUnicode_GET_LENGTH(str) * char_size); 12221 while (done < nchars) { 12222 n = (done <= nchars-done) ? 
done : nchars-done; 12223 Py_MEMCPY(to + (done * char_size), to, n * char_size); 12224 done += n; 12225 } 12226 } 12227 12228 assert(_PyUnicode_CheckConsistency(u, 1)); 12229 return u; 12230} 12231 12232PyObject * 12233PyUnicode_Replace(PyObject *obj, 12234 PyObject *subobj, 12235 PyObject *replobj, 12236 Py_ssize_t maxcount) 12237{ 12238 PyObject *self; 12239 PyObject *str1; 12240 PyObject *str2; 12241 PyObject *result; 12242 12243 self = PyUnicode_FromObject(obj); 12244 if (self == NULL) 12245 return NULL; 12246 str1 = PyUnicode_FromObject(subobj); 12247 if (str1 == NULL) { 12248 Py_DECREF(self); 12249 return NULL; 12250 } 12251 str2 = PyUnicode_FromObject(replobj); 12252 if (str2 == NULL) { 12253 Py_DECREF(self); 12254 Py_DECREF(str1); 12255 return NULL; 12256 } 12257 if (PyUnicode_READY(self) == -1 || 12258 PyUnicode_READY(str1) == -1 || 12259 PyUnicode_READY(str2) == -1) 12260 result = NULL; 12261 else 12262 result = replace(self, str1, str2, maxcount); 12263 Py_DECREF(self); 12264 Py_DECREF(str1); 12265 Py_DECREF(str2); 12266 return result; 12267} 12268 12269PyDoc_STRVAR(replace__doc__, 12270 "S.replace(old, new[, count]) -> str\n\ 12271\n\ 12272Return a copy of S with all occurrences of substring\n\ 12273old replaced by new. 
PyDoc_STRVAR(replace__doc__,
             "S.replace(old, new[, count]) -> str\n\
\n\
Return a copy of S with all occurrences of substring\n\
old replaced by new. If the optional argument count is\n\
given, only the first count occurrences are replaced.");

/* str.replace(old, new[, count]) method implementation.

   Both pattern arguments are passed through PyUnicode_FromObject(); the
   references obtained that way are released before returning.  maxcount
   stays at -1 when the caller omits count.  Returns the result of
   replace(), or NULL on failure. */
static PyObject*
unicode_replace(PyObject *self, PyObject *args)
{
    PyObject *str1;
    PyObject *str2;
    Py_ssize_t maxcount = -1;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
        return NULL;
    if (PyUnicode_READY(self) == -1)
        return NULL;
    /* From here on str1/str2 are references owned by this function. */
    str1 = PyUnicode_FromObject(str1);
    if (str1 == NULL)
        return NULL;
    str2 = PyUnicode_FromObject(str2);
    if (str2 == NULL) {
        Py_DECREF(str1);
        return NULL;
    }
    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
        result = NULL;
    else
        result = replace(self, str1, str2, maxcount);

    Py_DECREF(str1);
    Py_DECREF(str2);
    return result;
}

/* repr() of a str object.

   Works in two passes.  The first pass walks the input once to compute the
   exact output length (osize), count single and double quotes, and track
   the largest character that will be copied unescaped (max).  The second
   pass allocates the result with PyUnicode_New(osize, max) and writes the
   quoted, escaped text into it.

   Quote selection: single quotes are used unless the string contains
   single quotes and no double quotes, in which case double quotes are
   used.  When both kinds occur, single quotes are used and the embedded
   single quotes are backslash-escaped.

   Returns the new string, or NULL on failure (OverflowError is raised if
   the escaped length would exceed PY_SSIZE_T_MAX). */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind, unchanged;
    void *idata, *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 0;
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        /* incr is the number of output characters this input character
           will occupy; it must agree with what the second pass writes. */
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                max = ch > max ? ch : max;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    quote = '\'';
    /* Equal sizes mean no character needed more than one output slot, so
       the body can be copied verbatim (unless quotes force escaping or a
       quote switch, handled just below). */
    unchanged = (osize == isize);
    if (squote) {
        unchanged = 0;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }
    /* NOTE(review): unlike the per-character additions in the loop above,
       this addition and the `osize += squote` above are not checked
       against PY_SSIZE_T_MAX. */
    osize += 2;   /* quotes */

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Both the opening and the closing quote are written up front. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);
    if (unchanged) {
        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);
    }
    else {
        /* o is the write position in the output; it starts after the
           opening quote. */
        for (i = 0, o = 1; i < isize; i++) {
            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

            /* Escape quotes and backslashes */
            if ((ch == quote) || (ch == '\\')) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, ch);
                continue;
            }

            /* Map special whitespace to '\t', '\n', '\r' */
            if (ch == '\t') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 't');
            }
            else if (ch == '\n') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'n');
            }
            else if (ch == '\r') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'r');
            }

            /* Map non-printable US ASCII to '\xhh' */
            else if (ch < ' ' || ch == 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'x');
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
            }

            /* Copy ASCII characters as-is */
            else if (ch < 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }

            /* Non-ASCII characters */
            else {
                /* Map Unicode whitespace and control characters
                   (categories Z* and C* except ASCII space)
                */
                if (!Py_UNICODE_ISPRINTABLE(ch)) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    /* Map 8-bit characters to '\xhh' */
                    if (ch <= 0xff) {
                        PyUnicode_WRITE(okind, odata, o++, 'x');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
                    }
                    /* Map 16-bit characters to '\uxxxx' */
                    else if (ch <= 0xffff) {
                        PyUnicode_WRITE(okind, odata, o++, 'u');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                    /* Map 21-bit characters to '\U00xxxxxx' */
                    else {
                        PyUnicode_WRITE(okind, odata, o++, 'U');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                }
                /* Copy characters as-is */
                else {
                    PyUnicode_WRITE(okind, odata, o++, ch);
                }
            }
        }
    }
    /* Closing quote already added at the beginning */
    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}

PyDoc_STRVAR(rfind__doc__,
             "S.rfind(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");

/* str.rfind(sub[, start[, end]]) method implementation.

   After argument parsing succeeds, `substring` is a reference this
   function releases on every path.  The search is done by
   any_find_slice() with direction -1; a result of -2 is treated as an
   error, any other value is returned as an int object. */
static PyObject *
unicode_rfind(PyObject *self, PyObject *args)
{
    /* initialize variables to prevent gcc warning */
    PyObject *substring = NULL;
    Py_ssize_t start = 0;
    Py_ssize_t end = 0;
    Py_ssize_t result;

    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
                                            &start, &end))
        return NULL;

    if (PyUnicode_READY(self) == -1) {
        Py_DECREF(substring);
        return NULL;
    }
    if (PyUnicode_READY(substring) == -1) {
        Py_DECREF(substring);
        return NULL;
    }

    result = any_find_slice(-1, self, substring, start, end);

    Py_DECREF(substring);

    /* -2 is the error indicator; -1 (not found) is a valid result here. */
    if (result == -2)
        return NULL;

    return PyLong_FromSsize_t(result);
}

PyDoc_STRVAR(rindex__doc__,
             "S.rindex(sub[, start[, end]]) -> int\n\
\n\
Like S.rfind() but raise ValueError when the substring is not found.");

/* str.rindex(sub[, start[, end]]) method implementation.

   Same search as unicode_rfind(), except that a negative (not found)
   result raises ValueError instead of being returned. */
static PyObject *
unicode_rindex(PyObject *self, PyObject *args)
{
    /* initialize variables to prevent gcc warning */
    PyObject *substring = NULL;
    Py_ssize_t start = 0;
    Py_ssize_t end = 0;
    Py_ssize_t result;

    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
                                            &start, &end))
        return NULL;

    if (PyUnicode_READY(self) == -1) {
        Py_DECREF(substring);
        return NULL;
    }
    if (PyUnicode_READY(substring) == -1) {
        Py_DECREF(substring);
        return NULL;
    }

    result = any_find_slice(-1, self, substring, start, end);

    Py_DECREF(substring);

    /* -2 is the error indicator and is checked before the not-found case. */
    if (result == -2)
        return NULL;

    if (result < 0) {
        PyErr_SetString(PyExc_ValueError, "substring not found");
        return NULL;
    }

    return PyLong_FromSsize_t(result);
}

PyDoc_STRVAR(rjust__doc__,
             "S.rjust(width[, fillchar]) -> str\n\
\n\
Return S right-justified in a string of length width. Padding is\n\
done using the specified fill character (default is a space).");

/* str.rjust(width[, fillchar]) method implementation.

   When the string is already at least `width` characters long it is
   passed to unicode_result_unchanged(); otherwise pad() is asked for
   (width - length) fill characters on the left and none on the right. */
static PyObject *
unicode_rjust(PyObject *self, PyObject *args)
{
    Py_ssize_t width;
    Py_UCS4 fillchar = ' ';

    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
        return NULL;

    if (PyUnicode_READY(self) == -1)
        return NULL;

    if (PyUnicode_GET_LENGTH(self) >= width)
        return unicode_result_unchanged(self);

    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
}

/* C API entry point for splitting: passes s (and sep, when it is not NULL)
   through PyUnicode_FromObject() and calls split().  A NULL sep is
   forwarded to split() as NULL.  The parameters are reused to hold the
   converted references, which are released before returning. */
PyObject *
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
{
    PyObject *result;

    s = PyUnicode_FromObject(s);
    if (s == NULL)
        return NULL;
    if (sep != NULL) {
        sep = PyUnicode_FromObject(sep);
        if (sep == NULL) {
            Py_DECREF(s);
            return NULL;
        }
    }

    result = split(s, sep, maxsplit);

    Py_DECREF(s);
    /* sep may still be NULL here, hence the X variant. */
    Py_XDECREF(sep);
    return result;
}
If sep is not specified or is None, any\n\ 12606whitespace string is a separator and empty strings are\n\ 12607removed from the result."); 12608 12609static PyObject* 12610unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12611{ 12612 static char *kwlist[] = {"sep", "maxsplit", 0}; 12613 PyObject *substring = Py_None; 12614 Py_ssize_t maxcount = -1; 12615 12616 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12617 kwlist, &substring, &maxcount)) 12618 return NULL; 12619 12620 if (substring == Py_None) 12621 return split(self, NULL, maxcount); 12622 else if (PyUnicode_Check(substring)) 12623 return split(self, substring, maxcount); 12624 else 12625 return PyUnicode_Split(self, substring, maxcount); 12626} 12627 12628PyObject * 12629PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12630{ 12631 PyObject* str_obj; 12632 PyObject* sep_obj; 12633 PyObject* out; 12634 int kind1, kind2; 12635 void *buf1, *buf2; 12636 Py_ssize_t len1, len2; 12637 12638 str_obj = PyUnicode_FromObject(str_in); 12639 if (!str_obj) 12640 return NULL; 12641 sep_obj = PyUnicode_FromObject(sep_in); 12642 if (!sep_obj) { 12643 Py_DECREF(str_obj); 12644 return NULL; 12645 } 12646 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12647 Py_DECREF(sep_obj); 12648 Py_DECREF(str_obj); 12649 return NULL; 12650 } 12651 12652 kind1 = PyUnicode_KIND(str_obj); 12653 kind2 = PyUnicode_KIND(sep_obj); 12654 len1 = PyUnicode_GET_LENGTH(str_obj); 12655 len2 = PyUnicode_GET_LENGTH(sep_obj); 12656 if (kind1 < kind2 || len1 < len2) { 12657 _Py_INCREF_UNICODE_EMPTY(); 12658 if (!unicode_empty) 12659 out = NULL; 12660 else { 12661 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); 12662 Py_DECREF(unicode_empty); 12663 } 12664 Py_DECREF(sep_obj); 12665 Py_DECREF(str_obj); 12666 return out; 12667 } 12668 buf1 = PyUnicode_DATA(str_obj); 12669 buf2 = PyUnicode_DATA(sep_obj); 12670 if (kind2 != kind1) { 12671 buf2 = _PyUnicode_AsKind(sep_obj, kind1); 12672 if 
(!buf2) 12673 goto onError; 12674 } 12675 12676 switch (kind1) { 12677 case PyUnicode_1BYTE_KIND: 12678 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12679 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12680 else 12681 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12682 break; 12683 case PyUnicode_2BYTE_KIND: 12684 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12685 break; 12686 case PyUnicode_4BYTE_KIND: 12687 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12688 break; 12689 default: 12690 assert(0); 12691 out = 0; 12692 } 12693 12694 Py_DECREF(sep_obj); 12695 Py_DECREF(str_obj); 12696 if (kind2 != kind1) 12697 PyMem_Free(buf2); 12698 12699 return out; 12700 onError: 12701 Py_DECREF(sep_obj); 12702 Py_DECREF(str_obj); 12703 if (kind2 != kind1 && buf2) 12704 PyMem_Free(buf2); 12705 return NULL; 12706} 12707 12708 12709PyObject * 12710PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12711{ 12712 PyObject* str_obj; 12713 PyObject* sep_obj; 12714 PyObject* out; 12715 int kind1, kind2; 12716 void *buf1, *buf2; 12717 Py_ssize_t len1, len2; 12718 12719 str_obj = PyUnicode_FromObject(str_in); 12720 if (!str_obj) 12721 return NULL; 12722 sep_obj = PyUnicode_FromObject(sep_in); 12723 if (!sep_obj) { 12724 Py_DECREF(str_obj); 12725 return NULL; 12726 } 12727 12728 kind1 = PyUnicode_KIND(str_obj); 12729 kind2 = PyUnicode_KIND(sep_obj); 12730 len1 = PyUnicode_GET_LENGTH(str_obj); 12731 len2 = PyUnicode_GET_LENGTH(sep_obj); 12732 if (kind1 < kind2 || len1 < len2) { 12733 _Py_INCREF_UNICODE_EMPTY(); 12734 if (!unicode_empty) 12735 out = NULL; 12736 else { 12737 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); 12738 Py_DECREF(unicode_empty); 12739 } 12740 Py_DECREF(sep_obj); 12741 Py_DECREF(str_obj); 12742 return out; 12743 } 12744 buf1 = PyUnicode_DATA(str_obj); 12745 buf2 = PyUnicode_DATA(sep_obj); 12746 if (kind2 != kind1) { 12747 buf2 = 
_PyUnicode_AsKind(sep_obj, kind1); 12748 if (!buf2) 12749 goto onError; 12750 } 12751 12752 switch (kind1) { 12753 case PyUnicode_1BYTE_KIND: 12754 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12755 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12756 else 12757 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12758 break; 12759 case PyUnicode_2BYTE_KIND: 12760 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12761 break; 12762 case PyUnicode_4BYTE_KIND: 12763 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12764 break; 12765 default: 12766 assert(0); 12767 out = 0; 12768 } 12769 12770 Py_DECREF(sep_obj); 12771 Py_DECREF(str_obj); 12772 if (kind2 != kind1) 12773 PyMem_Free(buf2); 12774 12775 return out; 12776 onError: 12777 Py_DECREF(sep_obj); 12778 Py_DECREF(str_obj); 12779 if (kind2 != kind1 && buf2) 12780 PyMem_Free(buf2); 12781 return NULL; 12782} 12783 12784PyDoc_STRVAR(partition__doc__, 12785 "S.partition(sep) -> (head, sep, tail)\n\ 12786\n\ 12787Search for the separator sep in S, and return the part before it,\n\ 12788the separator itself, and the part after it. If the separator is not\n\ 12789found, return S and two empty strings."); 12790 12791static PyObject* 12792unicode_partition(PyObject *self, PyObject *separator) 12793{ 12794 return PyUnicode_Partition(self, separator); 12795} 12796 12797PyDoc_STRVAR(rpartition__doc__, 12798 "S.rpartition(sep) -> (head, sep, tail)\n\ 12799\n\ 12800Search for the separator sep in S, starting at the end of S, and return\n\ 12801the part before it, the separator itself, and the part after it. 
If the\n\ 12802separator is not found, return two empty strings and S."); 12803 12804static PyObject* 12805unicode_rpartition(PyObject *self, PyObject *separator) 12806{ 12807 return PyUnicode_RPartition(self, separator); 12808} 12809 12810PyObject * 12811PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12812{ 12813 PyObject *result; 12814 12815 s = PyUnicode_FromObject(s); 12816 if (s == NULL) 12817 return NULL; 12818 if (sep != NULL) { 12819 sep = PyUnicode_FromObject(sep); 12820 if (sep == NULL) { 12821 Py_DECREF(s); 12822 return NULL; 12823 } 12824 } 12825 12826 result = rsplit(s, sep, maxsplit); 12827 12828 Py_DECREF(s); 12829 Py_XDECREF(sep); 12830 return result; 12831} 12832 12833PyDoc_STRVAR(rsplit__doc__, 12834 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12835\n\ 12836Return a list of the words in S, using sep as the\n\ 12837delimiter string, starting at the end of the string and\n\ 12838working to the front. If maxsplit is given, at most maxsplit\n\ 12839splits are done. 
If sep is not specified, any whitespace string\n\ 12840is a separator."); 12841 12842static PyObject* 12843unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12844{ 12845 static char *kwlist[] = {"sep", "maxsplit", 0}; 12846 PyObject *substring = Py_None; 12847 Py_ssize_t maxcount = -1; 12848 12849 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12850 kwlist, &substring, &maxcount)) 12851 return NULL; 12852 12853 if (substring == Py_None) 12854 return rsplit(self, NULL, maxcount); 12855 else if (PyUnicode_Check(substring)) 12856 return rsplit(self, substring, maxcount); 12857 else 12858 return PyUnicode_RSplit(self, substring, maxcount); 12859} 12860 12861PyDoc_STRVAR(splitlines__doc__, 12862 "S.splitlines([keepends]) -> list of strings\n\ 12863\n\ 12864Return a list of the lines in S, breaking at line boundaries.\n\ 12865Line breaks are not included in the resulting list unless keepends\n\ 12866is given and true."); 12867 12868static PyObject* 12869unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12870{ 12871 static char *kwlist[] = {"keepends", 0}; 12872 int keepends = 0; 12873 12874 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12875 kwlist, &keepends)) 12876 return NULL; 12877 12878 return PyUnicode_Splitlines(self, keepends); 12879} 12880 12881static 12882PyObject *unicode_str(PyObject *self) 12883{ 12884 return unicode_result_unchanged(self); 12885} 12886 12887PyDoc_STRVAR(swapcase__doc__, 12888 "S.swapcase() -> str\n\ 12889\n\ 12890Return a copy of S with uppercase characters converted to lowercase\n\ 12891and vice versa."); 12892 12893static PyObject* 12894unicode_swapcase(PyObject *self) 12895{ 12896 if (PyUnicode_READY(self) == -1) 12897 return NULL; 12898 return case_operation(self, do_swapcase); 12899} 12900 12901/*[clinic input] 12902 12903@staticmethod 12904str.maketrans as unicode_maketrans 12905 12906 x: object 12907 12908 y: unicode=NULL 12909 12910 z: unicode=NULL 12911 12912 / 12913 
12914Return a translation table usable for str.translate(). 12915 12916If there is only one argument, it must be a dictionary mapping Unicode 12917ordinals (integers) or characters to Unicode ordinals, strings or None. 12918Character keys will be then converted to ordinals. 12919If there are two arguments, they must be strings of equal length, and 12920in the resulting dictionary, each character in x will be mapped to the 12921character at the same position in y. If there is a third argument, it 12922must be a string, whose characters will be mapped to None in the result. 12923[clinic start generated code]*/ 12924 12925static PyObject * 12926unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z) 12927/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/ 12928{ 12929 PyObject *new = NULL, *key, *value; 12930 Py_ssize_t i = 0; 12931 int res; 12932 12933 new = PyDict_New(); 12934 if (!new) 12935 return NULL; 12936 if (y != NULL) { 12937 int x_kind, y_kind, z_kind; 12938 void *x_data, *y_data, *z_data; 12939 12940 /* x must be a string too, of equal length */ 12941 if (!PyUnicode_Check(x)) { 12942 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12943 "be a string if there is a second argument"); 12944 goto err; 12945 } 12946 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12947 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12948 "arguments must have equal length"); 12949 goto err; 12950 } 12951 /* create entries for translating chars in x to those in y */ 12952 x_kind = PyUnicode_KIND(x); 12953 y_kind = PyUnicode_KIND(y); 12954 x_data = PyUnicode_DATA(x); 12955 y_data = PyUnicode_DATA(y); 12956 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12957 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12958 if (!key) 12959 goto err; 12960 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12961 if (!value) { 12962 Py_DECREF(key); 12963 goto err; 12964 } 12965 res = 
PyDict_SetItem(new, key, value); 12966 Py_DECREF(key); 12967 Py_DECREF(value); 12968 if (res < 0) 12969 goto err; 12970 } 12971 /* create entries for deleting chars in z */ 12972 if (z != NULL) { 12973 z_kind = PyUnicode_KIND(z); 12974 z_data = PyUnicode_DATA(z); 12975 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 12976 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12977 if (!key) 12978 goto err; 12979 res = PyDict_SetItem(new, key, Py_None); 12980 Py_DECREF(key); 12981 if (res < 0) 12982 goto err; 12983 } 12984 } 12985 } else { 12986 int kind; 12987 void *data; 12988 12989 /* x must be a dict */ 12990 if (!PyDict_CheckExact(x)) { 12991 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12992 "to maketrans it must be a dict"); 12993 goto err; 12994 } 12995 /* copy entries into the new dict, converting string keys to int keys */ 12996 while (PyDict_Next(x, &i, &key, &value)) { 12997 if (PyUnicode_Check(key)) { 12998 /* convert string keys to integer keys */ 12999 PyObject *newkey; 13000 if (PyUnicode_GET_LENGTH(key) != 1) { 13001 PyErr_SetString(PyExc_ValueError, "string keys in translate " 13002 "table must be of length 1"); 13003 goto err; 13004 } 13005 kind = PyUnicode_KIND(key); 13006 data = PyUnicode_DATA(key); 13007 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 13008 if (!newkey) 13009 goto err; 13010 res = PyDict_SetItem(new, newkey, value); 13011 Py_DECREF(newkey); 13012 if (res < 0) 13013 goto err; 13014 } else if (PyLong_Check(key)) { 13015 /* just keep integer keys */ 13016 if (PyDict_SetItem(new, key, value) < 0) 13017 goto err; 13018 } else { 13019 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 13020 "be strings or integers"); 13021 goto err; 13022 } 13023 } 13024 } 13025 return new; 13026 err: 13027 Py_DECREF(new); 13028 return NULL; 13029} 13030 13031PyDoc_STRVAR(translate__doc__, 13032 "S.translate(table) -> str\n\ 13033\n\ 13034Return a copy of the string S, where all characters have been 
mapped\n\ 13035through the given translation table, which must be a mapping of\n\ 13036Unicode ordinals to Unicode ordinals, strings, or None.\n\ 13037Unmapped characters are left untouched. Characters mapped to None\n\ 13038are deleted."); 13039 13040static PyObject* 13041unicode_translate(PyObject *self, PyObject *table) 13042{ 13043 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 13044} 13045 13046PyDoc_STRVAR(upper__doc__, 13047 "S.upper() -> str\n\ 13048\n\ 13049Return a copy of S converted to uppercase."); 13050 13051static PyObject* 13052unicode_upper(PyObject *self) 13053{ 13054 if (PyUnicode_READY(self) == -1) 13055 return NULL; 13056 if (PyUnicode_IS_ASCII(self)) 13057 return ascii_upper_or_lower(self, 0); 13058 return case_operation(self, do_upper); 13059} 13060 13061PyDoc_STRVAR(zfill__doc__, 13062 "S.zfill(width) -> str\n\ 13063\n\ 13064Pad a numeric string S with zeros on the left, to fill a field\n\ 13065of the specified width. The string S is never truncated."); 13066 13067static PyObject * 13068unicode_zfill(PyObject *self, PyObject *args) 13069{ 13070 Py_ssize_t fill; 13071 PyObject *u; 13072 Py_ssize_t width; 13073 int kind; 13074 void *data; 13075 Py_UCS4 chr; 13076 13077 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 13078 return NULL; 13079 13080 if (PyUnicode_READY(self) == -1) 13081 return NULL; 13082 13083 if (PyUnicode_GET_LENGTH(self) >= width) 13084 return unicode_result_unchanged(self); 13085 13086 fill = width - PyUnicode_GET_LENGTH(self); 13087 13088 u = pad(self, fill, 0, '0'); 13089 13090 if (u == NULL) 13091 return NULL; 13092 13093 kind = PyUnicode_KIND(u); 13094 data = PyUnicode_DATA(u); 13095 chr = PyUnicode_READ(kind, data, fill); 13096 13097 if (chr == '+' || chr == '-') { 13098 /* move sign to beginning of string */ 13099 PyUnicode_WRITE(kind, data, 0, chr); 13100 PyUnicode_WRITE(kind, data, fill, '0'); 13101 } 13102 13103 assert(_PyUnicode_CheckConsistency(u, 1)); 13104 return u; 13105} 13106 13107#if 0 
13108static PyObject * 13109unicode__decimal2ascii(PyObject *self) 13110{ 13111 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 13112} 13113#endif 13114 13115PyDoc_STRVAR(startswith__doc__, 13116 "S.startswith(prefix[, start[, end]]) -> bool\n\ 13117\n\ 13118Return True if S starts with the specified prefix, False otherwise.\n\ 13119With optional start, test S beginning at that position.\n\ 13120With optional end, stop comparing S at that position.\n\ 13121prefix can also be a tuple of strings to try."); 13122 13123static PyObject * 13124unicode_startswith(PyObject *self, 13125 PyObject *args) 13126{ 13127 PyObject *subobj; 13128 PyObject *substring; 13129 Py_ssize_t start = 0; 13130 Py_ssize_t end = PY_SSIZE_T_MAX; 13131 int result; 13132 13133 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 13134 return NULL; 13135 if (PyTuple_Check(subobj)) { 13136 Py_ssize_t i; 13137 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13138 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 13139 if (substring == NULL) 13140 return NULL; 13141 result = tailmatch(self, substring, start, end, -1); 13142 Py_DECREF(substring); 13143 if (result == -1) 13144 return NULL; 13145 if (result) { 13146 Py_RETURN_TRUE; 13147 } 13148 } 13149 /* nothing matched */ 13150 Py_RETURN_FALSE; 13151 } 13152 substring = PyUnicode_FromObject(subobj); 13153 if (substring == NULL) { 13154 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13155 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 13156 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13157 return NULL; 13158 } 13159 result = tailmatch(self, substring, start, end, -1); 13160 Py_DECREF(substring); 13161 if (result == -1) 13162 return NULL; 13163 return PyBool_FromLong(result); 13164} 13165 13166 13167PyDoc_STRVAR(endswith__doc__, 13168 "S.endswith(suffix[, start[, end]]) -> bool\n\ 13169\n\ 13170Return True if S ends with the specified suffix, False otherwise.\n\ 13171With 
optional start, test S beginning at that position.\n\ 13172With optional end, stop comparing S at that position.\n\ 13173suffix can also be a tuple of strings to try."); 13174 13175static PyObject * 13176unicode_endswith(PyObject *self, 13177 PyObject *args) 13178{ 13179 PyObject *subobj; 13180 PyObject *substring; 13181 Py_ssize_t start = 0; 13182 Py_ssize_t end = PY_SSIZE_T_MAX; 13183 int result; 13184 13185 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 13186 return NULL; 13187 if (PyTuple_Check(subobj)) { 13188 Py_ssize_t i; 13189 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13190 substring = PyUnicode_FromObject( 13191 PyTuple_GET_ITEM(subobj, i)); 13192 if (substring == NULL) 13193 return NULL; 13194 result = tailmatch(self, substring, start, end, +1); 13195 Py_DECREF(substring); 13196 if (result == -1) 13197 return NULL; 13198 if (result) { 13199 Py_RETURN_TRUE; 13200 } 13201 } 13202 Py_RETURN_FALSE; 13203 } 13204 substring = PyUnicode_FromObject(subobj); 13205 if (substring == NULL) { 13206 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13207 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 13208 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13209 return NULL; 13210 } 13211 result = tailmatch(self, substring, start, end, +1); 13212 Py_DECREF(substring); 13213 if (result == -1) 13214 return NULL; 13215 return PyBool_FromLong(result); 13216} 13217 13218Py_LOCAL_INLINE(void) 13219_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 13220{ 13221 if (!writer->readonly) 13222 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 13223 else { 13224 /* Copy-on-write mode: set buffer size to 0 so 13225 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on 13226 * next write. 
*/ 13227 writer->size = 0; 13228 } 13229 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 13230 writer->data = PyUnicode_DATA(writer->buffer); 13231 writer->kind = PyUnicode_KIND(writer->buffer); 13232} 13233 13234void 13235_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) 13236{ 13237 memset(writer, 0, sizeof(*writer)); 13238#ifdef Py_DEBUG 13239 writer->kind = 5; /* invalid kind */ 13240#endif 13241 writer->min_char = 127; 13242} 13243 13244int 13245_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 13246 Py_ssize_t length, Py_UCS4 maxchar) 13247{ 13248#ifdef MS_WINDOWS 13249 /* On Windows, overallocate by 50% is the best factor */ 13250# define OVERALLOCATE_FACTOR 2 13251#else 13252 /* On Linux, overallocate by 25% is the best factor */ 13253# define OVERALLOCATE_FACTOR 4 13254#endif 13255 Py_ssize_t newlen; 13256 PyObject *newbuffer; 13257 13258 assert(length > 0); 13259 13260 if (length > PY_SSIZE_T_MAX - writer->pos) { 13261 PyErr_NoMemory(); 13262 return -1; 13263 } 13264 newlen = writer->pos + length; 13265 13266 maxchar = Py_MAX(maxchar, writer->min_char); 13267 13268 if (writer->buffer == NULL) { 13269 assert(!writer->readonly); 13270 if (writer->overallocate 13271 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13272 /* overallocate to limit the number of realloc() */ 13273 newlen += newlen / OVERALLOCATE_FACTOR; 13274 } 13275 if (newlen < writer->min_length) 13276 newlen = writer->min_length; 13277 13278 writer->buffer = PyUnicode_New(newlen, maxchar); 13279 if (writer->buffer == NULL) 13280 return -1; 13281 } 13282 else if (newlen > writer->size) { 13283 if (writer->overallocate 13284 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13285 /* overallocate to limit the number of realloc() */ 13286 newlen += newlen / OVERALLOCATE_FACTOR; 13287 } 13288 if (newlen < writer->min_length) 13289 newlen = writer->min_length; 13290 13291 if (maxchar > writer->maxchar || writer->readonly) { 13292 /* resize + widen */ 
13293 newbuffer = PyUnicode_New(newlen, maxchar); 13294 if (newbuffer == NULL) 13295 return -1; 13296 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13297 writer->buffer, 0, writer->pos); 13298 Py_DECREF(writer->buffer); 13299 writer->readonly = 0; 13300 } 13301 else { 13302 newbuffer = resize_compact(writer->buffer, newlen); 13303 if (newbuffer == NULL) 13304 return -1; 13305 } 13306 writer->buffer = newbuffer; 13307 } 13308 else if (maxchar > writer->maxchar) { 13309 assert(!writer->readonly); 13310 newbuffer = PyUnicode_New(writer->size, maxchar); 13311 if (newbuffer == NULL) 13312 return -1; 13313 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13314 writer->buffer, 0, writer->pos); 13315 Py_DECREF(writer->buffer); 13316 writer->buffer = newbuffer; 13317 } 13318 _PyUnicodeWriter_Update(writer); 13319 return 0; 13320 13321#undef OVERALLOCATE_FACTOR 13322} 13323 13324Py_LOCAL_INLINE(int) 13325_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) 13326{ 13327 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) 13328 return -1; 13329 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13330 writer->pos++; 13331 return 0; 13332} 13333 13334int 13335_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) 13336{ 13337 return _PyUnicodeWriter_WriteCharInline(writer, ch); 13338} 13339 13340int 13341_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 13342{ 13343 Py_UCS4 maxchar; 13344 Py_ssize_t len; 13345 13346 if (PyUnicode_READY(str) == -1) 13347 return -1; 13348 len = PyUnicode_GET_LENGTH(str); 13349 if (len == 0) 13350 return 0; 13351 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 13352 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 13353 if (writer->buffer == NULL && !writer->overallocate) { 13354 assert(_PyUnicode_CheckConsistency(str, 1)); 13355 writer->readonly = 1; 13356 Py_INCREF(str); 13357 writer->buffer = str; 13358 _PyUnicodeWriter_Update(writer); 13359 writer->pos += len; 13360 return 0; 13361 } 
13362 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 13363 return -1; 13364 } 13365 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13366 str, 0, len); 13367 writer->pos += len; 13368 return 0; 13369} 13370 13371int 13372_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, 13373 Py_ssize_t start, Py_ssize_t end) 13374{ 13375 Py_UCS4 maxchar; 13376 Py_ssize_t len; 13377 13378 if (PyUnicode_READY(str) == -1) 13379 return -1; 13380 13381 assert(0 <= start); 13382 assert(end <= PyUnicode_GET_LENGTH(str)); 13383 assert(start <= end); 13384 13385 if (end == 0) 13386 return 0; 13387 13388 if (start == 0 && end == PyUnicode_GET_LENGTH(str)) 13389 return _PyUnicodeWriter_WriteStr(writer, str); 13390 13391 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 13392 maxchar = _PyUnicode_FindMaxChar(str, start, end); 13393 else 13394 maxchar = writer->maxchar; 13395 len = end - start; 13396 13397 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) 13398 return -1; 13399 13400 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13401 str, start, len); 13402 writer->pos += len; 13403 return 0; 13404} 13405 13406int 13407_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 13408 const char *ascii, Py_ssize_t len) 13409{ 13410 if (len == -1) 13411 len = strlen(ascii); 13412 13413 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128); 13414 13415 if (writer->buffer == NULL && !writer->overallocate) { 13416 PyObject *str; 13417 13418 str = _PyUnicode_FromASCII(ascii, len); 13419 if (str == NULL) 13420 return -1; 13421 13422 writer->readonly = 1; 13423 writer->buffer = str; 13424 _PyUnicodeWriter_Update(writer); 13425 writer->pos += len; 13426 return 0; 13427 } 13428 13429 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 13430 return -1; 13431 13432 switch (writer->kind) 13433 { 13434 case PyUnicode_1BYTE_KIND: 13435 { 13436 const Py_UCS1 *str = (const Py_UCS1 *)ascii; 13437 Py_UCS1 *data = 
writer->data; 13438 13439 Py_MEMCPY(data + writer->pos, str, len); 13440 break; 13441 } 13442 case PyUnicode_2BYTE_KIND: 13443 { 13444 _PyUnicode_CONVERT_BYTES( 13445 Py_UCS1, Py_UCS2, 13446 ascii, ascii + len, 13447 (Py_UCS2 *)writer->data + writer->pos); 13448 break; 13449 } 13450 case PyUnicode_4BYTE_KIND: 13451 { 13452 _PyUnicode_CONVERT_BYTES( 13453 Py_UCS1, Py_UCS4, 13454 ascii, ascii + len, 13455 (Py_UCS4 *)writer->data + writer->pos); 13456 break; 13457 } 13458 default: 13459 assert(0); 13460 } 13461 13462 writer->pos += len; 13463 return 0; 13464} 13465 13466int 13467_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 13468 const char *str, Py_ssize_t len) 13469{ 13470 Py_UCS4 maxchar; 13471 13472 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 13473 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 13474 return -1; 13475 unicode_write_cstr(writer->buffer, writer->pos, str, len); 13476 writer->pos += len; 13477 return 0; 13478} 13479 13480PyObject * 13481_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 13482{ 13483 PyObject *str; 13484 if (writer->pos == 0) { 13485 Py_CLEAR(writer->buffer); 13486 _Py_RETURN_UNICODE_EMPTY(); 13487 } 13488 if (writer->readonly) { 13489 str = writer->buffer; 13490 writer->buffer = NULL; 13491 assert(PyUnicode_GET_LENGTH(str) == writer->pos); 13492 return str; 13493 } 13494 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 13495 PyObject *newbuffer; 13496 newbuffer = resize_compact(writer->buffer, writer->pos); 13497 if (newbuffer == NULL) { 13498 Py_CLEAR(writer->buffer); 13499 return NULL; 13500 } 13501 writer->buffer = newbuffer; 13502 } 13503 str = writer->buffer; 13504 writer->buffer = NULL; 13505 assert(_PyUnicode_CheckConsistency(str, 1)); 13506 return unicode_result_ready(str); 13507} 13508 13509void 13510_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 13511{ 13512 Py_CLEAR(writer->buffer); 13513} 13514 13515#include "stringlib/unicode_format.h" 13516 
13517PyDoc_STRVAR(format__doc__, 13518 "S.format(*args, **kwargs) -> str\n\ 13519\n\ 13520Return a formatted version of S, using substitutions from args and kwargs.\n\ 13521The substitutions are identified by braces ('{' and '}')."); 13522 13523PyDoc_STRVAR(format_map__doc__, 13524 "S.format_map(mapping) -> str\n\ 13525\n\ 13526Return a formatted version of S, using substitutions from mapping.\n\ 13527The substitutions are identified by braces ('{' and '}')."); 13528 13529static PyObject * 13530unicode__format__(PyObject* self, PyObject* args) 13531{ 13532 PyObject *format_spec; 13533 _PyUnicodeWriter writer; 13534 int ret; 13535 13536 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 13537 return NULL; 13538 13539 if (PyUnicode_READY(self) == -1) 13540 return NULL; 13541 _PyUnicodeWriter_Init(&writer); 13542 ret = _PyUnicode_FormatAdvancedWriter(&writer, 13543 self, format_spec, 0, 13544 PyUnicode_GET_LENGTH(format_spec)); 13545 if (ret == -1) { 13546 _PyUnicodeWriter_Dealloc(&writer); 13547 return NULL; 13548 } 13549 return _PyUnicodeWriter_Finish(&writer); 13550} 13551 13552PyDoc_STRVAR(p_format__doc__, 13553 "S.__format__(format_spec) -> str\n\ 13554\n\ 13555Return a formatted version of S as described by format_spec."); 13556 13557static PyObject * 13558unicode__sizeof__(PyObject *v) 13559{ 13560 Py_ssize_t size; 13561 13562 /* If it's a compact object, account for base structure + 13563 character data. */ 13564 if (PyUnicode_IS_COMPACT_ASCII(v)) 13565 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 13566 else if (PyUnicode_IS_COMPACT(v)) 13567 size = sizeof(PyCompactUnicodeObject) + 13568 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 13569 else { 13570 /* If it is a two-block object, account for base object, and 13571 for character block if present. 
*/ 13572 size = sizeof(PyUnicodeObject); 13573 if (_PyUnicode_DATA_ANY(v)) 13574 size += (PyUnicode_GET_LENGTH(v) + 1) * 13575 PyUnicode_KIND(v); 13576 } 13577 /* If the wstr pointer is present, account for it unless it is shared 13578 with the data pointer. Check if the data is not shared. */ 13579 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 13580 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 13581 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 13582 size += PyUnicode_UTF8_LENGTH(v) + 1; 13583 13584 return PyLong_FromSsize_t(size); 13585} 13586 13587PyDoc_STRVAR(sizeof__doc__, 13588 "S.__sizeof__() -> size of S in memory, in bytes"); 13589 13590static PyObject * 13591unicode_getnewargs(PyObject *v) 13592{ 13593 PyObject *copy = _PyUnicode_Copy(v); 13594 if (!copy) 13595 return NULL; 13596 return Py_BuildValue("(N)", copy); 13597} 13598 13599static PyMethodDef unicode_methods[] = { 13600 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 13601 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 13602 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 13603 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 13604 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 13605 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 13606 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 13607 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 13608 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 13609 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13610 {"expandtabs", (PyCFunction) unicode_expandtabs, 13611 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__}, 13612 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13613 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 13614 {"index", (PyCFunction) 
unicode_index, METH_VARARGS, index__doc__}, 13615 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 13616 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 13617 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 13618 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 13619 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13620 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 13621 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 13622 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 13623 {"splitlines", (PyCFunction) unicode_splitlines, 13624 METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 13625 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 13626 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 13627 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 13628 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 13629 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13630 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13631 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 13632 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 13633 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 13634 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 13635 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 13636 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 13637 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 13638 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 13639 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13640 {"isidentifier", 
(PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 13641 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13642 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13643 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 13644 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13645 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13646 UNICODE_MAKETRANS_METHODDEF 13647 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13648#if 0 13649 /* These methods are just used for debugging the implementation. */ 13650 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13651#endif 13652 13653 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13654 {NULL, NULL} 13655}; 13656 13657static PyObject * 13658unicode_mod(PyObject *v, PyObject *w) 13659{ 13660 if (!PyUnicode_Check(v)) 13661 Py_RETURN_NOTIMPLEMENTED; 13662 return PyUnicode_Format(v, w); 13663} 13664 13665static PyNumberMethods unicode_as_number = { 13666 0, /*nb_add*/ 13667 0, /*nb_subtract*/ 13668 0, /*nb_multiply*/ 13669 unicode_mod, /*nb_remainder*/ 13670}; 13671 13672static PySequenceMethods unicode_as_sequence = { 13673 (lenfunc) unicode_length, /* sq_length */ 13674 PyUnicode_Concat, /* sq_concat */ 13675 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13676 (ssizeargfunc) unicode_getitem, /* sq_item */ 13677 0, /* sq_slice */ 13678 0, /* sq_ass_item */ 13679 0, /* sq_ass_slice */ 13680 PyUnicode_Contains, /* sq_contains */ 13681}; 13682 13683static PyObject* 13684unicode_subscript(PyObject* self, PyObject* item) 13685{ 13686 if (PyUnicode_READY(self) == -1) 13687 return NULL; 13688 13689 if (PyIndex_Check(item)) { 13690 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13691 if (i == -1 && PyErr_Occurred()) 13692 return NULL; 13693 if (i < 0) 13694 i += 
PyUnicode_GET_LENGTH(self); 13695 return unicode_getitem(self, i); 13696 } else if (PySlice_Check(item)) { 13697 Py_ssize_t start, stop, step, slicelength, cur, i; 13698 PyObject *result; 13699 void *src_data, *dest_data; 13700 int src_kind, dest_kind; 13701 Py_UCS4 ch, max_char, kind_limit; 13702 13703 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13704 &start, &stop, &step, &slicelength) < 0) { 13705 return NULL; 13706 } 13707 13708 if (slicelength <= 0) { 13709 _Py_RETURN_UNICODE_EMPTY(); 13710 } else if (start == 0 && step == 1 && 13711 slicelength == PyUnicode_GET_LENGTH(self)) { 13712 return unicode_result_unchanged(self); 13713 } else if (step == 1) { 13714 return PyUnicode_Substring(self, 13715 start, start + slicelength); 13716 } 13717 /* General case */ 13718 src_kind = PyUnicode_KIND(self); 13719 src_data = PyUnicode_DATA(self); 13720 if (!PyUnicode_IS_ASCII(self)) { 13721 kind_limit = kind_maxchar_limit(src_kind); 13722 max_char = 0; 13723 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13724 ch = PyUnicode_READ(src_kind, src_data, cur); 13725 if (ch > max_char) { 13726 max_char = ch; 13727 if (max_char >= kind_limit) 13728 break; 13729 } 13730 } 13731 } 13732 else 13733 max_char = 127; 13734 result = PyUnicode_New(slicelength, max_char); 13735 if (result == NULL) 13736 return NULL; 13737 dest_kind = PyUnicode_KIND(result); 13738 dest_data = PyUnicode_DATA(result); 13739 13740 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13741 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13742 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13743 } 13744 assert(_PyUnicode_CheckConsistency(result, 1)); 13745 return result; 13746 } else { 13747 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13748 return NULL; 13749 } 13750} 13751 13752static PyMappingMethods unicode_as_mapping = { 13753 (lenfunc)unicode_length, /* mp_length */ 13754 (binaryfunc)unicode_subscript, /* mp_subscript */ 13755 
(objobjargproc)0, /* mp_ass_subscript */ 13756}; 13757 13758 13759/* Helpers for PyUnicode_Format() */ 13760 13761struct unicode_formatter_t { 13762 PyObject *args; 13763 int args_owned; 13764 Py_ssize_t arglen, argidx; 13765 PyObject *dict; 13766 13767 enum PyUnicode_Kind fmtkind; 13768 Py_ssize_t fmtcnt, fmtpos; 13769 void *fmtdata; 13770 PyObject *fmtstr; 13771 13772 _PyUnicodeWriter writer; 13773}; 13774 13775struct unicode_format_arg_t { 13776 Py_UCS4 ch; 13777 int flags; 13778 Py_ssize_t width; 13779 int prec; 13780 int sign; 13781}; 13782 13783static PyObject * 13784unicode_format_getnextarg(struct unicode_formatter_t *ctx) 13785{ 13786 Py_ssize_t argidx = ctx->argidx; 13787 13788 if (argidx < ctx->arglen) { 13789 ctx->argidx++; 13790 if (ctx->arglen < 0) 13791 return ctx->args; 13792 else 13793 return PyTuple_GetItem(ctx->args, argidx); 13794 } 13795 PyErr_SetString(PyExc_TypeError, 13796 "not enough arguments for format string"); 13797 return NULL; 13798} 13799 13800/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13801 13802/* Format a float into the writer if the writer is not NULL, or into *p_output 13803 otherwise. 13804 13805 Return 0 on success, raise an exception and return -1 on error. 
*/ 13806static int 13807formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 13808 PyObject **p_output, 13809 _PyUnicodeWriter *writer) 13810{ 13811 char *p; 13812 double x; 13813 Py_ssize_t len; 13814 int prec; 13815 int dtoa_flags; 13816 13817 x = PyFloat_AsDouble(v); 13818 if (x == -1.0 && PyErr_Occurred()) 13819 return -1; 13820 13821 prec = arg->prec; 13822 if (prec < 0) 13823 prec = 6; 13824 13825 if (arg->flags & F_ALT) 13826 dtoa_flags = Py_DTSF_ALT; 13827 else 13828 dtoa_flags = 0; 13829 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 13830 if (p == NULL) 13831 return -1; 13832 len = strlen(p); 13833 if (writer) { 13834 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { 13835 PyMem_Free(p); 13836 return -1; 13837 } 13838 } 13839 else 13840 *p_output = _PyUnicode_FromASCII(p, len); 13841 PyMem_Free(p); 13842 return 0; 13843} 13844 13845/* formatlong() emulates the format codes d, u, o, x and X, and 13846 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13847 * Python's regular ints. 13848 * Return value: a new PyUnicodeObject*, or NULL if error. 13849 * The output string is of the form 13850 * "-"? ("0x" | "0X")? digit+ 13851 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13852 * set in flags. The case of hex digits will be correct, 13853 * There will be at least prec digits, zero-filled on the left if 13854 * necessary to get that many. 13855 * val object to be converted 13856 * flags bitmask of format flags; only F_ALT is looked at 13857 * prec minimum number of digits; 0-fill on left if needed 13858 * type a character in [duoxX]; u acts the same as d 13859 * 13860 * CAUTION: o, x and X conversions on regular ints can never 13861 * produce a '-' sign, but can for Python's unbounded ints. 
13862 */ 13863PyObject * 13864_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) 13865{ 13866 PyObject *result = NULL; 13867 char *buf; 13868 Py_ssize_t i; 13869 int sign; /* 1 if '-', else 0 */ 13870 int len; /* number of characters */ 13871 Py_ssize_t llen; 13872 int numdigits; /* len == numnondigits + numdigits */ 13873 int numnondigits = 0; 13874 13875 /* Avoid exceeding SSIZE_T_MAX */ 13876 if (prec > INT_MAX-3) { 13877 PyErr_SetString(PyExc_OverflowError, 13878 "precision too large"); 13879 return NULL; 13880 } 13881 13882 assert(PyLong_Check(val)); 13883 13884 switch (type) { 13885 default: 13886 assert(!"'type' not in [diuoxX]"); 13887 case 'd': 13888 case 'i': 13889 case 'u': 13890 /* int and int subclasses should print numerically when a numeric */ 13891 /* format code is used (see issue18780) */ 13892 result = PyNumber_ToBase(val, 10); 13893 break; 13894 case 'o': 13895 numnondigits = 2; 13896 result = PyNumber_ToBase(val, 8); 13897 break; 13898 case 'x': 13899 case 'X': 13900 numnondigits = 2; 13901 result = PyNumber_ToBase(val, 16); 13902 break; 13903 } 13904 if (!result) 13905 return NULL; 13906 13907 assert(unicode_modifiable(result)); 13908 assert(PyUnicode_IS_READY(result)); 13909 assert(PyUnicode_IS_ASCII(result)); 13910 13911 /* To modify the string in-place, there can only be one reference. 
*/ 13912 if (Py_REFCNT(result) != 1) { 13913 Py_DECREF(result); 13914 PyErr_BadInternalCall(); 13915 return NULL; 13916 } 13917 buf = PyUnicode_DATA(result); 13918 llen = PyUnicode_GET_LENGTH(result); 13919 if (llen > INT_MAX) { 13920 Py_DECREF(result); 13921 PyErr_SetString(PyExc_ValueError, 13922 "string too large in _PyUnicode_FormatLong"); 13923 return NULL; 13924 } 13925 len = (int)llen; 13926 sign = buf[0] == '-'; 13927 numnondigits += sign; 13928 numdigits = len - numnondigits; 13929 assert(numdigits > 0); 13930 13931 /* Get rid of base marker unless F_ALT */ 13932 if (((alt) == 0 && 13933 (type == 'o' || type == 'x' || type == 'X'))) { 13934 assert(buf[sign] == '0'); 13935 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 13936 buf[sign+1] == 'o'); 13937 numnondigits -= 2; 13938 buf += 2; 13939 len -= 2; 13940 if (sign) 13941 buf[0] = '-'; 13942 assert(len == numnondigits + numdigits); 13943 assert(numdigits > 0); 13944 } 13945 13946 /* Fill with leading zeroes to meet minimum width. */ 13947 if (prec > numdigits) { 13948 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 13949 numnondigits + prec); 13950 char *b1; 13951 if (!r1) { 13952 Py_DECREF(result); 13953 return NULL; 13954 } 13955 b1 = PyBytes_AS_STRING(r1); 13956 for (i = 0; i < numnondigits; ++i) 13957 *b1++ = *buf++; 13958 for (i = 0; i < prec - numdigits; i++) 13959 *b1++ = '0'; 13960 for (i = 0; i < numdigits; i++) 13961 *b1++ = *buf++; 13962 *b1 = '\0'; 13963 Py_DECREF(result); 13964 result = r1; 13965 buf = PyBytes_AS_STRING(result); 13966 len = numnondigits + prec; 13967 } 13968 13969 /* Fix up case for hex conversions. */ 13970 if (type == 'X') { 13971 /* Need to convert all lower case letters to upper case. 13972 and need to convert 0x to 0X (and -0x to -0X). 
*/ 13973 for (i = 0; i < len; i++) 13974 if (buf[i] >= 'a' && buf[i] <= 'x') 13975 buf[i] -= 'a'-'A'; 13976 } 13977 if (!PyUnicode_Check(result) 13978 || buf != PyUnicode_DATA(result)) { 13979 PyObject *unicode; 13980 unicode = _PyUnicode_FromASCII(buf, len); 13981 Py_DECREF(result); 13982 result = unicode; 13983 } 13984 else if (len != PyUnicode_GET_LENGTH(result)) { 13985 if (PyUnicode_Resize(&result, len) < 0) 13986 Py_CLEAR(result); 13987 } 13988 return result; 13989} 13990 13991/* Format an integer or a float as an integer. 13992 * Return 1 if the number has been formatted into the writer, 13993 * 0 if the number has been formatted into *p_output 13994 * -1 and raise an exception on error */ 13995static int 13996mainformatlong(PyObject *v, 13997 struct unicode_format_arg_t *arg, 13998 PyObject **p_output, 13999 _PyUnicodeWriter *writer) 14000{ 14001 PyObject *iobj, *res; 14002 char type = (char)arg->ch; 14003 14004 if (!PyNumber_Check(v)) 14005 goto wrongtype; 14006 14007 /* make sure number is a type of integer for o, x, and X */ 14008 if (!PyLong_Check(v)) { 14009 if (type == 'o' || type == 'x' || type == 'X') { 14010 iobj = PyNumber_Index(v); 14011 if (iobj == NULL) { 14012 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14013 goto wrongtype; 14014 return -1; 14015 } 14016 } 14017 else { 14018 iobj = PyNumber_Long(v); 14019 if (iobj == NULL ) { 14020 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14021 goto wrongtype; 14022 return -1; 14023 } 14024 } 14025 assert(PyLong_Check(iobj)); 14026 } 14027 else { 14028 iobj = v; 14029 Py_INCREF(iobj); 14030 } 14031 14032 if (PyLong_CheckExact(v) 14033 && arg->width == -1 && arg->prec == -1 14034 && !(arg->flags & (F_SIGN | F_BLANK)) 14035 && type != 'X') 14036 { 14037 /* Fast path */ 14038 int alternate = arg->flags & F_ALT; 14039 int base; 14040 14041 switch(type) 14042 { 14043 default: 14044 assert(0 && "'type' not in [diuoxX]"); 14045 case 'd': 14046 case 'i': 14047 case 'u': 14048 base = 10; 14049 break; 14050 case 
'o': 14051 base = 8; 14052 break; 14053 case 'x': 14054 case 'X': 14055 base = 16; 14056 break; 14057 } 14058 14059 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 14060 Py_DECREF(iobj); 14061 return -1; 14062 } 14063 Py_DECREF(iobj); 14064 return 1; 14065 } 14066 14067 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type); 14068 Py_DECREF(iobj); 14069 if (res == NULL) 14070 return -1; 14071 *p_output = res; 14072 return 0; 14073 14074wrongtype: 14075 switch(type) 14076 { 14077 case 'o': 14078 case 'x': 14079 case 'X': 14080 PyErr_Format(PyExc_TypeError, 14081 "%%%c format: an integer is required, " 14082 "not %.200s", 14083 type, Py_TYPE(v)->tp_name); 14084 break; 14085 default: 14086 PyErr_Format(PyExc_TypeError, 14087 "%%%c format: a number is required, " 14088 "not %.200s", 14089 type, Py_TYPE(v)->tp_name); 14090 break; 14091 } 14092 return -1; 14093} 14094 14095static Py_UCS4 14096formatchar(PyObject *v) 14097{ 14098 /* presume that the buffer is at least 3 characters long */ 14099 if (PyUnicode_Check(v)) { 14100 if (PyUnicode_GET_LENGTH(v) == 1) { 14101 return PyUnicode_READ_CHAR(v, 0); 14102 } 14103 goto onError; 14104 } 14105 else { 14106 PyObject *iobj; 14107 long x; 14108 /* make sure number is a type of integer */ 14109 if (!PyLong_Check(v)) { 14110 iobj = PyNumber_Index(v); 14111 if (iobj == NULL) { 14112 goto onError; 14113 } 14114 v = iobj; 14115 Py_DECREF(iobj); 14116 } 14117 /* Integer input truncated to a character */ 14118 x = PyLong_AsLong(v); 14119 if (x == -1 && PyErr_Occurred()) 14120 goto onError; 14121 14122 if (x < 0 || x > MAX_UNICODE) { 14123 PyErr_SetString(PyExc_OverflowError, 14124 "%c arg not in range(0x110000)"); 14125 return (Py_UCS4) -1; 14126 } 14127 14128 return (Py_UCS4) x; 14129 } 14130 14131 onError: 14132 PyErr_SetString(PyExc_TypeError, 14133 "%c requires int or char"); 14134 return (Py_UCS4) -1; 14135} 14136 14137/* Parse options of an argument: flags, width, precision. 
14138 Handle also "%(name)" syntax. 14139 14140 Return 0 if the argument has been formatted into arg->str. 14141 Return 1 if the argument has been written into ctx->writer, 14142 Raise an exception and return -1 on error. */ 14143static int 14144unicode_format_arg_parse(struct unicode_formatter_t *ctx, 14145 struct unicode_format_arg_t *arg) 14146{ 14147#define FORMAT_READ(ctx) \ 14148 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 14149 14150 PyObject *v; 14151 14152 if (arg->ch == '(') { 14153 /* Get argument value from a dictionary. Example: "%(name)s". */ 14154 Py_ssize_t keystart; 14155 Py_ssize_t keylen; 14156 PyObject *key; 14157 int pcount = 1; 14158 14159 if (ctx->dict == NULL) { 14160 PyErr_SetString(PyExc_TypeError, 14161 "format requires a mapping"); 14162 return -1; 14163 } 14164 ++ctx->fmtpos; 14165 --ctx->fmtcnt; 14166 keystart = ctx->fmtpos; 14167 /* Skip over balanced parentheses */ 14168 while (pcount > 0 && --ctx->fmtcnt >= 0) { 14169 arg->ch = FORMAT_READ(ctx); 14170 if (arg->ch == ')') 14171 --pcount; 14172 else if (arg->ch == '(') 14173 ++pcount; 14174 ctx->fmtpos++; 14175 } 14176 keylen = ctx->fmtpos - keystart - 1; 14177 if (ctx->fmtcnt < 0 || pcount > 0) { 14178 PyErr_SetString(PyExc_ValueError, 14179 "incomplete format key"); 14180 return -1; 14181 } 14182 key = PyUnicode_Substring(ctx->fmtstr, 14183 keystart, keystart + keylen); 14184 if (key == NULL) 14185 return -1; 14186 if (ctx->args_owned) { 14187 Py_DECREF(ctx->args); 14188 ctx->args_owned = 0; 14189 } 14190 ctx->args = PyObject_GetItem(ctx->dict, key); 14191 Py_DECREF(key); 14192 if (ctx->args == NULL) 14193 return -1; 14194 ctx->args_owned = 1; 14195 ctx->arglen = -1; 14196 ctx->argidx = -2; 14197 } 14198 14199 /* Parse flags. Example: "%+i" => flags=F_SIGN. 
*/ 14200 while (--ctx->fmtcnt >= 0) { 14201 arg->ch = FORMAT_READ(ctx); 14202 ctx->fmtpos++; 14203 switch (arg->ch) { 14204 case '-': arg->flags |= F_LJUST; continue; 14205 case '+': arg->flags |= F_SIGN; continue; 14206 case ' ': arg->flags |= F_BLANK; continue; 14207 case '#': arg->flags |= F_ALT; continue; 14208 case '0': arg->flags |= F_ZERO; continue; 14209 } 14210 break; 14211 } 14212 14213 /* Parse width. Example: "%10s" => width=10 */ 14214 if (arg->ch == '*') { 14215 v = unicode_format_getnextarg(ctx); 14216 if (v == NULL) 14217 return -1; 14218 if (!PyLong_Check(v)) { 14219 PyErr_SetString(PyExc_TypeError, 14220 "* wants int"); 14221 return -1; 14222 } 14223 arg->width = PyLong_AsSsize_t(v); 14224 if (arg->width == -1 && PyErr_Occurred()) 14225 return -1; 14226 if (arg->width < 0) { 14227 arg->flags |= F_LJUST; 14228 arg->width = -arg->width; 14229 } 14230 if (--ctx->fmtcnt >= 0) { 14231 arg->ch = FORMAT_READ(ctx); 14232 ctx->fmtpos++; 14233 } 14234 } 14235 else if (arg->ch >= '0' && arg->ch <= '9') { 14236 arg->width = arg->ch - '0'; 14237 while (--ctx->fmtcnt >= 0) { 14238 arg->ch = FORMAT_READ(ctx); 14239 ctx->fmtpos++; 14240 if (arg->ch < '0' || arg->ch > '9') 14241 break; 14242 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 14243 mixing signed and unsigned comparison. Since arg->ch is between 14244 '0' and '9', casting to int is safe. */ 14245 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 14246 PyErr_SetString(PyExc_ValueError, 14247 "width too big"); 14248 return -1; 14249 } 14250 arg->width = arg->width*10 + (arg->ch - '0'); 14251 } 14252 } 14253 14254 /* Parse precision. 
Example: "%.3f" => prec=3 */ 14255 if (arg->ch == '.') { 14256 arg->prec = 0; 14257 if (--ctx->fmtcnt >= 0) { 14258 arg->ch = FORMAT_READ(ctx); 14259 ctx->fmtpos++; 14260 } 14261 if (arg->ch == '*') { 14262 v = unicode_format_getnextarg(ctx); 14263 if (v == NULL) 14264 return -1; 14265 if (!PyLong_Check(v)) { 14266 PyErr_SetString(PyExc_TypeError, 14267 "* wants int"); 14268 return -1; 14269 } 14270 arg->prec = _PyLong_AsInt(v); 14271 if (arg->prec == -1 && PyErr_Occurred()) 14272 return -1; 14273 if (arg->prec < 0) 14274 arg->prec = 0; 14275 if (--ctx->fmtcnt >= 0) { 14276 arg->ch = FORMAT_READ(ctx); 14277 ctx->fmtpos++; 14278 } 14279 } 14280 else if (arg->ch >= '0' && arg->ch <= '9') { 14281 arg->prec = arg->ch - '0'; 14282 while (--ctx->fmtcnt >= 0) { 14283 arg->ch = FORMAT_READ(ctx); 14284 ctx->fmtpos++; 14285 if (arg->ch < '0' || arg->ch > '9') 14286 break; 14287 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 14288 PyErr_SetString(PyExc_ValueError, 14289 "precision too big"); 14290 return -1; 14291 } 14292 arg->prec = arg->prec*10 + (arg->ch - '0'); 14293 } 14294 } 14295 } 14296 14297 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 14298 if (ctx->fmtcnt >= 0) { 14299 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 14300 if (--ctx->fmtcnt >= 0) { 14301 arg->ch = FORMAT_READ(ctx); 14302 ctx->fmtpos++; 14303 } 14304 } 14305 } 14306 if (ctx->fmtcnt < 0) { 14307 PyErr_SetString(PyExc_ValueError, 14308 "incomplete format"); 14309 return -1; 14310 } 14311 return 0; 14312 14313#undef FORMAT_READ 14314} 14315 14316/* Format one argument. Supported conversion specifiers: 14317 14318 - "s", "r", "a": any type 14319 - "i", "d", "u": int or float 14320 - "o", "x", "X": int 14321 - "e", "E", "f", "F", "g", "G": float 14322 - "c": int or str (1 character) 14323 14324 When possible, the output is written directly into the Unicode writer 14325 (ctx->writer). A string is created when padding is required. 
14326 14327 Return 0 if the argument has been formatted into *p_str, 14328 1 if the argument has been written into ctx->writer, 14329 -1 on error. */ 14330static int 14331unicode_format_arg_format(struct unicode_formatter_t *ctx, 14332 struct unicode_format_arg_t *arg, 14333 PyObject **p_str) 14334{ 14335 PyObject *v; 14336 _PyUnicodeWriter *writer = &ctx->writer; 14337 14338 if (ctx->fmtcnt == 0) 14339 ctx->writer.overallocate = 0; 14340 14341 if (arg->ch == '%') { 14342 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 14343 return -1; 14344 return 1; 14345 } 14346 14347 v = unicode_format_getnextarg(ctx); 14348 if (v == NULL) 14349 return -1; 14350 14351 14352 switch (arg->ch) { 14353 case 's': 14354 case 'r': 14355 case 'a': 14356 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { 14357 /* Fast path */ 14358 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) 14359 return -1; 14360 return 1; 14361 } 14362 14363 if (PyUnicode_CheckExact(v) && arg->ch == 's') { 14364 *p_str = v; 14365 Py_INCREF(*p_str); 14366 } 14367 else { 14368 if (arg->ch == 's') 14369 *p_str = PyObject_Str(v); 14370 else if (arg->ch == 'r') 14371 *p_str = PyObject_Repr(v); 14372 else 14373 *p_str = PyObject_ASCII(v); 14374 } 14375 break; 14376 14377 case 'i': 14378 case 'd': 14379 case 'u': 14380 case 'o': 14381 case 'x': 14382 case 'X': 14383 { 14384 int ret = mainformatlong(v, arg, p_str, writer); 14385 if (ret != 0) 14386 return ret; 14387 arg->sign = 1; 14388 break; 14389 } 14390 14391 case 'e': 14392 case 'E': 14393 case 'f': 14394 case 'F': 14395 case 'g': 14396 case 'G': 14397 if (arg->width == -1 && arg->prec == -1 14398 && !(arg->flags & (F_SIGN | F_BLANK))) 14399 { 14400 /* Fast path */ 14401 if (formatfloat(v, arg, NULL, writer) == -1) 14402 return -1; 14403 return 1; 14404 } 14405 14406 arg->sign = 1; 14407 if (formatfloat(v, arg, p_str, NULL) == -1) 14408 return -1; 14409 break; 14410 14411 case 'c': 14412 { 14413 Py_UCS4 ch = formatchar(v); 
14414 if (ch == (Py_UCS4) -1) 14415 return -1; 14416 if (arg->width == -1 && arg->prec == -1) { 14417 /* Fast path */ 14418 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) 14419 return -1; 14420 return 1; 14421 } 14422 *p_str = PyUnicode_FromOrdinal(ch); 14423 break; 14424 } 14425 14426 default: 14427 PyErr_Format(PyExc_ValueError, 14428 "unsupported format character '%c' (0x%x) " 14429 "at index %zd", 14430 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', 14431 (int)arg->ch, 14432 ctx->fmtpos - 1); 14433 return -1; 14434 } 14435 if (*p_str == NULL) 14436 return -1; 14437 assert (PyUnicode_Check(*p_str)); 14438 return 0; 14439} 14440 14441static int 14442unicode_format_arg_output(struct unicode_formatter_t *ctx, 14443 struct unicode_format_arg_t *arg, 14444 PyObject *str) 14445{ 14446 Py_ssize_t len; 14447 enum PyUnicode_Kind kind; 14448 void *pbuf; 14449 Py_ssize_t pindex; 14450 Py_UCS4 signchar; 14451 Py_ssize_t buflen; 14452 Py_UCS4 maxchar; 14453 Py_ssize_t sublen; 14454 _PyUnicodeWriter *writer = &ctx->writer; 14455 Py_UCS4 fill; 14456 14457 fill = ' '; 14458 if (arg->sign && arg->flags & F_ZERO) 14459 fill = '0'; 14460 14461 if (PyUnicode_READY(str) == -1) 14462 return -1; 14463 14464 len = PyUnicode_GET_LENGTH(str); 14465 if ((arg->width == -1 || arg->width <= len) 14466 && (arg->prec == -1 || arg->prec >= len) 14467 && !(arg->flags & (F_SIGN | F_BLANK))) 14468 { 14469 /* Fast path */ 14470 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) 14471 return -1; 14472 return 0; 14473 } 14474 14475 /* Truncate the string for "s", "r" and "a" formats 14476 if the precision is set */ 14477 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { 14478 if (arg->prec >= 0 && len > arg->prec) 14479 len = arg->prec; 14480 } 14481 14482 /* Adjust sign and width */ 14483 kind = PyUnicode_KIND(str); 14484 pbuf = PyUnicode_DATA(str); 14485 pindex = 0; 14486 signchar = '\0'; 14487 if (arg->sign) { 14488 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 14489 if (ch 
== '-' || ch == '+') { 14490 signchar = ch; 14491 len--; 14492 pindex++; 14493 } 14494 else if (arg->flags & F_SIGN) 14495 signchar = '+'; 14496 else if (arg->flags & F_BLANK) 14497 signchar = ' '; 14498 else 14499 arg->sign = 0; 14500 } 14501 if (arg->width < len) 14502 arg->width = len; 14503 14504 /* Prepare the writer */ 14505 maxchar = writer->maxchar; 14506 if (!(arg->flags & F_LJUST)) { 14507 if (arg->sign) { 14508 if ((arg->width-1) > len) 14509 maxchar = Py_MAX(maxchar, fill); 14510 } 14511 else { 14512 if (arg->width > len) 14513 maxchar = Py_MAX(maxchar, fill); 14514 } 14515 } 14516 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { 14517 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); 14518 maxchar = Py_MAX(maxchar, strmaxchar); 14519 } 14520 14521 buflen = arg->width; 14522 if (arg->sign && len == arg->width) 14523 buflen++; 14524 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) 14525 return -1; 14526 14527 /* Write the sign if needed */ 14528 if (arg->sign) { 14529 if (fill != ' ') { 14530 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14531 writer->pos += 1; 14532 } 14533 if (arg->width > len) 14534 arg->width--; 14535 } 14536 14537 /* Write the numeric prefix for "x", "X" and "o" formats 14538 if the alternate form is used. 14539 For example, write "0x" for the "%#x" format. 
*/ 14540 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14541 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14542 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); 14543 if (fill != ' ') { 14544 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14545 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14546 writer->pos += 2; 14547 pindex += 2; 14548 } 14549 arg->width -= 2; 14550 if (arg->width < 0) 14551 arg->width = 0; 14552 len -= 2; 14553 } 14554 14555 /* Pad left with the fill character if needed */ 14556 if (arg->width > len && !(arg->flags & F_LJUST)) { 14557 sublen = arg->width - len; 14558 FILL(writer->kind, writer->data, fill, writer->pos, sublen); 14559 writer->pos += sublen; 14560 arg->width = len; 14561 } 14562 14563 /* If padding with spaces: write sign if needed and/or numeric prefix if 14564 the alternate form is used */ 14565 if (fill == ' ') { 14566 if (arg->sign) { 14567 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14568 writer->pos += 1; 14569 } 14570 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14571 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14572 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); 14573 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14574 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14575 writer->pos += 2; 14576 pindex += 2; 14577 } 14578 } 14579 14580 /* Write characters */ 14581 if (len) { 14582 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 14583 str, pindex, len); 14584 writer->pos += len; 14585 } 14586 14587 /* Pad right with the fill character if needed */ 14588 if (arg->width > len) { 14589 sublen = arg->width - len; 14590 FILL(writer->kind, writer->data, ' ', writer->pos, sublen); 14591 writer->pos += sublen; 14592 } 14593 return 0; 14594} 14595 14596/* Helper of PyUnicode_Format(): format one arg. 
14597 Return 0 on success, raise an exception and return -1 on error. */ 14598static int 14599unicode_format_arg(struct unicode_formatter_t *ctx) 14600{ 14601 struct unicode_format_arg_t arg; 14602 PyObject *str; 14603 int ret; 14604 14605 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); 14606 arg.flags = 0; 14607 arg.width = -1; 14608 arg.prec = -1; 14609 arg.sign = 0; 14610 str = NULL; 14611 14612 ret = unicode_format_arg_parse(ctx, &arg); 14613 if (ret == -1) 14614 return -1; 14615 14616 ret = unicode_format_arg_format(ctx, &arg, &str); 14617 if (ret == -1) 14618 return -1; 14619 14620 if (ret != 1) { 14621 ret = unicode_format_arg_output(ctx, &arg, str); 14622 Py_DECREF(str); 14623 if (ret == -1) 14624 return -1; 14625 } 14626 14627 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') { 14628 PyErr_SetString(PyExc_TypeError, 14629 "not all arguments converted during string formatting"); 14630 return -1; 14631 } 14632 return 0; 14633} 14634 14635PyObject * 14636PyUnicode_Format(PyObject *format, PyObject *args) 14637{ 14638 struct unicode_formatter_t ctx; 14639 14640 if (format == NULL || args == NULL) { 14641 PyErr_BadInternalCall(); 14642 return NULL; 14643 } 14644 14645 ctx.fmtstr = PyUnicode_FromObject(format); 14646 if (ctx.fmtstr == NULL) 14647 return NULL; 14648 if (PyUnicode_READY(ctx.fmtstr) == -1) { 14649 Py_DECREF(ctx.fmtstr); 14650 return NULL; 14651 } 14652 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 14653 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 14654 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 14655 ctx.fmtpos = 0; 14656 14657 _PyUnicodeWriter_Init(&ctx.writer); 14658 ctx.writer.min_length = ctx.fmtcnt + 100; 14659 ctx.writer.overallocate = 1; 14660 14661 if (PyTuple_Check(args)) { 14662 ctx.arglen = PyTuple_Size(args); 14663 ctx.argidx = 0; 14664 } 14665 else { 14666 ctx.arglen = -1; 14667 ctx.argidx = -2; 14668 } 14669 ctx.args_owned = 0; 14670 if (PyMapping_Check(args) && !PyTuple_Check(args) && 
!PyUnicode_Check(args)) 14671 ctx.dict = args; 14672 else 14673 ctx.dict = NULL; 14674 ctx.args = args; 14675 14676 while (--ctx.fmtcnt >= 0) { 14677 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14678 Py_ssize_t nonfmtpos; 14679 14680 nonfmtpos = ctx.fmtpos++; 14681 while (ctx.fmtcnt >= 0 && 14682 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14683 ctx.fmtpos++; 14684 ctx.fmtcnt--; 14685 } 14686 if (ctx.fmtcnt < 0) { 14687 ctx.fmtpos--; 14688 ctx.writer.overallocate = 0; 14689 } 14690 14691 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, 14692 nonfmtpos, ctx.fmtpos) < 0) 14693 goto onError; 14694 } 14695 else { 14696 ctx.fmtpos++; 14697 if (unicode_format_arg(&ctx) == -1) 14698 goto onError; 14699 } 14700 } 14701 14702 if (ctx.argidx < ctx.arglen && !ctx.dict) { 14703 PyErr_SetString(PyExc_TypeError, 14704 "not all arguments converted during string formatting"); 14705 goto onError; 14706 } 14707 14708 if (ctx.args_owned) { 14709 Py_DECREF(ctx.args); 14710 } 14711 Py_DECREF(ctx.fmtstr); 14712 return _PyUnicodeWriter_Finish(&ctx.writer); 14713 14714 onError: 14715 Py_DECREF(ctx.fmtstr); 14716 _PyUnicodeWriter_Dealloc(&ctx.writer); 14717 if (ctx.args_owned) { 14718 Py_DECREF(ctx.args); 14719 } 14720 return NULL; 14721} 14722 14723static PyObject * 14724unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 14725 14726static PyObject * 14727unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14728{ 14729 PyObject *x = NULL; 14730 static char *kwlist[] = {"object", "encoding", "errors", 0}; 14731 char *encoding = NULL; 14732 char *errors = NULL; 14733 14734 if (type != &PyUnicode_Type) 14735 return unicode_subtype_new(type, args, kwds); 14736 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 14737 kwlist, &x, &encoding, &errors)) 14738 return NULL; 14739 if (x == NULL) 14740 _Py_RETURN_UNICODE_EMPTY(); 14741 if (encoding == NULL && errors == NULL) 14742 return PyObject_Str(x); 14743 
else 14744 return PyUnicode_FromEncodedObject(x, encoding, errors); 14745} 14746 14747static PyObject * 14748unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14749{ 14750 PyObject *unicode, *self; 14751 Py_ssize_t length, char_size; 14752 int share_wstr, share_utf8; 14753 unsigned int kind; 14754 void *data; 14755 14756 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 14757 14758 unicode = unicode_new(&PyUnicode_Type, args, kwds); 14759 if (unicode == NULL) 14760 return NULL; 14761 assert(_PyUnicode_CHECK(unicode)); 14762 if (PyUnicode_READY(unicode) == -1) { 14763 Py_DECREF(unicode); 14764 return NULL; 14765 } 14766 14767 self = type->tp_alloc(type, 0); 14768 if (self == NULL) { 14769 Py_DECREF(unicode); 14770 return NULL; 14771 } 14772 kind = PyUnicode_KIND(unicode); 14773 length = PyUnicode_GET_LENGTH(unicode); 14774 14775 _PyUnicode_LENGTH(self) = length; 14776#ifdef Py_DEBUG 14777 _PyUnicode_HASH(self) = -1; 14778#else 14779 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14780#endif 14781 _PyUnicode_STATE(self).interned = 0; 14782 _PyUnicode_STATE(self).kind = kind; 14783 _PyUnicode_STATE(self).compact = 0; 14784 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14785 _PyUnicode_STATE(self).ready = 1; 14786 _PyUnicode_WSTR(self) = NULL; 14787 _PyUnicode_UTF8_LENGTH(self) = 0; 14788 _PyUnicode_UTF8(self) = NULL; 14789 _PyUnicode_WSTR_LENGTH(self) = 0; 14790 _PyUnicode_DATA_ANY(self) = NULL; 14791 14792 share_utf8 = 0; 14793 share_wstr = 0; 14794 if (kind == PyUnicode_1BYTE_KIND) { 14795 char_size = 1; 14796 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14797 share_utf8 = 1; 14798 } 14799 else if (kind == PyUnicode_2BYTE_KIND) { 14800 char_size = 2; 14801 if (sizeof(wchar_t) == 2) 14802 share_wstr = 1; 14803 } 14804 else { 14805 assert(kind == PyUnicode_4BYTE_KIND); 14806 char_size = 4; 14807 if (sizeof(wchar_t) == 4) 14808 share_wstr = 1; 14809 } 14810 14811 /* Ensure we won't overflow the length. 
*/ 14812 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14813 PyErr_NoMemory(); 14814 goto onError; 14815 } 14816 data = PyObject_MALLOC((length + 1) * char_size); 14817 if (data == NULL) { 14818 PyErr_NoMemory(); 14819 goto onError; 14820 } 14821 14822 _PyUnicode_DATA_ANY(self) = data; 14823 if (share_utf8) { 14824 _PyUnicode_UTF8_LENGTH(self) = length; 14825 _PyUnicode_UTF8(self) = data; 14826 } 14827 if (share_wstr) { 14828 _PyUnicode_WSTR_LENGTH(self) = length; 14829 _PyUnicode_WSTR(self) = (wchar_t *)data; 14830 } 14831 14832 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14833 kind * (length + 1)); 14834 assert(_PyUnicode_CheckConsistency(self, 1)); 14835#ifdef Py_DEBUG 14836 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14837#endif 14838 Py_DECREF(unicode); 14839 return self; 14840 14841onError: 14842 Py_DECREF(unicode); 14843 Py_DECREF(self); 14844 return NULL; 14845} 14846 14847PyDoc_STRVAR(unicode_doc, 14848"str(object='') -> str\n\ 14849str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 14850\n\ 14851Create a new string object from the given object. 
If encoding or\n\ 14852errors is specified, then the object must expose a data buffer\n\ 14853that will be decoded using the given encoding and error handler.\n\ 14854Otherwise, returns the result of object.__str__() (if defined)\n\ 14855or repr(object).\n\ 14856encoding defaults to sys.getdefaultencoding().\n\ 14857errors defaults to 'strict'."); 14858 14859static PyObject *unicode_iter(PyObject *seq); 14860 14861PyTypeObject PyUnicode_Type = { 14862 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14863 "str", /* tp_name */ 14864 sizeof(PyUnicodeObject), /* tp_size */ 14865 0, /* tp_itemsize */ 14866 /* Slots */ 14867 (destructor)unicode_dealloc, /* tp_dealloc */ 14868 0, /* tp_print */ 14869 0, /* tp_getattr */ 14870 0, /* tp_setattr */ 14871 0, /* tp_reserved */ 14872 unicode_repr, /* tp_repr */ 14873 &unicode_as_number, /* tp_as_number */ 14874 &unicode_as_sequence, /* tp_as_sequence */ 14875 &unicode_as_mapping, /* tp_as_mapping */ 14876 (hashfunc) unicode_hash, /* tp_hash*/ 14877 0, /* tp_call*/ 14878 (reprfunc) unicode_str, /* tp_str */ 14879 PyObject_GenericGetAttr, /* tp_getattro */ 14880 0, /* tp_setattro */ 14881 0, /* tp_as_buffer */ 14882 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14883 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14884 unicode_doc, /* tp_doc */ 14885 0, /* tp_traverse */ 14886 0, /* tp_clear */ 14887 PyUnicode_RichCompare, /* tp_richcompare */ 14888 0, /* tp_weaklistoffset */ 14889 unicode_iter, /* tp_iter */ 14890 0, /* tp_iternext */ 14891 unicode_methods, /* tp_methods */ 14892 0, /* tp_members */ 14893 0, /* tp_getset */ 14894 &PyBaseObject_Type, /* tp_base */ 14895 0, /* tp_dict */ 14896 0, /* tp_descr_get */ 14897 0, /* tp_descr_set */ 14898 0, /* tp_dictoffset */ 14899 0, /* tp_init */ 14900 0, /* tp_alloc */ 14901 unicode_new, /* tp_new */ 14902 PyObject_Del, /* tp_free */ 14903}; 14904 14905/* Initialize the Unicode implementation */ 14906 14907int _PyUnicode_Init(void) 14908{ 14909 /* XXX - move this array to unicodectype.c ? 
*/ 14910 Py_UCS2 linebreak[] = { 14911 0x000A, /* LINE FEED */ 14912 0x000D, /* CARRIAGE RETURN */ 14913 0x001C, /* FILE SEPARATOR */ 14914 0x001D, /* GROUP SEPARATOR */ 14915 0x001E, /* RECORD SEPARATOR */ 14916 0x0085, /* NEXT LINE */ 14917 0x2028, /* LINE SEPARATOR */ 14918 0x2029, /* PARAGRAPH SEPARATOR */ 14919 }; 14920 14921 /* Init the implementation */ 14922 _Py_INCREF_UNICODE_EMPTY(); 14923 if (!unicode_empty) 14924 Py_FatalError("Can't create empty string"); 14925 Py_DECREF(unicode_empty); 14926 14927 if (PyType_Ready(&PyUnicode_Type) < 0) 14928 Py_FatalError("Can't initialize 'unicode'"); 14929 14930 /* initialize the linebreak bloom filter */ 14931 bloom_linebreak = make_bloom_mask( 14932 PyUnicode_2BYTE_KIND, linebreak, 14933 Py_ARRAY_LENGTH(linebreak)); 14934 14935 if (PyType_Ready(&EncodingMapType) < 0) 14936 Py_FatalError("Can't initialize encoding map type"); 14937 14938 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 14939 Py_FatalError("Can't initialize field name iterator type"); 14940 14941 if (PyType_Ready(&PyFormatterIter_Type) < 0) 14942 Py_FatalError("Can't initialize formatter iter type"); 14943 14944 return 0; 14945} 14946 14947/* Finalize the Unicode implementation */ 14948 14949int 14950PyUnicode_ClearFreeList(void) 14951{ 14952 return 0; 14953} 14954 14955void 14956_PyUnicode_Fini(void) 14957{ 14958 int i; 14959 14960 Py_CLEAR(unicode_empty); 14961 14962 for (i = 0; i < 256; i++) 14963 Py_CLEAR(unicode_latin1[i]); 14964 _PyUnicode_ClearStaticStrings(); 14965 (void)PyUnicode_ClearFreeList(); 14966} 14967 14968void 14969PyUnicode_InternInPlace(PyObject **p) 14970{ 14971 PyObject *s = *p; 14972 PyObject *t; 14973#ifdef Py_DEBUG 14974 assert(s != NULL); 14975 assert(_PyUnicode_CHECK(s)); 14976#else 14977 if (s == NULL || !PyUnicode_Check(s)) 14978 return; 14979#endif 14980 /* If it's a subclass, we don't really know what putting 14981 it in the interned dict might do. 
*/ 14982 if (!PyUnicode_CheckExact(s)) 14983 return; 14984 if (PyUnicode_CHECK_INTERNED(s)) 14985 return; 14986 if (interned == NULL) { 14987 interned = PyDict_New(); 14988 if (interned == NULL) { 14989 PyErr_Clear(); /* Don't leave an exception */ 14990 return; 14991 } 14992 } 14993 /* It might be that the GetItem call fails even 14994 though the key is present in the dictionary, 14995 namely when this happens during a stack overflow. */ 14996 Py_ALLOW_RECURSION 14997 t = PyDict_GetItem(interned, s); 14998 Py_END_ALLOW_RECURSION 14999 15000 if (t) { 15001 Py_INCREF(t); 15002 Py_DECREF(*p); 15003 *p = t; 15004 return; 15005 } 15006 15007 PyThreadState_GET()->recursion_critical = 1; 15008 if (PyDict_SetItem(interned, s, s) < 0) { 15009 PyErr_Clear(); 15010 PyThreadState_GET()->recursion_critical = 0; 15011 return; 15012 } 15013 PyThreadState_GET()->recursion_critical = 0; 15014 /* The two references in interned are not counted by refcnt. 15015 The deallocator will take care of this */ 15016 Py_REFCNT(s) -= 2; 15017 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 15018} 15019 15020void 15021PyUnicode_InternImmortal(PyObject **p) 15022{ 15023 PyUnicode_InternInPlace(p); 15024 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 15025 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 15026 Py_INCREF(*p); 15027 } 15028} 15029 15030PyObject * 15031PyUnicode_InternFromString(const char *cp) 15032{ 15033 PyObject *s = PyUnicode_FromString(cp); 15034 if (s == NULL) 15035 return NULL; 15036 PyUnicode_InternInPlace(&s); 15037 return s; 15038} 15039 15040void 15041_Py_ReleaseInternedUnicodeStrings(void) 15042{ 15043 PyObject *keys; 15044 PyObject *s; 15045 Py_ssize_t i, n; 15046 Py_ssize_t immortal_size = 0, mortal_size = 0; 15047 15048 if (interned == NULL || !PyDict_Check(interned)) 15049 return; 15050 keys = PyDict_Keys(interned); 15051 if (keys == NULL || !PyList_Check(keys)) { 15052 PyErr_Clear(); 15053 return; 15054 } 15055 15056 /* Since 
_Py_ReleaseInternedUnicodeStrings() is intended to help a leak 15057 detector, interned unicode strings are not forcibly deallocated; 15058 rather, we give them their stolen references back, and then clear 15059 and DECREF the interned dict. */ 15060 15061 n = PyList_GET_SIZE(keys); 15062 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 15063 n); 15064 for (i = 0; i < n; i++) { 15065 s = PyList_GET_ITEM(keys, i); 15066 if (PyUnicode_READY(s) == -1) { 15067 assert(0 && "could not ready string"); 15068 fprintf(stderr, "could not ready string\n"); 15069 } 15070 switch (PyUnicode_CHECK_INTERNED(s)) { 15071 case SSTATE_NOT_INTERNED: 15072 /* XXX Shouldn't happen */ 15073 break; 15074 case SSTATE_INTERNED_IMMORTAL: 15075 Py_REFCNT(s) += 1; 15076 immortal_size += PyUnicode_GET_LENGTH(s); 15077 break; 15078 case SSTATE_INTERNED_MORTAL: 15079 Py_REFCNT(s) += 2; 15080 mortal_size += PyUnicode_GET_LENGTH(s); 15081 break; 15082 default: 15083 Py_FatalError("Inconsistent interned string state."); 15084 } 15085 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 15086 } 15087 fprintf(stderr, "total size of all interned strings: " 15088 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 15089 "mortal/immortal\n", mortal_size, immortal_size); 15090 Py_DECREF(keys); 15091 PyDict_Clear(interned); 15092 Py_CLEAR(interned); 15093} 15094 15095 15096/********************* Unicode Iterator **************************/ 15097 15098typedef struct { 15099 PyObject_HEAD 15100 Py_ssize_t it_index; 15101 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 15102} unicodeiterobject; 15103 15104static void 15105unicodeiter_dealloc(unicodeiterobject *it) 15106{ 15107 _PyObject_GC_UNTRACK(it); 15108 Py_XDECREF(it->it_seq); 15109 PyObject_GC_Del(it); 15110} 15111 15112static int 15113unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 15114{ 15115 Py_VISIT(it->it_seq); 15116 return 0; 15117} 15118 15119static PyObject * 
15120unicodeiter_next(unicodeiterobject *it) 15121{ 15122 PyObject *seq, *item; 15123 15124 assert(it != NULL); 15125 seq = it->it_seq; 15126 if (seq == NULL) 15127 return NULL; 15128 assert(_PyUnicode_CHECK(seq)); 15129 15130 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 15131 int kind = PyUnicode_KIND(seq); 15132 void *data = PyUnicode_DATA(seq); 15133 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 15134 item = PyUnicode_FromOrdinal(chr); 15135 if (item != NULL) 15136 ++it->it_index; 15137 return item; 15138 } 15139 15140 Py_DECREF(seq); 15141 it->it_seq = NULL; 15142 return NULL; 15143} 15144 15145static PyObject * 15146unicodeiter_len(unicodeiterobject *it) 15147{ 15148 Py_ssize_t len = 0; 15149 if (it->it_seq) 15150 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 15151 return PyLong_FromSsize_t(len); 15152} 15153 15154PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 15155 15156static PyObject * 15157unicodeiter_reduce(unicodeiterobject *it) 15158{ 15159 if (it->it_seq != NULL) { 15160 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 15161 it->it_seq, it->it_index); 15162 } else { 15163 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 15164 if (u == NULL) 15165 return NULL; 15166 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 15167 } 15168} 15169 15170PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 15171 15172static PyObject * 15173unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 15174{ 15175 Py_ssize_t index = PyLong_AsSsize_t(state); 15176 if (index == -1 && PyErr_Occurred()) 15177 return NULL; 15178 if (it->it_seq != NULL) { 15179 if (index < 0) 15180 index = 0; 15181 else if (index > PyUnicode_GET_LENGTH(it->it_seq)) 15182 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */ 15183 it->it_index = index; 15184 } 15185 Py_RETURN_NONE; 15186} 15187 15188PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 15189 
15190static PyMethodDef unicodeiter_methods[] = { 15191 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 15192 length_hint_doc}, 15193 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 15194 reduce_doc}, 15195 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 15196 setstate_doc}, 15197 {NULL, NULL} /* sentinel */ 15198}; 15199 15200PyTypeObject PyUnicodeIter_Type = { 15201 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15202 "str_iterator", /* tp_name */ 15203 sizeof(unicodeiterobject), /* tp_basicsize */ 15204 0, /* tp_itemsize */ 15205 /* methods */ 15206 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 15207 0, /* tp_print */ 15208 0, /* tp_getattr */ 15209 0, /* tp_setattr */ 15210 0, /* tp_reserved */ 15211 0, /* tp_repr */ 15212 0, /* tp_as_number */ 15213 0, /* tp_as_sequence */ 15214 0, /* tp_as_mapping */ 15215 0, /* tp_hash */ 15216 0, /* tp_call */ 15217 0, /* tp_str */ 15218 PyObject_GenericGetAttr, /* tp_getattro */ 15219 0, /* tp_setattro */ 15220 0, /* tp_as_buffer */ 15221 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 15222 0, /* tp_doc */ 15223 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 15224 0, /* tp_clear */ 15225 0, /* tp_richcompare */ 15226 0, /* tp_weaklistoffset */ 15227 PyObject_SelfIter, /* tp_iter */ 15228 (iternextfunc)unicodeiter_next, /* tp_iternext */ 15229 unicodeiter_methods, /* tp_methods */ 15230 0, 15231}; 15232 15233static PyObject * 15234unicode_iter(PyObject *seq) 15235{ 15236 unicodeiterobject *it; 15237 15238 if (!PyUnicode_Check(seq)) { 15239 PyErr_BadInternalCall(); 15240 return NULL; 15241 } 15242 if (PyUnicode_READY(seq) == -1) 15243 return NULL; 15244 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 15245 if (it == NULL) 15246 return NULL; 15247 it->it_index = 0; 15248 Py_INCREF(seq); 15249 it->it_seq = seq; 15250 _PyObject_GC_TRACK(it); 15251 return (PyObject *)it; 15252} 15253 15254 15255size_t 15256Py_UNICODE_strlen(const Py_UNICODE *u) 15257{ 15258 int res 
= 0; 15259 while(*u++) 15260 res++; 15261 return res; 15262} 15263 15264Py_UNICODE* 15265Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 15266{ 15267 Py_UNICODE *u = s1; 15268 while ((*u++ = *s2++)); 15269 return s1; 15270} 15271 15272Py_UNICODE* 15273Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15274{ 15275 Py_UNICODE *u = s1; 15276 while ((*u++ = *s2++)) 15277 if (n-- == 0) 15278 break; 15279 return s1; 15280} 15281 15282Py_UNICODE* 15283Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 15284{ 15285 Py_UNICODE *u1 = s1; 15286 u1 += Py_UNICODE_strlen(u1); 15287 Py_UNICODE_strcpy(u1, s2); 15288 return s1; 15289} 15290 15291int 15292Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 15293{ 15294 while (*s1 && *s2 && *s1 == *s2) 15295 s1++, s2++; 15296 if (*s1 && *s2) 15297 return (*s1 < *s2) ? -1 : +1; 15298 if (*s1) 15299 return 1; 15300 if (*s2) 15301 return -1; 15302 return 0; 15303} 15304 15305int 15306Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15307{ 15308 Py_UNICODE u1, u2; 15309 for (; n != 0; n--) { 15310 u1 = *s1; 15311 u2 = *s2; 15312 if (u1 != u2) 15313 return (u1 < u2) ? 
-1 : +1; 15314 if (u1 == '\0') 15315 return 0; 15316 s1++; 15317 s2++; 15318 } 15319 return 0; 15320} 15321 15322Py_UNICODE* 15323Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 15324{ 15325 const Py_UNICODE *p; 15326 for (p = s; *p; p++) 15327 if (*p == c) 15328 return (Py_UNICODE*)p; 15329 return NULL; 15330} 15331 15332Py_UNICODE* 15333Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 15334{ 15335 const Py_UNICODE *p; 15336 p = s + Py_UNICODE_strlen(s); 15337 while (p != s) { 15338 p--; 15339 if (*p == c) 15340 return (Py_UNICODE*)p; 15341 } 15342 return NULL; 15343} 15344 15345Py_UNICODE* 15346PyUnicode_AsUnicodeCopy(PyObject *unicode) 15347{ 15348 Py_UNICODE *u, *copy; 15349 Py_ssize_t len, size; 15350 15351 if (!PyUnicode_Check(unicode)) { 15352 PyErr_BadArgument(); 15353 return NULL; 15354 } 15355 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 15356 if (u == NULL) 15357 return NULL; 15358 /* Ensure we won't overflow the size. */ 15359 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 15360 PyErr_NoMemory(); 15361 return NULL; 15362 } 15363 size = len + 1; /* copy the null character */ 15364 size *= sizeof(Py_UNICODE); 15365 copy = PyMem_Malloc(size); 15366 if (copy == NULL) { 15367 PyErr_NoMemory(); 15368 return NULL; 15369 } 15370 memcpy(copy, u, size); 15371 return copy; 15372} 15373 15374/* A _string module, to export formatter_parser and formatter_field_name_split 15375 to the string.Formatter class implemented in Python. 
*/ 15376 15377static PyMethodDef _string_methods[] = { 15378 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 15379 METH_O, PyDoc_STR("split the argument as a field name")}, 15380 {"formatter_parser", (PyCFunction) formatter_parser, 15381 METH_O, PyDoc_STR("parse the argument as a format string")}, 15382 {NULL, NULL} 15383}; 15384 15385static struct PyModuleDef _string_module = { 15386 PyModuleDef_HEAD_INIT, 15387 "_string", 15388 PyDoc_STR("string helper module"), 15389 0, 15390 _string_methods, 15391 NULL, 15392 NULL, 15393 NULL, 15394 NULL 15395}; 15396 15397PyMODINIT_FUNC 15398PyInit__string(void) 15399{ 15400 return PyModule_Create(&_string_module); 15401} 15402 15403 15404#ifdef __cplusplus 15405} 15406#endif 15407