unicodeobject.c revision f6d1f1fa8a503f218a2103ba1e6768c6cfdb7c50
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/*[clinic input] 51class str "PyUnicodeObject *" "&PyUnicode_Type" 52[clinic start generated code]*/ 53/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/ 54 55/* --- Globals ------------------------------------------------------------ 56 57NOTE: In the interpreter's initialization phase, some globals are currently 58 initialized dynamically as needed. In the process Unicode objects may 59 be created before the Unicode type is ready. 60 61*/ 62 63 64#ifdef __cplusplus 65extern "C" { 66#endif 67 68/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 69#define MAX_UNICODE 0x10ffff 70 71#ifdef Py_DEBUG 72# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 73#else 74# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 75#endif 76 77#define _PyUnicode_UTF8(op) \ 78 (((PyCompactUnicodeObject*)(op))->utf8) 79#define PyUnicode_UTF8(op) \ 80 (assert(_PyUnicode_CHECK(op)), \ 81 assert(PyUnicode_IS_READY(op)), \ 82 PyUnicode_IS_COMPACT_ASCII(op) ? \ 83 ((char*)((PyASCIIObject*)(op) + 1)) : \ 84 _PyUnicode_UTF8(op)) 85#define _PyUnicode_UTF8_LENGTH(op) \ 86 (((PyCompactUnicodeObject*)(op))->utf8_length) 87#define PyUnicode_UTF8_LENGTH(op) \ 88 (assert(_PyUnicode_CHECK(op)), \ 89 assert(PyUnicode_IS_READY(op)), \ 90 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 91 ((PyASCIIObject*)(op))->length : \ 92 _PyUnicode_UTF8_LENGTH(op)) 93#define _PyUnicode_WSTR(op) \ 94 (((PyASCIIObject*)(op))->wstr) 95#define _PyUnicode_WSTR_LENGTH(op) \ 96 (((PyCompactUnicodeObject*)(op))->wstr_length) 97#define _PyUnicode_LENGTH(op) \ 98 (((PyASCIIObject *)(op))->length) 99#define _PyUnicode_STATE(op) \ 100 (((PyASCIIObject *)(op))->state) 101#define _PyUnicode_HASH(op) \ 102 (((PyASCIIObject *)(op))->hash) 103#define _PyUnicode_KIND(op) \ 104 (assert(_PyUnicode_CHECK(op)), \ 105 ((PyASCIIObject *)(op))->state.kind) 106#define _PyUnicode_GET_LENGTH(op) \ 107 (assert(_PyUnicode_CHECK(op)), \ 108 ((PyASCIIObject *)(op))->length) 109#define _PyUnicode_DATA_ANY(op) \ 110 (((PyUnicodeObject*)(op))->data.any) 111 112#undef PyUnicode_READY 113#define PyUnicode_READY(op) \ 114 (assert(_PyUnicode_CHECK(op)), \ 115 (PyUnicode_IS_READY(op) ? \ 116 0 : \ 117 _PyUnicode_Ready(op))) 118 119#define _PyUnicode_SHARE_UTF8(op) \ 120 (assert(_PyUnicode_CHECK(op)), \ 121 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 122 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 123#define _PyUnicode_SHARE_WSTR(op) \ 124 (assert(_PyUnicode_CHECK(op)), \ 125 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 126 127/* true if the Unicode object has an allocated UTF-8 memory block 128 (not shared with other data) */ 129#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 130 ((!PyUnicode_IS_COMPACT_ASCII(op) \ 131 && _PyUnicode_UTF8(op) \ 132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 133 134/* true if the Unicode object has an allocated wstr memory block 135 (not shared with other data) */ 136#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 137 ((_PyUnicode_WSTR(op) && \ 138 (!PyUnicode_IS_READY(op) || \ 139 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 140 141/* Generic helper macro to convert characters of different types. 142 from_type and to_type have to be valid type names, begin and end 143 are pointers to the source characters which should be of type 144 "from_type *". 
to is a pointer of type "to_type *" and points to the 145 buffer where the result characters are written to. */ 146#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 147 do { \ 148 to_type *_to = (to_type *)(to); \ 149 const from_type *_iter = (from_type *)(begin); \ 150 const from_type *_end = (from_type *)(end); \ 151 Py_ssize_t n = (_end) - (_iter); \ 152 const from_type *_unrolled_end = \ 153 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 154 while (_iter < (_unrolled_end)) { \ 155 _to[0] = (to_type) _iter[0]; \ 156 _to[1] = (to_type) _iter[1]; \ 157 _to[2] = (to_type) _iter[2]; \ 158 _to[3] = (to_type) _iter[3]; \ 159 _iter += 4; _to += 4; \ 160 } \ 161 while (_iter < (_end)) \ 162 *_to++ = (to_type) *_iter++; \ 163 } while (0) 164 165/* This dictionary holds all interned unicode strings. Note that references 166 to strings in this dictionary are *not* counted in the string's ob_refcnt. 167 When the interned string reaches a refcnt of 0 the string deallocation 168 function will delete the reference from this dictionary. 169 170 Another way to look at this is that to say that the actual reference 171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 172*/ 173static PyObject *interned = NULL; 174 175/* The empty Unicode object is shared to improve performance. */ 176static PyObject *unicode_empty = NULL; 177 178#define _Py_INCREF_UNICODE_EMPTY() \ 179 do { \ 180 if (unicode_empty != NULL) \ 181 Py_INCREF(unicode_empty); \ 182 else { \ 183 unicode_empty = PyUnicode_New(0, 0); \ 184 if (unicode_empty != NULL) { \ 185 Py_INCREF(unicode_empty); \ 186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ 187 } \ 188 } \ 189 } while (0) 190 191#define _Py_RETURN_UNICODE_EMPTY() \ 192 do { \ 193 _Py_INCREF_UNICODE_EMPTY(); \ 194 return unicode_empty; \ 195 } while (0) 196 197/* Forward declaration */ 198Py_LOCAL_INLINE(int) 199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); 200 201/* List of static strings. 
*/ 202static _Py_Identifier *static_strings = NULL; 203 204/* Single character Unicode strings in the Latin-1 range are being 205 shared as well. */ 206static PyObject *unicode_latin1[256] = {NULL}; 207 208/* Fast detection of the most frequent whitespace characters */ 209const unsigned char _Py_ascii_whitespace[] = { 210 0, 0, 0, 0, 0, 0, 0, 0, 211/* case 0x0009: * CHARACTER TABULATION */ 212/* case 0x000A: * LINE FEED */ 213/* case 0x000B: * LINE TABULATION */ 214/* case 0x000C: * FORM FEED */ 215/* case 0x000D: * CARRIAGE RETURN */ 216 0, 1, 1, 1, 1, 1, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218/* case 0x001C: * FILE SEPARATOR */ 219/* case 0x001D: * GROUP SEPARATOR */ 220/* case 0x001E: * RECORD SEPARATOR */ 221/* case 0x001F: * UNIT SEPARATOR */ 222 0, 0, 0, 0, 1, 1, 1, 1, 223/* case 0x0020: * SPACE */ 224 1, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 229 0, 0, 0, 0, 0, 0, 0, 0, 230 0, 0, 0, 0, 0, 0, 0, 0, 231 0, 0, 0, 0, 0, 0, 0, 0, 232 0, 0, 0, 0, 0, 0, 0, 0, 233 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 236 0, 0, 0, 0, 0, 0, 0, 0 237}; 238 239/* forward */ 240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 241static PyObject* get_latin1_char(unsigned char ch); 242static int unicode_modifiable(PyObject *unicode); 243 244 245static PyObject * 246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size); 247static PyObject * 248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 249static PyObject * 250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 251 252static PyObject * 253unicode_encode_call_errorhandler(const char *errors, 254 PyObject **errorHandler,const char *encoding, const char *reason, 255 PyObject *unicode, PyObject **exceptionObject, 256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 257 258static void 259raise_encode_exception(PyObject **exceptionObject, 260 const char *encoding, 261 PyObject *unicode, 262 
Py_ssize_t startpos, Py_ssize_t endpos, 263 const char *reason); 264 265/* Same for linebreaks */ 266static unsigned char ascii_linebreak[] = { 267 0, 0, 0, 0, 0, 0, 0, 0, 268/* 0x000A, * LINE FEED */ 269/* 0x000B, * LINE TABULATION */ 270/* 0x000C, * FORM FEED */ 271/* 0x000D, * CARRIAGE RETURN */ 272 0, 0, 1, 1, 1, 1, 0, 0, 273 0, 0, 0, 0, 0, 0, 0, 0, 274/* 0x001C, * FILE SEPARATOR */ 275/* 0x001D, * GROUP SEPARATOR */ 276/* 0x001E, * RECORD SEPARATOR */ 277 0, 0, 0, 0, 1, 1, 1, 0, 278 0, 0, 0, 0, 0, 0, 0, 0, 279 0, 0, 0, 0, 0, 0, 0, 0, 280 0, 0, 0, 0, 0, 0, 0, 0, 281 0, 0, 0, 0, 0, 0, 0, 0, 282 283 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0, 285 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 287 0, 0, 0, 0, 0, 0, 0, 0, 288 0, 0, 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0, 0, 0, 290 0, 0, 0, 0, 0, 0, 0, 0 291}; 292 293/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 294 This function is kept for backward compatibility with the old API. */ 295Py_UNICODE 296PyUnicode_GetMax(void) 297{ 298#ifdef Py_UNICODE_WIDE 299 return 0x10FFFF; 300#else 301 /* This is actually an illegal character, so it should 302 not be passed to unichr. 
*/ 303 return 0xFFFF; 304#endif 305} 306 307#ifdef Py_DEBUG 308int 309_PyUnicode_CheckConsistency(PyObject *op, int check_content) 310{ 311 PyASCIIObject *ascii; 312 unsigned int kind; 313 314 assert(PyUnicode_Check(op)); 315 316 ascii = (PyASCIIObject *)op; 317 kind = ascii->state.kind; 318 319 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 320 assert(kind == PyUnicode_1BYTE_KIND); 321 assert(ascii->state.ready == 1); 322 } 323 else { 324 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 325 void *data; 326 327 if (ascii->state.compact == 1) { 328 data = compact + 1; 329 assert(kind == PyUnicode_1BYTE_KIND 330 || kind == PyUnicode_2BYTE_KIND 331 || kind == PyUnicode_4BYTE_KIND); 332 assert(ascii->state.ascii == 0); 333 assert(ascii->state.ready == 1); 334 assert (compact->utf8 != data); 335 } 336 else { 337 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 338 339 data = unicode->data.any; 340 if (kind == PyUnicode_WCHAR_KIND) { 341 assert(ascii->length == 0); 342 assert(ascii->hash == -1); 343 assert(ascii->state.compact == 0); 344 assert(ascii->state.ascii == 0); 345 assert(ascii->state.ready == 0); 346 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 347 assert(ascii->wstr != NULL); 348 assert(data == NULL); 349 assert(compact->utf8 == NULL); 350 } 351 else { 352 assert(kind == PyUnicode_1BYTE_KIND 353 || kind == PyUnicode_2BYTE_KIND 354 || kind == PyUnicode_4BYTE_KIND); 355 assert(ascii->state.compact == 0); 356 assert(ascii->state.ready == 1); 357 assert(data != NULL); 358 if (ascii->state.ascii) { 359 assert (compact->utf8 == data); 360 assert (compact->utf8_length == ascii->length); 361 } 362 else 363 assert (compact->utf8 != data); 364 } 365 } 366 if (kind != PyUnicode_WCHAR_KIND) { 367 if ( 368#if SIZEOF_WCHAR_T == 2 369 kind == PyUnicode_2BYTE_KIND 370#else 371 kind == PyUnicode_4BYTE_KIND 372#endif 373 ) 374 { 375 assert(ascii->wstr == data); 376 assert(compact->wstr_length == ascii->length); 377 } else 378 
assert(ascii->wstr != data); 379 } 380 381 if (compact->utf8 == NULL) 382 assert(compact->utf8_length == 0); 383 if (ascii->wstr == NULL) 384 assert(compact->wstr_length == 0); 385 } 386 /* check that the best kind is used */ 387 if (check_content && kind != PyUnicode_WCHAR_KIND) 388 { 389 Py_ssize_t i; 390 Py_UCS4 maxchar = 0; 391 void *data; 392 Py_UCS4 ch; 393 394 data = PyUnicode_DATA(ascii); 395 for (i=0; i < ascii->length; i++) 396 { 397 ch = PyUnicode_READ(kind, data, i); 398 if (ch > maxchar) 399 maxchar = ch; 400 } 401 if (kind == PyUnicode_1BYTE_KIND) { 402 if (ascii->state.ascii == 0) { 403 assert(maxchar >= 128); 404 assert(maxchar <= 255); 405 } 406 else 407 assert(maxchar < 128); 408 } 409 else if (kind == PyUnicode_2BYTE_KIND) { 410 assert(maxchar >= 0x100); 411 assert(maxchar <= 0xFFFF); 412 } 413 else { 414 assert(maxchar >= 0x10000); 415 assert(maxchar <= MAX_UNICODE); 416 } 417 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 418 } 419 return 1; 420} 421#endif 422 423static PyObject* 424unicode_result_wchar(PyObject *unicode) 425{ 426#ifndef Py_DEBUG 427 Py_ssize_t len; 428 429 len = _PyUnicode_WSTR_LENGTH(unicode); 430 if (len == 0) { 431 Py_DECREF(unicode); 432 _Py_RETURN_UNICODE_EMPTY(); 433 } 434 435 if (len == 1) { 436 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 437 if ((Py_UCS4)ch < 256) { 438 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 439 Py_DECREF(unicode); 440 return latin1_char; 441 } 442 } 443 444 if (_PyUnicode_Ready(unicode) < 0) { 445 Py_DECREF(unicode); 446 return NULL; 447 } 448#else 449 assert(Py_REFCNT(unicode) == 1); 450 451 /* don't make the result ready in debug mode to ensure that the caller 452 makes the string ready before using it */ 453 assert(_PyUnicode_CheckConsistency(unicode, 1)); 454#endif 455 return unicode; 456} 457 458static PyObject* 459unicode_result_ready(PyObject *unicode) 460{ 461 Py_ssize_t length; 462 463 length = PyUnicode_GET_LENGTH(unicode); 464 if (length == 0) { 465 if (unicode 
!= unicode_empty) { 466 Py_DECREF(unicode); 467 _Py_RETURN_UNICODE_EMPTY(); 468 } 469 return unicode_empty; 470 } 471 472 if (length == 1) { 473 void *data = PyUnicode_DATA(unicode); 474 int kind = PyUnicode_KIND(unicode); 475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 476 if (ch < 256) { 477 PyObject *latin1_char = unicode_latin1[ch]; 478 if (latin1_char != NULL) { 479 if (unicode != latin1_char) { 480 Py_INCREF(latin1_char); 481 Py_DECREF(unicode); 482 } 483 return latin1_char; 484 } 485 else { 486 assert(_PyUnicode_CheckConsistency(unicode, 1)); 487 Py_INCREF(unicode); 488 unicode_latin1[ch] = unicode; 489 return unicode; 490 } 491 } 492 } 493 494 assert(_PyUnicode_CheckConsistency(unicode, 1)); 495 return unicode; 496} 497 498static PyObject* 499unicode_result(PyObject *unicode) 500{ 501 assert(_PyUnicode_CHECK(unicode)); 502 if (PyUnicode_IS_READY(unicode)) 503 return unicode_result_ready(unicode); 504 else 505 return unicode_result_wchar(unicode); 506} 507 508static PyObject* 509unicode_result_unchanged(PyObject *unicode) 510{ 511 if (PyUnicode_CheckExact(unicode)) { 512 if (PyUnicode_READY(unicode) == -1) 513 return NULL; 514 Py_INCREF(unicode); 515 return unicode; 516 } 517 else 518 /* Subtype -- return genuine unicode string with the same value. */ 519 return _PyUnicode_Copy(unicode); 520} 521 522#ifdef HAVE_MBCS 523static OSVERSIONINFOEX winver; 524#endif 525 526/* --- Bloom Filters ----------------------------------------------------- */ 527 528/* stuff to implement simple "bloom filters" for Unicode characters. 529 to keep things simple, we use a single bitmask, using the least 5 530 bits from each unicode characters as the bit index. 
*/ 531 532/* the linebreak mask is set up by Unicode_Init below */ 533 534#if LONG_BIT >= 128 535#define BLOOM_WIDTH 128 536#elif LONG_BIT >= 64 537#define BLOOM_WIDTH 64 538#elif LONG_BIT >= 32 539#define BLOOM_WIDTH 32 540#else 541#error "LONG_BIT is smaller than 32" 542#endif 543 544#define BLOOM_MASK unsigned long 545 546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 547 548#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 549 550#define BLOOM_LINEBREAK(ch) \ 551 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 553 554Py_LOCAL_INLINE(BLOOM_MASK) 555make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 556{ 557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 558 do { \ 559 TYPE *data = (TYPE *)PTR; \ 560 TYPE *end = data + LEN; \ 561 Py_UCS4 ch; \ 562 for (; data != end; data++) { \ 563 ch = *data; \ 564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 565 } \ 566 break; \ 567 } while (0) 568 569 /* calculate simple bloom-style bitmask for a given unicode string */ 570 571 BLOOM_MASK mask; 572 573 mask = 0; 574 switch (kind) { 575 case PyUnicode_1BYTE_KIND: 576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 577 break; 578 case PyUnicode_2BYTE_KIND: 579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 580 break; 581 case PyUnicode_4BYTE_KIND: 582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 583 break; 584 default: 585 assert(0); 586 } 587 return mask; 588 589#undef BLOOM_UPDATE 590} 591 592/* Compilation of templated routines */ 593 594#include "stringlib/asciilib.h" 595#include "stringlib/fastsearch.h" 596#include "stringlib/partition.h" 597#include "stringlib/split.h" 598#include "stringlib/count.h" 599#include "stringlib/find.h" 600#include "stringlib/find_max_char.h" 601#include "stringlib/localeutil.h" 602#include "stringlib/undef.h" 603 604#include "stringlib/ucs1lib.h" 605#include "stringlib/fastsearch.h" 606#include "stringlib/partition.h" 607#include "stringlib/split.h" 608#include "stringlib/count.h" 
609#include "stringlib/find.h" 610#include "stringlib/replace.h" 611#include "stringlib/find_max_char.h" 612#include "stringlib/localeutil.h" 613#include "stringlib/undef.h" 614 615#include "stringlib/ucs2lib.h" 616#include "stringlib/fastsearch.h" 617#include "stringlib/partition.h" 618#include "stringlib/split.h" 619#include "stringlib/count.h" 620#include "stringlib/find.h" 621#include "stringlib/replace.h" 622#include "stringlib/find_max_char.h" 623#include "stringlib/localeutil.h" 624#include "stringlib/undef.h" 625 626#include "stringlib/ucs4lib.h" 627#include "stringlib/fastsearch.h" 628#include "stringlib/partition.h" 629#include "stringlib/split.h" 630#include "stringlib/count.h" 631#include "stringlib/find.h" 632#include "stringlib/replace.h" 633#include "stringlib/find_max_char.h" 634#include "stringlib/localeutil.h" 635#include "stringlib/undef.h" 636 637#include "stringlib/unicodedefs.h" 638#include "stringlib/fastsearch.h" 639#include "stringlib/count.h" 640#include "stringlib/find.h" 641#include "stringlib/undef.h" 642 643/* --- Unicode Object ----------------------------------------------------- */ 644 645static PyObject * 646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 647 648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 649 Py_ssize_t size, Py_UCS4 ch, 650 int direction) 651{ 652 int mode = (direction == 1) ? 
FAST_SEARCH : FAST_RSEARCH; 653 654 switch (kind) { 655 case PyUnicode_1BYTE_KIND: 656 { 657 Py_UCS1 ch1 = (Py_UCS1) ch; 658 if (ch1 == ch) 659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 660 else 661 return -1; 662 } 663 case PyUnicode_2BYTE_KIND: 664 { 665 Py_UCS2 ch2 = (Py_UCS2) ch; 666 if (ch2 == ch) 667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 668 else 669 return -1; 670 } 671 case PyUnicode_4BYTE_KIND: 672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 673 default: 674 assert(0); 675 return -1; 676 } 677} 678 679#ifdef Py_DEBUG 680/* Fill the data of an Unicode string with invalid characters to detect bugs 681 earlier. 682 683 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 684 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 685 invalid character in Unicode 6.0. */ 686static void 687unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 688{ 689 int kind = PyUnicode_KIND(unicode); 690 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 691 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 692 if (length <= old_length) 693 return; 694 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 695} 696#endif 697 698static PyObject* 699resize_compact(PyObject *unicode, Py_ssize_t length) 700{ 701 Py_ssize_t char_size; 702 Py_ssize_t struct_size; 703 Py_ssize_t new_size; 704 int share_wstr; 705 PyObject *new_unicode; 706#ifdef Py_DEBUG 707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 708#endif 709 710 assert(unicode_modifiable(unicode)); 711 assert(PyUnicode_IS_READY(unicode)); 712 assert(PyUnicode_IS_COMPACT(unicode)); 713 714 char_size = PyUnicode_KIND(unicode); 715 if (PyUnicode_IS_ASCII(unicode)) 716 struct_size = sizeof(PyASCIIObject); 717 else 718 struct_size = sizeof(PyCompactUnicodeObject); 719 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 720 721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 722 
PyErr_NoMemory(); 723 return NULL; 724 } 725 new_size = (struct_size + (length + 1) * char_size); 726 727 _Py_DEC_REFTOTAL; 728 _Py_ForgetReference(unicode); 729 730 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 731 if (new_unicode == NULL) { 732 _Py_NewReference(unicode); 733 PyErr_NoMemory(); 734 return NULL; 735 } 736 unicode = new_unicode; 737 _Py_NewReference(unicode); 738 739 _PyUnicode_LENGTH(unicode) = length; 740 if (share_wstr) { 741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 742 if (!PyUnicode_IS_ASCII(unicode)) 743 _PyUnicode_WSTR_LENGTH(unicode) = length; 744 } 745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 746 PyObject_DEL(_PyUnicode_WSTR(unicode)); 747 _PyUnicode_WSTR(unicode) = NULL; 748 } 749#ifdef Py_DEBUG 750 unicode_fill_invalid(unicode, old_length); 751#endif 752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 753 length, 0); 754 assert(_PyUnicode_CheckConsistency(unicode, 0)); 755 return unicode; 756} 757 758static int 759resize_inplace(PyObject *unicode, Py_ssize_t length) 760{ 761 wchar_t *wstr; 762 Py_ssize_t new_size; 763 assert(!PyUnicode_IS_COMPACT(unicode)); 764 assert(Py_REFCNT(unicode) == 1); 765 766 if (PyUnicode_IS_READY(unicode)) { 767 Py_ssize_t char_size; 768 int share_wstr, share_utf8; 769 void *data; 770#ifdef Py_DEBUG 771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 772#endif 773 774 data = _PyUnicode_DATA_ANY(unicode); 775 char_size = PyUnicode_KIND(unicode); 776 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 778 779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 780 PyErr_NoMemory(); 781 return -1; 782 } 783 new_size = (length + 1) * char_size; 784 785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 786 { 787 PyObject_DEL(_PyUnicode_UTF8(unicode)); 788 _PyUnicode_UTF8(unicode) = NULL; 789 _PyUnicode_UTF8_LENGTH(unicode) = 0; 790 } 791 792 data = (PyObject *)PyObject_REALLOC(data, new_size); 793 if (data == 
NULL) { 794 PyErr_NoMemory(); 795 return -1; 796 } 797 _PyUnicode_DATA_ANY(unicode) = data; 798 if (share_wstr) { 799 _PyUnicode_WSTR(unicode) = data; 800 _PyUnicode_WSTR_LENGTH(unicode) = length; 801 } 802 if (share_utf8) { 803 _PyUnicode_UTF8(unicode) = data; 804 _PyUnicode_UTF8_LENGTH(unicode) = length; 805 } 806 _PyUnicode_LENGTH(unicode) = length; 807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 808#ifdef Py_DEBUG 809 unicode_fill_invalid(unicode, old_length); 810#endif 811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 812 assert(_PyUnicode_CheckConsistency(unicode, 0)); 813 return 0; 814 } 815 } 816 assert(_PyUnicode_WSTR(unicode) != NULL); 817 818 /* check for integer overflow */ 819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 820 PyErr_NoMemory(); 821 return -1; 822 } 823 new_size = sizeof(wchar_t) * (length + 1); 824 wstr = _PyUnicode_WSTR(unicode); 825 wstr = PyObject_REALLOC(wstr, new_size); 826 if (!wstr) { 827 PyErr_NoMemory(); 828 return -1; 829 } 830 _PyUnicode_WSTR(unicode) = wstr; 831 _PyUnicode_WSTR(unicode)[length] = 0; 832 _PyUnicode_WSTR_LENGTH(unicode) = length; 833 assert(_PyUnicode_CheckConsistency(unicode, 0)); 834 return 0; 835} 836 837static PyObject* 838resize_copy(PyObject *unicode, Py_ssize_t length) 839{ 840 Py_ssize_t copy_length; 841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 842 PyObject *copy; 843 844 if (PyUnicode_READY(unicode) == -1) 845 return NULL; 846 847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 848 if (copy == NULL) 849 return NULL; 850 851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 853 return copy; 854 } 855 else { 856 PyObject *w; 857 858 w = (PyObject*)_PyUnicode_New(length); 859 if (w == NULL) 860 return NULL; 861 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 862 copy_length = Py_MIN(copy_length, length); 863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 
864 copy_length * sizeof(wchar_t)); 865 return w; 866 } 867} 868 869/* We allocate one more byte to make sure the string is 870 Ux0000 terminated; some code (e.g. new_identifier) 871 relies on that. 872 873 XXX This allocator could further be enhanced by assuring that the 874 free list never reduces its size below 1. 875 876*/ 877 878static PyUnicodeObject * 879_PyUnicode_New(Py_ssize_t length) 880{ 881 PyUnicodeObject *unicode; 882 size_t new_size; 883 884 /* Optimization for empty strings */ 885 if (length == 0 && unicode_empty != NULL) { 886 Py_INCREF(unicode_empty); 887 return (PyUnicodeObject*)unicode_empty; 888 } 889 890 /* Ensure we won't overflow the size. */ 891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 892 return (PyUnicodeObject *)PyErr_NoMemory(); 893 } 894 if (length < 0) { 895 PyErr_SetString(PyExc_SystemError, 896 "Negative size passed to _PyUnicode_New"); 897 return NULL; 898 } 899 900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 901 if (unicode == NULL) 902 return NULL; 903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 904 905 _PyUnicode_WSTR_LENGTH(unicode) = length; 906 _PyUnicode_HASH(unicode) = -1; 907 _PyUnicode_STATE(unicode).interned = 0; 908 _PyUnicode_STATE(unicode).kind = 0; 909 _PyUnicode_STATE(unicode).compact = 0; 910 _PyUnicode_STATE(unicode).ready = 0; 911 _PyUnicode_STATE(unicode).ascii = 0; 912 _PyUnicode_DATA_ANY(unicode) = NULL; 913 _PyUnicode_LENGTH(unicode) = 0; 914 _PyUnicode_UTF8(unicode) = NULL; 915 _PyUnicode_UTF8_LENGTH(unicode) = 0; 916 917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 918 if (!_PyUnicode_WSTR(unicode)) { 919 Py_DECREF(unicode); 920 PyErr_NoMemory(); 921 return NULL; 922 } 923 924 /* Initialize the first element to guard against cases where 925 * the caller fails before initializing str -- unicode_resize() 926 * reads str[0], and the Keep-Alive optimization can keep memory 927 * allocated for str alive across a call to unicode_dealloc(unicode). 
928 * We don't want unicode_resize to read uninitialized memory in 929 * that case. 930 */ 931 _PyUnicode_WSTR(unicode)[0] = 0; 932 _PyUnicode_WSTR(unicode)[length] = 0; 933 934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 935 return unicode; 936} 937 938static const char* 939unicode_kind_name(PyObject *unicode) 940{ 941 /* don't check consistency: unicode_kind_name() is called from 942 _PyUnicode_Dump() */ 943 if (!PyUnicode_IS_COMPACT(unicode)) 944 { 945 if (!PyUnicode_IS_READY(unicode)) 946 return "wstr"; 947 switch (PyUnicode_KIND(unicode)) 948 { 949 case PyUnicode_1BYTE_KIND: 950 if (PyUnicode_IS_ASCII(unicode)) 951 return "legacy ascii"; 952 else 953 return "legacy latin1"; 954 case PyUnicode_2BYTE_KIND: 955 return "legacy UCS2"; 956 case PyUnicode_4BYTE_KIND: 957 return "legacy UCS4"; 958 default: 959 return "<legacy invalid kind>"; 960 } 961 } 962 assert(PyUnicode_IS_READY(unicode)); 963 switch (PyUnicode_KIND(unicode)) { 964 case PyUnicode_1BYTE_KIND: 965 if (PyUnicode_IS_ASCII(unicode)) 966 return "ascii"; 967 else 968 return "latin1"; 969 case PyUnicode_2BYTE_KIND: 970 return "UCS2"; 971 case PyUnicode_4BYTE_KIND: 972 return "UCS4"; 973 default: 974 return "<invalid compact kind>"; 975 } 976} 977 978#ifdef Py_DEBUG 979/* Functions wrapping macros for use in debugger */ 980char *_PyUnicode_utf8(void *unicode){ 981 return PyUnicode_UTF8(unicode); 982} 983 984void *_PyUnicode_compact_data(void *unicode) { 985 return _PyUnicode_COMPACT_DATA(unicode); 986} 987void *_PyUnicode_data(void *unicode){ 988 printf("obj %p\n", unicode); 989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 994 return PyUnicode_DATA(unicode); 995} 996 997void 
998_PyUnicode_Dump(PyObject *op) 999{ 1000 PyASCIIObject *ascii = (PyASCIIObject *)op; 1001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 1003 void *data; 1004 1005 if (ascii->state.compact) 1006 { 1007 if (ascii->state.ascii) 1008 data = (ascii + 1); 1009 else 1010 data = (compact + 1); 1011 } 1012 else 1013 data = unicode->data.any; 1014 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", 1015 unicode_kind_name(op), ascii->length); 1016 1017 if (ascii->wstr == data) 1018 printf("shared "); 1019 printf("wstr=%p", ascii->wstr); 1020 1021 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 1022 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); 1023 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 1024 printf("shared "); 1025 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", 1026 compact->utf8, compact->utf8_length); 1027 } 1028 printf(", data=%p\n", data); 1029} 1030#endif 1031 1032PyObject * 1033PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 1034{ 1035 PyObject *obj; 1036 PyCompactUnicodeObject *unicode; 1037 void *data; 1038 enum PyUnicode_Kind kind; 1039 int is_sharing, is_ascii; 1040 Py_ssize_t char_size; 1041 Py_ssize_t struct_size; 1042 1043 /* Optimization for empty strings */ 1044 if (size == 0 && unicode_empty != NULL) { 1045 Py_INCREF(unicode_empty); 1046 return unicode_empty; 1047 } 1048 1049 is_ascii = 0; 1050 is_sharing = 0; 1051 struct_size = sizeof(PyCompactUnicodeObject); 1052 if (maxchar < 128) { 1053 kind = PyUnicode_1BYTE_KIND; 1054 char_size = 1; 1055 is_ascii = 1; 1056 struct_size = sizeof(PyASCIIObject); 1057 } 1058 else if (maxchar < 256) { 1059 kind = PyUnicode_1BYTE_KIND; 1060 char_size = 1; 1061 } 1062 else if (maxchar < 65536) { 1063 kind = PyUnicode_2BYTE_KIND; 1064 char_size = 2; 1065 if (sizeof(wchar_t) == 2) 1066 is_sharing = 1; 1067 } 1068 else { 1069 if (maxchar > MAX_UNICODE) { 1070 PyErr_SetString(PyExc_SystemError, 1071 "invalid maximum 
character passed to PyUnicode_New"); 1072 return NULL; 1073 } 1074 kind = PyUnicode_4BYTE_KIND; 1075 char_size = 4; 1076 if (sizeof(wchar_t) == 4) 1077 is_sharing = 1; 1078 } 1079 1080 /* Ensure we won't overflow the size. */ 1081 if (size < 0) { 1082 PyErr_SetString(PyExc_SystemError, 1083 "Negative size passed to PyUnicode_New"); 1084 return NULL; 1085 } 1086 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1087 return PyErr_NoMemory(); 1088 1089 /* Duplicated allocation code from _PyObject_New() instead of a call to 1090 * PyObject_New() so we are able to allocate space for the object and 1091 * it's data buffer. 1092 */ 1093 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1094 if (obj == NULL) 1095 return PyErr_NoMemory(); 1096 obj = PyObject_INIT(obj, &PyUnicode_Type); 1097 if (obj == NULL) 1098 return NULL; 1099 1100 unicode = (PyCompactUnicodeObject *)obj; 1101 if (is_ascii) 1102 data = ((PyASCIIObject*)obj) + 1; 1103 else 1104 data = unicode + 1; 1105 _PyUnicode_LENGTH(unicode) = size; 1106 _PyUnicode_HASH(unicode) = -1; 1107 _PyUnicode_STATE(unicode).interned = 0; 1108 _PyUnicode_STATE(unicode).kind = kind; 1109 _PyUnicode_STATE(unicode).compact = 1; 1110 _PyUnicode_STATE(unicode).ready = 1; 1111 _PyUnicode_STATE(unicode).ascii = is_ascii; 1112 if (is_ascii) { 1113 ((char*)data)[size] = 0; 1114 _PyUnicode_WSTR(unicode) = NULL; 1115 } 1116 else if (kind == PyUnicode_1BYTE_KIND) { 1117 ((char*)data)[size] = 0; 1118 _PyUnicode_WSTR(unicode) = NULL; 1119 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1120 unicode->utf8 = NULL; 1121 unicode->utf8_length = 0; 1122 } 1123 else { 1124 unicode->utf8 = NULL; 1125 unicode->utf8_length = 0; 1126 if (kind == PyUnicode_2BYTE_KIND) 1127 ((Py_UCS2*)data)[size] = 0; 1128 else /* kind == PyUnicode_4BYTE_KIND */ 1129 ((Py_UCS4*)data)[size] = 0; 1130 if (is_sharing) { 1131 _PyUnicode_WSTR_LENGTH(unicode) = size; 1132 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1133 } 1134 else { 1135 
_PyUnicode_WSTR_LENGTH(unicode) = 0; 1136 _PyUnicode_WSTR(unicode) = NULL; 1137 } 1138 } 1139#ifdef Py_DEBUG 1140 unicode_fill_invalid((PyObject*)unicode, 0); 1141#endif 1142 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1143 return obj; 1144} 1145 1146#if SIZEOF_WCHAR_T == 2 1147/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1148 will decode surrogate pairs, the other conversions are implemented as macros 1149 for efficiency. 1150 1151 This function assumes that unicode can hold one more code point than wstr 1152 characters for a terminating null character. */ 1153static void 1154unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1155 PyObject *unicode) 1156{ 1157 const wchar_t *iter; 1158 Py_UCS4 *ucs4_out; 1159 1160 assert(unicode != NULL); 1161 assert(_PyUnicode_CHECK(unicode)); 1162 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1163 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1164 1165 for (iter = begin; iter < end; ) { 1166 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1167 _PyUnicode_GET_LENGTH(unicode))); 1168 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1169 && (iter+1) < end 1170 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1171 { 1172 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1173 iter += 2; 1174 } 1175 else { 1176 *ucs4_out++ = *iter; 1177 iter++; 1178 } 1179 } 1180 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1181 _PyUnicode_GET_LENGTH(unicode))); 1182 1183} 1184#endif 1185 1186static int 1187unicode_check_modifiable(PyObject *unicode) 1188{ 1189 if (!unicode_modifiable(unicode)) { 1190 PyErr_SetString(PyExc_SystemError, 1191 "Cannot modify a string currently used"); 1192 return -1; 1193 } 1194 return 0; 1195} 1196 1197static int 1198_copy_characters(PyObject *to, Py_ssize_t to_start, 1199 PyObject *from, Py_ssize_t from_start, 1200 Py_ssize_t how_many, int check_maxchar) 1201{ 1202 unsigned int from_kind, to_kind; 1203 void *from_data, *to_data; 1204 1205 
assert(0 <= how_many); 1206 assert(0 <= from_start); 1207 assert(0 <= to_start); 1208 assert(PyUnicode_Check(from)); 1209 assert(PyUnicode_IS_READY(from)); 1210 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1211 1212 assert(PyUnicode_Check(to)); 1213 assert(PyUnicode_IS_READY(to)); 1214 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1215 1216 if (how_many == 0) 1217 return 0; 1218 1219 from_kind = PyUnicode_KIND(from); 1220 from_data = PyUnicode_DATA(from); 1221 to_kind = PyUnicode_KIND(to); 1222 to_data = PyUnicode_DATA(to); 1223 1224#ifdef Py_DEBUG 1225 if (!check_maxchar 1226 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1227 { 1228 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1229 Py_UCS4 ch; 1230 Py_ssize_t i; 1231 for (i=0; i < how_many; i++) { 1232 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1233 assert(ch <= to_maxchar); 1234 } 1235 } 1236#endif 1237 1238 if (from_kind == to_kind) { 1239 if (check_maxchar 1240 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1241 { 1242 /* Writing Latin-1 characters into an ASCII string requires to 1243 check that all written characters are pure ASCII */ 1244 Py_UCS4 max_char; 1245 max_char = ucs1lib_find_max_char(from_data, 1246 (Py_UCS1*)from_data + how_many); 1247 if (max_char >= 128) 1248 return -1; 1249 } 1250 Py_MEMCPY((char*)to_data + to_kind * to_start, 1251 (char*)from_data + from_kind * from_start, 1252 to_kind * how_many); 1253 } 1254 else if (from_kind == PyUnicode_1BYTE_KIND 1255 && to_kind == PyUnicode_2BYTE_KIND) 1256 { 1257 _PyUnicode_CONVERT_BYTES( 1258 Py_UCS1, Py_UCS2, 1259 PyUnicode_1BYTE_DATA(from) + from_start, 1260 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1261 PyUnicode_2BYTE_DATA(to) + to_start 1262 ); 1263 } 1264 else if (from_kind == PyUnicode_1BYTE_KIND 1265 && to_kind == PyUnicode_4BYTE_KIND) 1266 { 1267 _PyUnicode_CONVERT_BYTES( 1268 Py_UCS1, Py_UCS4, 1269 PyUnicode_1BYTE_DATA(from) + from_start, 1270 
PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1271 PyUnicode_4BYTE_DATA(to) + to_start 1272 ); 1273 } 1274 else if (from_kind == PyUnicode_2BYTE_KIND 1275 && to_kind == PyUnicode_4BYTE_KIND) 1276 { 1277 _PyUnicode_CONVERT_BYTES( 1278 Py_UCS2, Py_UCS4, 1279 PyUnicode_2BYTE_DATA(from) + from_start, 1280 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1281 PyUnicode_4BYTE_DATA(to) + to_start 1282 ); 1283 } 1284 else { 1285 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1286 1287 if (!check_maxchar) { 1288 if (from_kind == PyUnicode_2BYTE_KIND 1289 && to_kind == PyUnicode_1BYTE_KIND) 1290 { 1291 _PyUnicode_CONVERT_BYTES( 1292 Py_UCS2, Py_UCS1, 1293 PyUnicode_2BYTE_DATA(from) + from_start, 1294 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1295 PyUnicode_1BYTE_DATA(to) + to_start 1296 ); 1297 } 1298 else if (from_kind == PyUnicode_4BYTE_KIND 1299 && to_kind == PyUnicode_1BYTE_KIND) 1300 { 1301 _PyUnicode_CONVERT_BYTES( 1302 Py_UCS4, Py_UCS1, 1303 PyUnicode_4BYTE_DATA(from) + from_start, 1304 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1305 PyUnicode_1BYTE_DATA(to) + to_start 1306 ); 1307 } 1308 else if (from_kind == PyUnicode_4BYTE_KIND 1309 && to_kind == PyUnicode_2BYTE_KIND) 1310 { 1311 _PyUnicode_CONVERT_BYTES( 1312 Py_UCS4, Py_UCS2, 1313 PyUnicode_4BYTE_DATA(from) + from_start, 1314 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1315 PyUnicode_2BYTE_DATA(to) + to_start 1316 ); 1317 } 1318 else { 1319 assert(0); 1320 return -1; 1321 } 1322 } 1323 else { 1324 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1325 Py_UCS4 ch; 1326 Py_ssize_t i; 1327 1328 for (i=0; i < how_many; i++) { 1329 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1330 if (ch > to_maxchar) 1331 return -1; 1332 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1333 } 1334 } 1335 } 1336 return 0; 1337} 1338 1339void 1340_PyUnicode_FastCopyCharacters( 1341 PyObject *to, Py_ssize_t to_start, 1342 PyObject *from, Py_ssize_t 
from_start, Py_ssize_t how_many) 1343{ 1344 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1345} 1346 1347Py_ssize_t 1348PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1349 PyObject *from, Py_ssize_t from_start, 1350 Py_ssize_t how_many) 1351{ 1352 int err; 1353 1354 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1355 PyErr_BadInternalCall(); 1356 return -1; 1357 } 1358 1359 if (PyUnicode_READY(from) == -1) 1360 return -1; 1361 if (PyUnicode_READY(to) == -1) 1362 return -1; 1363 1364 if (from_start < 0) { 1365 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1366 return -1; 1367 } 1368 if (to_start < 0) { 1369 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1370 return -1; 1371 } 1372 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1373 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1374 PyErr_Format(PyExc_SystemError, 1375 "Cannot write %zi characters at %zi " 1376 "in a string of %zi characters", 1377 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1378 return -1; 1379 } 1380 1381 if (how_many == 0) 1382 return 0; 1383 1384 if (unicode_check_modifiable(to)) 1385 return -1; 1386 1387 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1388 if (err) { 1389 PyErr_Format(PyExc_SystemError, 1390 "Cannot copy %s characters " 1391 "into a string of %s characters", 1392 unicode_kind_name(from), 1393 unicode_kind_name(to)); 1394 return -1; 1395 } 1396 return how_many; 1397} 1398 1399/* Find the maximum code point and count the number of surrogate pairs so a 1400 correct string length can be computed before converting a string to UCS4. 1401 This function counts single surrogates as a character and not as a pair. 1402 1403 Return 0 on success, or -1 on error. 
*/ 1404static int 1405find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1406 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1407{ 1408 const wchar_t *iter; 1409 Py_UCS4 ch; 1410 1411 assert(num_surrogates != NULL && maxchar != NULL); 1412 *num_surrogates = 0; 1413 *maxchar = 0; 1414 1415 for (iter = begin; iter < end; ) { 1416#if SIZEOF_WCHAR_T == 2 1417 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1418 && (iter+1) < end 1419 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1420 { 1421 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1422 ++(*num_surrogates); 1423 iter += 2; 1424 } 1425 else 1426#endif 1427 { 1428 ch = *iter; 1429 iter++; 1430 } 1431 if (ch > *maxchar) { 1432 *maxchar = ch; 1433 if (*maxchar > MAX_UNICODE) { 1434 PyErr_Format(PyExc_ValueError, 1435 "character U+%x is not in range [U+0000; U+10ffff]", 1436 ch); 1437 return -1; 1438 } 1439 } 1440 } 1441 return 0; 1442} 1443 1444int 1445_PyUnicode_Ready(PyObject *unicode) 1446{ 1447 wchar_t *end; 1448 Py_UCS4 maxchar = 0; 1449 Py_ssize_t num_surrogates; 1450#if SIZEOF_WCHAR_T == 2 1451 Py_ssize_t length_wo_surrogates; 1452#endif 1453 1454 /* _PyUnicode_Ready() is only intended for old-style API usage where 1455 strings were created using _PyObject_New() and where no canonical 1456 representation (the str field) has been set yet aka strings 1457 which are not yet ready. 
*/ 1458 assert(_PyUnicode_CHECK(unicode)); 1459 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1460 assert(_PyUnicode_WSTR(unicode) != NULL); 1461 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1462 assert(_PyUnicode_UTF8(unicode) == NULL); 1463 /* Actually, it should neither be interned nor be anything else: */ 1464 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1465 1466 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1467 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1468 &maxchar, &num_surrogates) == -1) 1469 return -1; 1470 1471 if (maxchar < 256) { 1472 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1473 if (!_PyUnicode_DATA_ANY(unicode)) { 1474 PyErr_NoMemory(); 1475 return -1; 1476 } 1477 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1478 _PyUnicode_WSTR(unicode), end, 1479 PyUnicode_1BYTE_DATA(unicode)); 1480 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1481 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1482 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1483 if (maxchar < 128) { 1484 _PyUnicode_STATE(unicode).ascii = 1; 1485 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1486 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1487 } 1488 else { 1489 _PyUnicode_STATE(unicode).ascii = 0; 1490 _PyUnicode_UTF8(unicode) = NULL; 1491 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1492 } 1493 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1494 _PyUnicode_WSTR(unicode) = NULL; 1495 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1496 } 1497 /* In this case we might have to convert down from 4-byte native 1498 wchar_t to 2-byte unicode. */ 1499 else if (maxchar < 65536) { 1500 assert(num_surrogates == 0 && 1501 "FindMaxCharAndNumSurrogatePairs() messed up"); 1502 1503#if SIZEOF_WCHAR_T == 2 1504 /* We can share representations and are done. 
*/ 1505 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1506 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1507 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1508 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1509 _PyUnicode_UTF8(unicode) = NULL; 1510 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1511#else 1512 /* sizeof(wchar_t) == 4 */ 1513 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1514 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1515 if (!_PyUnicode_DATA_ANY(unicode)) { 1516 PyErr_NoMemory(); 1517 return -1; 1518 } 1519 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1520 _PyUnicode_WSTR(unicode), end, 1521 PyUnicode_2BYTE_DATA(unicode)); 1522 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1523 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1524 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1525 _PyUnicode_UTF8(unicode) = NULL; 1526 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1527 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1528 _PyUnicode_WSTR(unicode) = NULL; 1529 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1530#endif 1531 } 1532 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1533 else { 1534#if SIZEOF_WCHAR_T == 2 1535 /* in case the native representation is 2-bytes, we need to allocate a 1536 new normalized 4-byte version. 
*/ 1537 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1538 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) { 1539 PyErr_NoMemory(); 1540 return -1; 1541 } 1542 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1543 if (!_PyUnicode_DATA_ANY(unicode)) { 1544 PyErr_NoMemory(); 1545 return -1; 1546 } 1547 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1548 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1549 _PyUnicode_UTF8(unicode) = NULL; 1550 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1551 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1552 _PyUnicode_STATE(unicode).ready = 1; 1553 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1554 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1555 _PyUnicode_WSTR(unicode) = NULL; 1556 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1557#else 1558 assert(num_surrogates == 0); 1559 1560 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1561 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1562 _PyUnicode_UTF8(unicode) = NULL; 1563 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1564 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1565#endif 1566 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1567 } 1568 _PyUnicode_STATE(unicode).ready = 1; 1569 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1570 return 0; 1571} 1572 1573static void 1574unicode_dealloc(PyObject *unicode) 1575{ 1576 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1577 case SSTATE_NOT_INTERNED: 1578 break; 1579 1580 case SSTATE_INTERNED_MORTAL: 1581 /* revive dead object temporarily for DelItem */ 1582 Py_REFCNT(unicode) = 3; 1583 if (PyDict_DelItem(interned, unicode) != 0) 1584 Py_FatalError( 1585 "deletion of interned string failed"); 1586 break; 1587 1588 case SSTATE_INTERNED_IMMORTAL: 1589 Py_FatalError("Immortal interned string died."); 1590 1591 default: 1592 Py_FatalError("Inconsistent interned string state."); 1593 } 1594 1595 if 
(_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1596 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1597 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1598 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1599 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1600 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1601 1602 Py_TYPE(unicode)->tp_free(unicode); 1603} 1604 1605#ifdef Py_DEBUG 1606static int 1607unicode_is_singleton(PyObject *unicode) 1608{ 1609 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1610 if (unicode == unicode_empty) 1611 return 1; 1612 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1613 { 1614 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1615 if (ch < 256 && unicode_latin1[ch] == unicode) 1616 return 1; 1617 } 1618 return 0; 1619} 1620#endif 1621 1622static int 1623unicode_modifiable(PyObject *unicode) 1624{ 1625 assert(_PyUnicode_CHECK(unicode)); 1626 if (Py_REFCNT(unicode) != 1) 1627 return 0; 1628 if (_PyUnicode_HASH(unicode) != -1) 1629 return 0; 1630 if (PyUnicode_CHECK_INTERNED(unicode)) 1631 return 0; 1632 if (!PyUnicode_CheckExact(unicode)) 1633 return 0; 1634#ifdef Py_DEBUG 1635 /* singleton refcount is greater than 1 */ 1636 assert(!unicode_is_singleton(unicode)); 1637#endif 1638 return 1; 1639} 1640 1641static int 1642unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1643{ 1644 PyObject *unicode; 1645 Py_ssize_t old_length; 1646 1647 assert(p_unicode != NULL); 1648 unicode = *p_unicode; 1649 1650 assert(unicode != NULL); 1651 assert(PyUnicode_Check(unicode)); 1652 assert(0 <= length); 1653 1654 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1655 old_length = PyUnicode_WSTR_LENGTH(unicode); 1656 else 1657 old_length = PyUnicode_GET_LENGTH(unicode); 1658 if (old_length == length) 1659 return 0; 1660 1661 if (length == 0) { 1662 _Py_INCREF_UNICODE_EMPTY(); 1663 if (!unicode_empty) 1664 return -1; 1665 Py_DECREF(*p_unicode); 1666 *p_unicode = unicode_empty; 1667 return 0; 1668 } 1669 1670 if 
(!unicode_modifiable(unicode)) { 1671 PyObject *copy = resize_copy(unicode, length); 1672 if (copy == NULL) 1673 return -1; 1674 Py_DECREF(*p_unicode); 1675 *p_unicode = copy; 1676 return 0; 1677 } 1678 1679 if (PyUnicode_IS_COMPACT(unicode)) { 1680 PyObject *new_unicode = resize_compact(unicode, length); 1681 if (new_unicode == NULL) 1682 return -1; 1683 *p_unicode = new_unicode; 1684 return 0; 1685 } 1686 return resize_inplace(unicode, length); 1687} 1688 1689int 1690PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1691{ 1692 PyObject *unicode; 1693 if (p_unicode == NULL) { 1694 PyErr_BadInternalCall(); 1695 return -1; 1696 } 1697 unicode = *p_unicode; 1698 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1699 { 1700 PyErr_BadInternalCall(); 1701 return -1; 1702 } 1703 return unicode_resize(p_unicode, length); 1704} 1705 1706/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1707 1708 WARNING: The function doesn't copy the terminating null character and 1709 doesn't check the maximum character (may write a latin1 character in an 1710 ASCII string). 
*/ 1711static void 1712unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1713 const char *str, Py_ssize_t len) 1714{ 1715 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1716 void *data = PyUnicode_DATA(unicode); 1717 const char *end = str + len; 1718 1719 switch (kind) { 1720 case PyUnicode_1BYTE_KIND: { 1721 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1722#ifdef Py_DEBUG 1723 if (PyUnicode_IS_ASCII(unicode)) { 1724 Py_UCS4 maxchar = ucs1lib_find_max_char( 1725 (const Py_UCS1*)str, 1726 (const Py_UCS1*)str + len); 1727 assert(maxchar < 128); 1728 } 1729#endif 1730 memcpy((char *) data + index, str, len); 1731 break; 1732 } 1733 case PyUnicode_2BYTE_KIND: { 1734 Py_UCS2 *start = (Py_UCS2 *)data + index; 1735 Py_UCS2 *ucs2 = start; 1736 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1737 1738 for (; str < end; ++ucs2, ++str) 1739 *ucs2 = (Py_UCS2)*str; 1740 1741 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1742 break; 1743 } 1744 default: { 1745 Py_UCS4 *start = (Py_UCS4 *)data + index; 1746 Py_UCS4 *ucs4 = start; 1747 assert(kind == PyUnicode_4BYTE_KIND); 1748 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1749 1750 for (; str < end; ++ucs4, ++str) 1751 *ucs4 = (Py_UCS4)*str; 1752 1753 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1754 } 1755 } 1756} 1757 1758static PyObject* 1759get_latin1_char(unsigned char ch) 1760{ 1761 PyObject *unicode = unicode_latin1[ch]; 1762 if (!unicode) { 1763 unicode = PyUnicode_New(1, ch); 1764 if (!unicode) 1765 return NULL; 1766 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1767 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1768 unicode_latin1[ch] = unicode; 1769 } 1770 Py_INCREF(unicode); 1771 return unicode; 1772} 1773 1774static PyObject* 1775unicode_char(Py_UCS4 ch) 1776{ 1777 PyObject *unicode; 1778 1779 assert(ch <= MAX_UNICODE); 1780 1781 if (ch < 256) 1782 return get_latin1_char(ch); 1783 1784 unicode = PyUnicode_New(1, ch); 1785 if (unicode == NULL) 1786 return NULL; 1787 switch 
(PyUnicode_KIND(unicode)) { 1788 case PyUnicode_1BYTE_KIND: 1789 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch; 1790 break; 1791 case PyUnicode_2BYTE_KIND: 1792 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; 1793 break; 1794 default: 1795 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1796 PyUnicode_4BYTE_DATA(unicode)[0] = ch; 1797 } 1798 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1799 return unicode; 1800} 1801 1802PyObject * 1803PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1804{ 1805 PyObject *unicode; 1806 Py_UCS4 maxchar = 0; 1807 Py_ssize_t num_surrogates; 1808 1809 if (u == NULL) 1810 return (PyObject*)_PyUnicode_New(size); 1811 1812 /* If the Unicode data is known at construction time, we can apply 1813 some optimizations which share commonly used objects. */ 1814 1815 /* Optimization for empty strings */ 1816 if (size == 0) 1817 _Py_RETURN_UNICODE_EMPTY(); 1818 1819 /* Single character Unicode objects in the Latin-1 range are 1820 shared when using this constructor */ 1821 if (size == 1 && (Py_UCS4)*u < 256) 1822 return get_latin1_char((unsigned char)*u); 1823 1824 /* If not empty and not single character, copy the Unicode data 1825 into the new object */ 1826 if (find_maxchar_surrogates(u, u + size, 1827 &maxchar, &num_surrogates) == -1) 1828 return NULL; 1829 1830 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1831 if (!unicode) 1832 return NULL; 1833 1834 switch (PyUnicode_KIND(unicode)) { 1835 case PyUnicode_1BYTE_KIND: 1836 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1837 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1838 break; 1839 case PyUnicode_2BYTE_KIND: 1840#if Py_UNICODE_SIZE == 2 1841 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1842#else 1843 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1844 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1845#endif 1846 break; 1847 case PyUnicode_4BYTE_KIND: 1848#if SIZEOF_WCHAR_T == 2 1849 /* This is the only case which has to process surrogates, 
thus 1850 a simple copy loop is not enough and we need a function. */ 1851 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1852#else 1853 assert(num_surrogates == 0); 1854 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1855#endif 1856 break; 1857 default: 1858 assert(0 && "Impossible state"); 1859 } 1860 1861 return unicode_result(unicode); 1862} 1863 1864PyObject * 1865PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1866{ 1867 if (size < 0) { 1868 PyErr_SetString(PyExc_SystemError, 1869 "Negative size passed to PyUnicode_FromStringAndSize"); 1870 return NULL; 1871 } 1872 if (u != NULL) 1873 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1874 else 1875 return (PyObject *)_PyUnicode_New(size); 1876} 1877 1878PyObject * 1879PyUnicode_FromString(const char *u) 1880{ 1881 size_t size = strlen(u); 1882 if (size > PY_SSIZE_T_MAX) { 1883 PyErr_SetString(PyExc_OverflowError, "input too long"); 1884 return NULL; 1885 } 1886 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1887} 1888 1889PyObject * 1890_PyUnicode_FromId(_Py_Identifier *id) 1891{ 1892 if (!id->object) { 1893 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1894 strlen(id->string), 1895 NULL, NULL); 1896 if (!id->object) 1897 return NULL; 1898 PyUnicode_InternInPlace(&id->object); 1899 assert(!id->next); 1900 id->next = static_strings; 1901 static_strings = id; 1902 } 1903 return id->object; 1904} 1905 1906void 1907_PyUnicode_ClearStaticStrings() 1908{ 1909 _Py_Identifier *tmp, *s = static_strings; 1910 while (s) { 1911 Py_CLEAR(s->object); 1912 tmp = s->next; 1913 s->next = NULL; 1914 s = tmp; 1915 } 1916 static_strings = NULL; 1917} 1918 1919/* Internal function, doesn't check maximum character */ 1920 1921PyObject* 1922_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1923{ 1924 const unsigned char *s = (const unsigned char *)buffer; 1925 PyObject *unicode; 1926 if (size == 1) { 1927#ifdef Py_DEBUG 1928 assert((unsigned char)s[0] < 128); 
1929#endif 1930 return get_latin1_char(s[0]); 1931 } 1932 unicode = PyUnicode_New(size, 127); 1933 if (!unicode) 1934 return NULL; 1935 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1936 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1937 return unicode; 1938} 1939 1940static Py_UCS4 1941kind_maxchar_limit(unsigned int kind) 1942{ 1943 switch (kind) { 1944 case PyUnicode_1BYTE_KIND: 1945 return 0x80; 1946 case PyUnicode_2BYTE_KIND: 1947 return 0x100; 1948 case PyUnicode_4BYTE_KIND: 1949 return 0x10000; 1950 default: 1951 assert(0 && "invalid kind"); 1952 return MAX_UNICODE; 1953 } 1954} 1955 1956Py_LOCAL_INLINE(Py_UCS4) 1957align_maxchar(Py_UCS4 maxchar) 1958{ 1959 if (maxchar <= 127) 1960 return 127; 1961 else if (maxchar <= 255) 1962 return 255; 1963 else if (maxchar <= 65535) 1964 return 65535; 1965 else 1966 return MAX_UNICODE; 1967} 1968 1969static PyObject* 1970_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) 1971{ 1972 PyObject *res; 1973 unsigned char max_char; 1974 1975 if (size == 0) 1976 _Py_RETURN_UNICODE_EMPTY(); 1977 assert(size > 0); 1978 if (size == 1) 1979 return get_latin1_char(u[0]); 1980 1981 max_char = ucs1lib_find_max_char(u, u + size); 1982 res = PyUnicode_New(size, max_char); 1983 if (!res) 1984 return NULL; 1985 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1986 assert(_PyUnicode_CheckConsistency(res, 1)); 1987 return res; 1988} 1989 1990static PyObject* 1991_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1992{ 1993 PyObject *res; 1994 Py_UCS2 max_char; 1995 1996 if (size == 0) 1997 _Py_RETURN_UNICODE_EMPTY(); 1998 assert(size > 0); 1999 if (size == 1) 2000 return unicode_char(u[0]); 2001 2002 max_char = ucs2lib_find_max_char(u, u + size); 2003 res = PyUnicode_New(size, max_char); 2004 if (!res) 2005 return NULL; 2006 if (max_char >= 256) 2007 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 2008 else { 2009 _PyUnicode_CONVERT_BYTES( 2010 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 2011 } 2012 
assert(_PyUnicode_CheckConsistency(res, 1)); 2013 return res; 2014} 2015 2016static PyObject* 2017_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 2018{ 2019 PyObject *res; 2020 Py_UCS4 max_char; 2021 2022 if (size == 0) 2023 _Py_RETURN_UNICODE_EMPTY(); 2024 assert(size > 0); 2025 if (size == 1) 2026 return unicode_char(u[0]); 2027 2028 max_char = ucs4lib_find_max_char(u, u + size); 2029 res = PyUnicode_New(size, max_char); 2030 if (!res) 2031 return NULL; 2032 if (max_char < 256) 2033 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2034 PyUnicode_1BYTE_DATA(res)); 2035 else if (max_char < 0x10000) 2036 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2037 PyUnicode_2BYTE_DATA(res)); 2038 else 2039 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2040 assert(_PyUnicode_CheckConsistency(res, 1)); 2041 return res; 2042} 2043 2044PyObject* 2045PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2046{ 2047 if (size < 0) { 2048 PyErr_SetString(PyExc_ValueError, "size must be positive"); 2049 return NULL; 2050 } 2051 switch (kind) { 2052 case PyUnicode_1BYTE_KIND: 2053 return _PyUnicode_FromUCS1(buffer, size); 2054 case PyUnicode_2BYTE_KIND: 2055 return _PyUnicode_FromUCS2(buffer, size); 2056 case PyUnicode_4BYTE_KIND: 2057 return _PyUnicode_FromUCS4(buffer, size); 2058 default: 2059 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2060 return NULL; 2061 } 2062} 2063 2064Py_UCS4 2065_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2066{ 2067 enum PyUnicode_Kind kind; 2068 void *startptr, *endptr; 2069 2070 assert(PyUnicode_IS_READY(unicode)); 2071 assert(0 <= start); 2072 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2073 assert(start <= end); 2074 2075 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2076 return PyUnicode_MAX_CHAR_VALUE(unicode); 2077 2078 if (start == end) 2079 return 127; 2080 2081 if (PyUnicode_IS_ASCII(unicode)) 2082 return 127; 2083 2084 kind = 
PyUnicode_KIND(unicode); 2085 startptr = PyUnicode_DATA(unicode); 2086 endptr = (char *)startptr + end * kind; 2087 startptr = (char *)startptr + start * kind; 2088 switch(kind) { 2089 case PyUnicode_1BYTE_KIND: 2090 return ucs1lib_find_max_char(startptr, endptr); 2091 case PyUnicode_2BYTE_KIND: 2092 return ucs2lib_find_max_char(startptr, endptr); 2093 case PyUnicode_4BYTE_KIND: 2094 return ucs4lib_find_max_char(startptr, endptr); 2095 default: 2096 assert(0); 2097 return 0; 2098 } 2099} 2100 2101/* Ensure that a string uses the most efficient storage, if it is not the 2102 case: create a new string with of the right kind. Write NULL into *p_unicode 2103 on error. */ 2104static void 2105unicode_adjust_maxchar(PyObject **p_unicode) 2106{ 2107 PyObject *unicode, *copy; 2108 Py_UCS4 max_char; 2109 Py_ssize_t len; 2110 unsigned int kind; 2111 2112 assert(p_unicode != NULL); 2113 unicode = *p_unicode; 2114 assert(PyUnicode_IS_READY(unicode)); 2115 if (PyUnicode_IS_ASCII(unicode)) 2116 return; 2117 2118 len = PyUnicode_GET_LENGTH(unicode); 2119 kind = PyUnicode_KIND(unicode); 2120 if (kind == PyUnicode_1BYTE_KIND) { 2121 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2122 max_char = ucs1lib_find_max_char(u, u + len); 2123 if (max_char >= 128) 2124 return; 2125 } 2126 else if (kind == PyUnicode_2BYTE_KIND) { 2127 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2128 max_char = ucs2lib_find_max_char(u, u + len); 2129 if (max_char >= 256) 2130 return; 2131 } 2132 else { 2133 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2134 assert(kind == PyUnicode_4BYTE_KIND); 2135 max_char = ucs4lib_find_max_char(u, u + len); 2136 if (max_char >= 0x10000) 2137 return; 2138 } 2139 copy = PyUnicode_New(len, max_char); 2140 if (copy != NULL) 2141 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2142 Py_DECREF(unicode); 2143 *p_unicode = copy; 2144} 2145 2146PyObject* 2147_PyUnicode_Copy(PyObject *unicode) 2148{ 2149 Py_ssize_t length; 2150 PyObject *copy; 2151 2152 if 
(!PyUnicode_Check(unicode)) { 2153 PyErr_BadInternalCall(); 2154 return NULL; 2155 } 2156 if (PyUnicode_READY(unicode) == -1) 2157 return NULL; 2158 2159 length = PyUnicode_GET_LENGTH(unicode); 2160 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2161 if (!copy) 2162 return NULL; 2163 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2164 2165 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2166 length * PyUnicode_KIND(unicode)); 2167 assert(_PyUnicode_CheckConsistency(copy, 1)); 2168 return copy; 2169} 2170 2171 2172/* Widen Unicode objects to larger buffers. Don't write terminating null 2173 character. Return NULL on error. */ 2174 2175void* 2176_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2177{ 2178 Py_ssize_t len; 2179 void *result; 2180 unsigned int skind; 2181 2182 if (PyUnicode_READY(s) == -1) 2183 return NULL; 2184 2185 len = PyUnicode_GET_LENGTH(s); 2186 skind = PyUnicode_KIND(s); 2187 if (skind >= kind) { 2188 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2189 return NULL; 2190 } 2191 switch (kind) { 2192 case PyUnicode_2BYTE_KIND: 2193 result = PyMem_New(Py_UCS2, len); 2194 if (!result) 2195 return PyErr_NoMemory(); 2196 assert(skind == PyUnicode_1BYTE_KIND); 2197 _PyUnicode_CONVERT_BYTES( 2198 Py_UCS1, Py_UCS2, 2199 PyUnicode_1BYTE_DATA(s), 2200 PyUnicode_1BYTE_DATA(s) + len, 2201 result); 2202 return result; 2203 case PyUnicode_4BYTE_KIND: 2204 result = PyMem_New(Py_UCS4, len); 2205 if (!result) 2206 return PyErr_NoMemory(); 2207 if (skind == PyUnicode_2BYTE_KIND) { 2208 _PyUnicode_CONVERT_BYTES( 2209 Py_UCS2, Py_UCS4, 2210 PyUnicode_2BYTE_DATA(s), 2211 PyUnicode_2BYTE_DATA(s) + len, 2212 result); 2213 } 2214 else { 2215 assert(skind == PyUnicode_1BYTE_KIND); 2216 _PyUnicode_CONVERT_BYTES( 2217 Py_UCS1, Py_UCS4, 2218 PyUnicode_1BYTE_DATA(s), 2219 PyUnicode_1BYTE_DATA(s) + len, 2220 result); 2221 } 2222 return result; 2223 default: 2224 break; 2225 } 2226 PyErr_SetString(PyExc_SystemError, "invalid 
kind"); 2227 return NULL; 2228} 2229 2230static Py_UCS4* 2231as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2232 int copy_null) 2233{ 2234 int kind; 2235 void *data; 2236 Py_ssize_t len, targetlen; 2237 if (PyUnicode_READY(string) == -1) 2238 return NULL; 2239 kind = PyUnicode_KIND(string); 2240 data = PyUnicode_DATA(string); 2241 len = PyUnicode_GET_LENGTH(string); 2242 targetlen = len; 2243 if (copy_null) 2244 targetlen++; 2245 if (!target) { 2246 target = PyMem_New(Py_UCS4, targetlen); 2247 if (!target) { 2248 PyErr_NoMemory(); 2249 return NULL; 2250 } 2251 } 2252 else { 2253 if (targetsize < targetlen) { 2254 PyErr_Format(PyExc_SystemError, 2255 "string is longer than the buffer"); 2256 if (copy_null && 0 < targetsize) 2257 target[0] = 0; 2258 return NULL; 2259 } 2260 } 2261 if (kind == PyUnicode_1BYTE_KIND) { 2262 Py_UCS1 *start = (Py_UCS1 *) data; 2263 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2264 } 2265 else if (kind == PyUnicode_2BYTE_KIND) { 2266 Py_UCS2 *start = (Py_UCS2 *) data; 2267 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2268 } 2269 else { 2270 assert(kind == PyUnicode_4BYTE_KIND); 2271 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2272 } 2273 if (copy_null) 2274 target[len] = 0; 2275 return target; 2276} 2277 2278Py_UCS4* 2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2280 int copy_null) 2281{ 2282 if (target == NULL || targetsize < 0) { 2283 PyErr_BadInternalCall(); 2284 return NULL; 2285 } 2286 return as_ucs4(string, target, targetsize, copy_null); 2287} 2288 2289Py_UCS4* 2290PyUnicode_AsUCS4Copy(PyObject *string) 2291{ 2292 return as_ucs4(string, NULL, 0, 1); 2293} 2294 2295#ifdef HAVE_WCHAR_H 2296 2297PyObject * 2298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size) 2299{ 2300 if (w == NULL) { 2301 if (size == 0) 2302 _Py_RETURN_UNICODE_EMPTY(); 2303 PyErr_BadInternalCall(); 2304 return NULL; 2305 } 2306 2307 if (size == -1) { 
2308 size = wcslen(w); 2309 } 2310 2311 return PyUnicode_FromUnicode(w, size); 2312} 2313 2314#endif /* HAVE_WCHAR_H */ 2315 2316static void 2317makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2318 char c) 2319{ 2320 *fmt++ = '%'; 2321 if (longflag) 2322 *fmt++ = 'l'; 2323 else if (longlongflag) { 2324 /* longlongflag should only ever be nonzero on machines with 2325 HAVE_LONG_LONG defined */ 2326#ifdef HAVE_LONG_LONG 2327 char *f = PY_FORMAT_LONG_LONG; 2328 while (*f) 2329 *fmt++ = *f++; 2330#else 2331 /* we shouldn't ever get here */ 2332 assert(0); 2333 *fmt++ = 'l'; 2334#endif 2335 } 2336 else if (size_tflag) { 2337 char *f = PY_FORMAT_SIZE_T; 2338 while (*f) 2339 *fmt++ = *f++; 2340 } 2341 *fmt++ = c; 2342 *fmt = '\0'; 2343} 2344 2345/* maximum number of characters required for output of %lld or %p. 2346 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2347 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2348#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2349 2350static int 2351unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, 2352 Py_ssize_t width, Py_ssize_t precision) 2353{ 2354 Py_ssize_t length, fill, arglen; 2355 Py_UCS4 maxchar; 2356 2357 if (PyUnicode_READY(str) == -1) 2358 return -1; 2359 2360 length = PyUnicode_GET_LENGTH(str); 2361 if ((precision == -1 || precision >= length) 2362 && width <= length) 2363 return _PyUnicodeWriter_WriteStr(writer, str); 2364 2365 if (precision != -1) 2366 length = Py_MIN(precision, length); 2367 2368 arglen = Py_MAX(length, width); 2369 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 2370 maxchar = _PyUnicode_FindMaxChar(str, 0, length); 2371 else 2372 maxchar = writer->maxchar; 2373 2374 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) 2375 return -1; 2376 2377 if (width > length) { 2378 fill = width - length; 2379 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) 2380 return -1; 2381 writer->pos += fill; 
2382 } 2383 2384 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 2385 str, 0, length); 2386 writer->pos += length; 2387 return 0; 2388} 2389 2390static int 2391unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, 2392 Py_ssize_t width, Py_ssize_t precision) 2393{ 2394 /* UTF-8 */ 2395 Py_ssize_t length; 2396 PyObject *unicode; 2397 int res; 2398 2399 length = strlen(str); 2400 if (precision != -1) 2401 length = Py_MIN(length, precision); 2402 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); 2403 if (unicode == NULL) 2404 return -1; 2405 2406 res = unicode_fromformat_write_str(writer, unicode, width, -1); 2407 Py_DECREF(unicode); 2408 return res; 2409} 2410 2411static const char* 2412unicode_fromformat_arg(_PyUnicodeWriter *writer, 2413 const char *f, va_list *vargs) 2414{ 2415 const char *p; 2416 Py_ssize_t len; 2417 int zeropad; 2418 Py_ssize_t width; 2419 Py_ssize_t precision; 2420 int longflag; 2421 int longlongflag; 2422 int size_tflag; 2423 Py_ssize_t fill; 2424 2425 p = f; 2426 f++; 2427 zeropad = 0; 2428 if (*f == '0') { 2429 zeropad = 1; 2430 f++; 2431 } 2432 2433 /* parse the width.precision part, e.g. 
"%2.5s" => width=2, precision=5 */ 2434 width = -1; 2435 if (Py_ISDIGIT((unsigned)*f)) { 2436 width = *f - '0'; 2437 f++; 2438 while (Py_ISDIGIT((unsigned)*f)) { 2439 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2440 PyErr_SetString(PyExc_ValueError, 2441 "width too big"); 2442 return NULL; 2443 } 2444 width = (width * 10) + (*f - '0'); 2445 f++; 2446 } 2447 } 2448 precision = -1; 2449 if (*f == '.') { 2450 f++; 2451 if (Py_ISDIGIT((unsigned)*f)) { 2452 precision = (*f - '0'); 2453 f++; 2454 while (Py_ISDIGIT((unsigned)*f)) { 2455 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2456 PyErr_SetString(PyExc_ValueError, 2457 "precision too big"); 2458 return NULL; 2459 } 2460 precision = (precision * 10) + (*f - '0'); 2461 f++; 2462 } 2463 } 2464 if (*f == '%') { 2465 /* "%.3%s" => f points to "3" */ 2466 f--; 2467 } 2468 } 2469 if (*f == '\0') { 2470 /* bogus format "%.123" => go backward, f points to "3" */ 2471 f--; 2472 } 2473 2474 /* Handle %ld, %lu, %lld and %llu. */ 2475 longflag = 0; 2476 longlongflag = 0; 2477 size_tflag = 0; 2478 if (*f == 'l') { 2479 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2480 longflag = 1; 2481 ++f; 2482 } 2483#ifdef HAVE_LONG_LONG 2484 else if (f[1] == 'l' && 2485 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2486 longlongflag = 1; 2487 f += 2; 2488 } 2489#endif 2490 } 2491 /* handle the size_t flag. 
*/ 2492 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2493 size_tflag = 1; 2494 ++f; 2495 } 2496 2497 if (f[1] == '\0') 2498 writer->overallocate = 0; 2499 2500 switch (*f) { 2501 case 'c': 2502 { 2503 int ordinal = va_arg(*vargs, int); 2504 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2505 PyErr_SetString(PyExc_OverflowError, 2506 "character argument not in range(0x110000)"); 2507 return NULL; 2508 } 2509 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) 2510 return NULL; 2511 break; 2512 } 2513 2514 case 'i': 2515 case 'd': 2516 case 'u': 2517 case 'x': 2518 { 2519 /* used by sprintf */ 2520 char fmt[10]; /* should be enough for "%0lld\0" */ 2521 char buffer[MAX_LONG_LONG_CHARS]; 2522 Py_ssize_t arglen; 2523 2524 if (*f == 'u') { 2525 makefmt(fmt, longflag, longlongflag, size_tflag, *f); 2526 2527 if (longflag) 2528 len = sprintf(buffer, fmt, 2529 va_arg(*vargs, unsigned long)); 2530#ifdef HAVE_LONG_LONG 2531 else if (longlongflag) 2532 len = sprintf(buffer, fmt, 2533 va_arg(*vargs, unsigned PY_LONG_LONG)); 2534#endif 2535 else if (size_tflag) 2536 len = sprintf(buffer, fmt, 2537 va_arg(*vargs, size_t)); 2538 else 2539 len = sprintf(buffer, fmt, 2540 va_arg(*vargs, unsigned int)); 2541 } 2542 else if (*f == 'x') { 2543 makefmt(fmt, 0, 0, 0, 'x'); 2544 len = sprintf(buffer, fmt, va_arg(*vargs, int)); 2545 } 2546 else { 2547 makefmt(fmt, longflag, longlongflag, size_tflag, *f); 2548 2549 if (longflag) 2550 len = sprintf(buffer, fmt, 2551 va_arg(*vargs, long)); 2552#ifdef HAVE_LONG_LONG 2553 else if (longlongflag) 2554 len = sprintf(buffer, fmt, 2555 va_arg(*vargs, PY_LONG_LONG)); 2556#endif 2557 else if (size_tflag) 2558 len = sprintf(buffer, fmt, 2559 va_arg(*vargs, Py_ssize_t)); 2560 else 2561 len = sprintf(buffer, fmt, 2562 va_arg(*vargs, int)); 2563 } 2564 assert(len >= 0); 2565 2566 if (precision < len) 2567 precision = len; 2568 2569 arglen = Py_MAX(precision, width); 2570 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) 
2571 return NULL; 2572 2573 if (width > precision) { 2574 Py_UCS4 fillchar; 2575 fill = width - precision; 2576 fillchar = zeropad?'0':' '; 2577 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) 2578 return NULL; 2579 writer->pos += fill; 2580 } 2581 if (precision > len) { 2582 fill = precision - len; 2583 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) 2584 return NULL; 2585 writer->pos += fill; 2586 } 2587 2588 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) 2589 return NULL; 2590 break; 2591 } 2592 2593 case 'p': 2594 { 2595 char number[MAX_LONG_LONG_CHARS]; 2596 2597 len = sprintf(number, "%p", va_arg(*vargs, void*)); 2598 assert(len >= 0); 2599 2600 /* %p is ill-defined: ensure leading 0x. */ 2601 if (number[1] == 'X') 2602 number[1] = 'x'; 2603 else if (number[1] != 'x') { 2604 memmove(number + 2, number, 2605 strlen(number) + 1); 2606 number[0] = '0'; 2607 number[1] = 'x'; 2608 len += 2; 2609 } 2610 2611 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) 2612 return NULL; 2613 break; 2614 } 2615 2616 case 's': 2617 { 2618 /* UTF-8 */ 2619 const char *s = va_arg(*vargs, const char*); 2620 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) 2621 return NULL; 2622 break; 2623 } 2624 2625 case 'U': 2626 { 2627 PyObject *obj = va_arg(*vargs, PyObject *); 2628 assert(obj && _PyUnicode_CHECK(obj)); 2629 2630 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2631 return NULL; 2632 break; 2633 } 2634 2635 case 'V': 2636 { 2637 PyObject *obj = va_arg(*vargs, PyObject *); 2638 const char *str = va_arg(*vargs, const char *); 2639 if (obj) { 2640 assert(_PyUnicode_CHECK(obj)); 2641 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2642 return NULL; 2643 } 2644 else { 2645 assert(str != NULL); 2646 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) 2647 return NULL; 2648 } 2649 break; 2650 } 2651 2652 case 'S': 2653 { 2654 
PyObject *obj = va_arg(*vargs, PyObject *); 2655 PyObject *str; 2656 assert(obj); 2657 str = PyObject_Str(obj); 2658 if (!str) 2659 return NULL; 2660 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { 2661 Py_DECREF(str); 2662 return NULL; 2663 } 2664 Py_DECREF(str); 2665 break; 2666 } 2667 2668 case 'R': 2669 { 2670 PyObject *obj = va_arg(*vargs, PyObject *); 2671 PyObject *repr; 2672 assert(obj); 2673 repr = PyObject_Repr(obj); 2674 if (!repr) 2675 return NULL; 2676 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { 2677 Py_DECREF(repr); 2678 return NULL; 2679 } 2680 Py_DECREF(repr); 2681 break; 2682 } 2683 2684 case 'A': 2685 { 2686 PyObject *obj = va_arg(*vargs, PyObject *); 2687 PyObject *ascii; 2688 assert(obj); 2689 ascii = PyObject_ASCII(obj); 2690 if (!ascii) 2691 return NULL; 2692 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { 2693 Py_DECREF(ascii); 2694 return NULL; 2695 } 2696 Py_DECREF(ascii); 2697 break; 2698 } 2699 2700 case '%': 2701 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 2702 return NULL; 2703 break; 2704 2705 default: 2706 /* if we stumble upon an unknown formatting code, copy the rest 2707 of the format string to the output string. (we cannot just 2708 skip the code, since there's no way to know what's in the 2709 argument list) */ 2710 len = strlen(p); 2711 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) 2712 return NULL; 2713 f = p+len; 2714 return f; 2715 } 2716 2717 f++; 2718 return f; 2719} 2720 2721PyObject * 2722PyUnicode_FromFormatV(const char *format, va_list vargs) 2723{ 2724 va_list vargs2; 2725 const char *f; 2726 _PyUnicodeWriter writer; 2727 2728 _PyUnicodeWriter_Init(&writer); 2729 writer.min_length = strlen(format) + 100; 2730 writer.overallocate = 1; 2731 2732 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). 2733 Copy it to be able to pass a reference to a subfunction. 
*/ 2734 Py_VA_COPY(vargs2, vargs); 2735 2736 for (f = format; *f; ) { 2737 if (*f == '%') { 2738 f = unicode_fromformat_arg(&writer, f, &vargs2); 2739 if (f == NULL) 2740 goto fail; 2741 } 2742 else { 2743 const char *p; 2744 Py_ssize_t len; 2745 2746 p = f; 2747 do 2748 { 2749 if ((unsigned char)*p > 127) { 2750 PyErr_Format(PyExc_ValueError, 2751 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2752 "string, got a non-ASCII byte: 0x%02x", 2753 (unsigned char)*p); 2754 return NULL; 2755 } 2756 p++; 2757 } 2758 while (*p != '\0' && *p != '%'); 2759 len = p - f; 2760 2761 if (*p == '\0') 2762 writer.overallocate = 0; 2763 2764 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) 2765 goto fail; 2766 2767 f = p; 2768 } 2769 } 2770 return _PyUnicodeWriter_Finish(&writer); 2771 2772 fail: 2773 _PyUnicodeWriter_Dealloc(&writer); 2774 return NULL; 2775} 2776 2777PyObject * 2778PyUnicode_FromFormat(const char *format, ...) 2779{ 2780 PyObject* ret; 2781 va_list vargs; 2782 2783#ifdef HAVE_STDARG_PROTOTYPES 2784 va_start(vargs, format); 2785#else 2786 va_start(vargs); 2787#endif 2788 ret = PyUnicode_FromFormatV(format, vargs); 2789 va_end(vargs); 2790 return ret; 2791} 2792 2793#ifdef HAVE_WCHAR_H 2794 2795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2796 convert a Unicode object to a wide character string. 2797 2798 - If w is NULL: return the number of wide characters (including the null 2799 character) required to convert the unicode object. Ignore size argument. 2800 2801 - Otherwise: return the number of wide characters (excluding the null 2802 character) written into w. Write at most size wide characters (including 2803 the null character). 
*/ 2804static Py_ssize_t 2805unicode_aswidechar(PyObject *unicode, 2806 wchar_t *w, 2807 Py_ssize_t size) 2808{ 2809 Py_ssize_t res; 2810 const wchar_t *wstr; 2811 2812 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2813 if (wstr == NULL) 2814 return -1; 2815 2816 if (w != NULL) { 2817 if (size > res) 2818 size = res + 1; 2819 else 2820 res = size; 2821 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2822 return res; 2823 } 2824 else 2825 return res + 1; 2826} 2827 2828Py_ssize_t 2829PyUnicode_AsWideChar(PyObject *unicode, 2830 wchar_t *w, 2831 Py_ssize_t size) 2832{ 2833 if (unicode == NULL) { 2834 PyErr_BadInternalCall(); 2835 return -1; 2836 } 2837 return unicode_aswidechar(unicode, w, size); 2838} 2839 2840wchar_t* 2841PyUnicode_AsWideCharString(PyObject *unicode, 2842 Py_ssize_t *size) 2843{ 2844 wchar_t* buffer; 2845 Py_ssize_t buflen; 2846 2847 if (unicode == NULL) { 2848 PyErr_BadInternalCall(); 2849 return NULL; 2850 } 2851 2852 buflen = unicode_aswidechar(unicode, NULL, 0); 2853 if (buflen == -1) 2854 return NULL; 2855 buffer = PyMem_NEW(wchar_t, buflen); 2856 if (buffer == NULL) { 2857 PyErr_NoMemory(); 2858 return NULL; 2859 } 2860 buflen = unicode_aswidechar(unicode, buffer, buflen); 2861 if (buflen == -1) { 2862 PyMem_FREE(buffer); 2863 return NULL; 2864 } 2865 if (size != NULL) 2866 *size = buflen; 2867 return buffer; 2868} 2869 2870#endif /* HAVE_WCHAR_H */ 2871 2872PyObject * 2873PyUnicode_FromOrdinal(int ordinal) 2874{ 2875 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2876 PyErr_SetString(PyExc_ValueError, 2877 "chr() arg not in range(0x110000)"); 2878 return NULL; 2879 } 2880 2881 return unicode_char((Py_UCS4)ordinal); 2882} 2883 2884PyObject * 2885PyUnicode_FromObject(PyObject *obj) 2886{ 2887 /* XXX Perhaps we should make this API an alias of 2888 PyObject_Str() instead ?! 
*/ 2889 if (PyUnicode_CheckExact(obj)) { 2890 if (PyUnicode_READY(obj) == -1) 2891 return NULL; 2892 Py_INCREF(obj); 2893 return obj; 2894 } 2895 if (PyUnicode_Check(obj)) { 2896 /* For a Unicode subtype that's not a Unicode object, 2897 return a true Unicode object with the same data. */ 2898 return _PyUnicode_Copy(obj); 2899 } 2900 PyErr_Format(PyExc_TypeError, 2901 "Can't convert '%.100s' object to str implicitly", 2902 Py_TYPE(obj)->tp_name); 2903 return NULL; 2904} 2905 2906PyObject * 2907PyUnicode_FromEncodedObject(PyObject *obj, 2908 const char *encoding, 2909 const char *errors) 2910{ 2911 Py_buffer buffer; 2912 PyObject *v; 2913 2914 if (obj == NULL) { 2915 PyErr_BadInternalCall(); 2916 return NULL; 2917 } 2918 2919 /* Decoding bytes objects is the most common case and should be fast */ 2920 if (PyBytes_Check(obj)) { 2921 if (PyBytes_GET_SIZE(obj) == 0) 2922 _Py_RETURN_UNICODE_EMPTY(); 2923 v = PyUnicode_Decode( 2924 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2925 encoding, errors); 2926 return v; 2927 } 2928 2929 if (PyUnicode_Check(obj)) { 2930 PyErr_SetString(PyExc_TypeError, 2931 "decoding str is not supported"); 2932 return NULL; 2933 } 2934 2935 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2936 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2937 PyErr_Format(PyExc_TypeError, 2938 "coercing to str: need a bytes-like object, %.80s found", 2939 Py_TYPE(obj)->tp_name); 2940 return NULL; 2941 } 2942 2943 if (buffer.len == 0) { 2944 PyBuffer_Release(&buffer); 2945 _Py_RETURN_UNICODE_EMPTY(); 2946 } 2947 2948 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2949 PyBuffer_Release(&buffer); 2950 return v; 2951} 2952 2953/* Convert encoding to lower case and replace '_' with '-' in order to 2954 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2955 1 on success. 
*/ 2956int 2957_Py_normalize_encoding(const char *encoding, 2958 char *lower, 2959 size_t lower_len) 2960{ 2961 const char *e; 2962 char *l; 2963 char *l_end; 2964 2965 if (encoding == NULL) { 2966 /* 6 == strlen("utf-8") + 1 */ 2967 if (lower_len < 6) 2968 return 0; 2969 strcpy(lower, "utf-8"); 2970 return 1; 2971 } 2972 e = encoding; 2973 l = lower; 2974 l_end = &lower[lower_len - 1]; 2975 while (*e) { 2976 if (l == l_end) 2977 return 0; 2978 if (Py_ISUPPER(*e)) { 2979 *l++ = Py_TOLOWER(*e++); 2980 } 2981 else if (*e == '_') { 2982 *l++ = '-'; 2983 e++; 2984 } 2985 else { 2986 *l++ = *e++; 2987 } 2988 } 2989 *l = '\0'; 2990 return 1; 2991} 2992 2993PyObject * 2994PyUnicode_Decode(const char *s, 2995 Py_ssize_t size, 2996 const char *encoding, 2997 const char *errors) 2998{ 2999 PyObject *buffer = NULL, *unicode; 3000 Py_buffer info; 3001 char lower[11]; /* Enough for any encoding shortcut */ 3002 3003 /* Shortcuts for common default encodings */ 3004 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 3005 if ((strcmp(lower, "utf-8") == 0) || 3006 (strcmp(lower, "utf8") == 0)) 3007 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3008 else if ((strcmp(lower, "latin-1") == 0) || 3009 (strcmp(lower, "latin1") == 0) || 3010 (strcmp(lower, "iso-8859-1") == 0) || 3011 (strcmp(lower, "iso8859-1") == 0)) 3012 return PyUnicode_DecodeLatin1(s, size, errors); 3013#ifdef HAVE_MBCS 3014 else if (strcmp(lower, "mbcs") == 0) 3015 return PyUnicode_DecodeMBCS(s, size, errors); 3016#endif 3017 else if (strcmp(lower, "ascii") == 0) 3018 return PyUnicode_DecodeASCII(s, size, errors); 3019 else if (strcmp(lower, "utf-16") == 0) 3020 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3021 else if (strcmp(lower, "utf-32") == 0) 3022 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3023 } 3024 3025 /* Decode via the codec registry */ 3026 buffer = NULL; 3027 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3028 goto onError; 3029 buffer = 
PyMemoryView_FromBuffer(&info); 3030 if (buffer == NULL) 3031 goto onError; 3032 unicode = _PyCodec_DecodeText(buffer, encoding, errors); 3033 if (unicode == NULL) 3034 goto onError; 3035 if (!PyUnicode_Check(unicode)) { 3036 PyErr_Format(PyExc_TypeError, 3037 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3038 "use codecs.decode() to decode to arbitrary types", 3039 encoding, 3040 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3041 Py_DECREF(unicode); 3042 goto onError; 3043 } 3044 Py_DECREF(buffer); 3045 return unicode_result(unicode); 3046 3047 onError: 3048 Py_XDECREF(buffer); 3049 return NULL; 3050} 3051 3052PyObject * 3053PyUnicode_AsDecodedObject(PyObject *unicode, 3054 const char *encoding, 3055 const char *errors) 3056{ 3057 PyObject *v; 3058 3059 if (!PyUnicode_Check(unicode)) { 3060 PyErr_BadArgument(); 3061 goto onError; 3062 } 3063 3064 if (encoding == NULL) 3065 encoding = PyUnicode_GetDefaultEncoding(); 3066 3067 /* Decode via the codec registry */ 3068 v = PyCodec_Decode(unicode, encoding, errors); 3069 if (v == NULL) 3070 goto onError; 3071 return unicode_result(v); 3072 3073 onError: 3074 return NULL; 3075} 3076 3077PyObject * 3078PyUnicode_AsDecodedUnicode(PyObject *unicode, 3079 const char *encoding, 3080 const char *errors) 3081{ 3082 PyObject *v; 3083 3084 if (!PyUnicode_Check(unicode)) { 3085 PyErr_BadArgument(); 3086 goto onError; 3087 } 3088 3089 if (encoding == NULL) 3090 encoding = PyUnicode_GetDefaultEncoding(); 3091 3092 /* Decode via the codec registry */ 3093 v = PyCodec_Decode(unicode, encoding, errors); 3094 if (v == NULL) 3095 goto onError; 3096 if (!PyUnicode_Check(v)) { 3097 PyErr_Format(PyExc_TypeError, 3098 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3099 "use codecs.decode() to decode to arbitrary types", 3100 encoding, 3101 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3102 Py_DECREF(v); 3103 goto onError; 3104 } 3105 return unicode_result(v); 3106 3107 onError: 3108 return NULL; 3109} 
3110 3111PyObject * 3112PyUnicode_Encode(const Py_UNICODE *s, 3113 Py_ssize_t size, 3114 const char *encoding, 3115 const char *errors) 3116{ 3117 PyObject *v, *unicode; 3118 3119 unicode = PyUnicode_FromUnicode(s, size); 3120 if (unicode == NULL) 3121 return NULL; 3122 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3123 Py_DECREF(unicode); 3124 return v; 3125} 3126 3127PyObject * 3128PyUnicode_AsEncodedObject(PyObject *unicode, 3129 const char *encoding, 3130 const char *errors) 3131{ 3132 PyObject *v; 3133 3134 if (!PyUnicode_Check(unicode)) { 3135 PyErr_BadArgument(); 3136 goto onError; 3137 } 3138 3139 if (encoding == NULL) 3140 encoding = PyUnicode_GetDefaultEncoding(); 3141 3142 /* Encode via the codec registry */ 3143 v = PyCodec_Encode(unicode, encoding, errors); 3144 if (v == NULL) 3145 goto onError; 3146 return v; 3147 3148 onError: 3149 return NULL; 3150} 3151 3152static size_t 3153wcstombs_errorpos(const wchar_t *wstr) 3154{ 3155 size_t len; 3156#if SIZEOF_WCHAR_T == 2 3157 wchar_t buf[3]; 3158#else 3159 wchar_t buf[2]; 3160#endif 3161 char outbuf[MB_LEN_MAX]; 3162 const wchar_t *start, *previous; 3163 3164#if SIZEOF_WCHAR_T == 2 3165 buf[2] = 0; 3166#else 3167 buf[1] = 0; 3168#endif 3169 start = wstr; 3170 while (*wstr != L'\0') 3171 { 3172 previous = wstr; 3173#if SIZEOF_WCHAR_T == 2 3174 if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) 3175 && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) 3176 { 3177 buf[0] = wstr[0]; 3178 buf[1] = wstr[1]; 3179 wstr += 2; 3180 } 3181 else { 3182 buf[0] = *wstr; 3183 buf[1] = 0; 3184 wstr++; 3185 } 3186#else 3187 buf[0] = *wstr; 3188 wstr++; 3189#endif 3190 len = wcstombs(outbuf, buf, sizeof(outbuf)); 3191 if (len == (size_t)-1) 3192 return previous - start; 3193 } 3194 3195 /* failed to find the unencodable character */ 3196 return 0; 3197} 3198 3199static int 3200locale_error_handler(const char *errors, int *surrogateescape) 3201{ 3202 if (errors == NULL) { 3203 *surrogateescape = 0; 3204 return 0; 3205 } 3206 3207 if 
(strcmp(errors, "strict") == 0) { 3208 *surrogateescape = 0; 3209 return 0; 3210 } 3211 if (strcmp(errors, "surrogateescape") == 0) { 3212 *surrogateescape = 1; 3213 return 0; 3214 } 3215 PyErr_Format(PyExc_ValueError, 3216 "only 'strict' and 'surrogateescape' error handlers " 3217 "are supported, not '%s'", 3218 errors); 3219 return -1; 3220} 3221 3222PyObject * 3223PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3224{ 3225 Py_ssize_t wlen, wlen2; 3226 wchar_t *wstr; 3227 PyObject *bytes = NULL; 3228 char *errmsg; 3229 PyObject *reason = NULL; 3230 PyObject *exc; 3231 size_t error_pos; 3232 int surrogateescape; 3233 3234 if (locale_error_handler(errors, &surrogateescape) < 0) 3235 return NULL; 3236 3237 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3238 if (wstr == NULL) 3239 return NULL; 3240 3241 wlen2 = wcslen(wstr); 3242 if (wlen2 != wlen) { 3243 PyMem_Free(wstr); 3244 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3245 return NULL; 3246 } 3247 3248 if (surrogateescape) { 3249 /* "surrogateescape" error handler */ 3250 char *str; 3251 3252 str = _Py_wchar2char(wstr, &error_pos); 3253 if (str == NULL) { 3254 if (error_pos == (size_t)-1) { 3255 PyErr_NoMemory(); 3256 PyMem_Free(wstr); 3257 return NULL; 3258 } 3259 else { 3260 goto encode_error; 3261 } 3262 } 3263 PyMem_Free(wstr); 3264 3265 bytes = PyBytes_FromString(str); 3266 PyMem_Free(str); 3267 } 3268 else { 3269 /* strict mode */ 3270 size_t len, len2; 3271 3272 len = wcstombs(NULL, wstr, 0); 3273 if (len == (size_t)-1) { 3274 error_pos = (size_t)-1; 3275 goto encode_error; 3276 } 3277 3278 bytes = PyBytes_FromStringAndSize(NULL, len); 3279 if (bytes == NULL) { 3280 PyMem_Free(wstr); 3281 return NULL; 3282 } 3283 3284 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3285 if (len2 == (size_t)-1 || len2 > len) { 3286 error_pos = (size_t)-1; 3287 goto encode_error; 3288 } 3289 PyMem_Free(wstr); 3290 } 3291 return bytes; 3292 3293encode_error: 3294 errmsg = 
strerror(errno); 3295 assert(errmsg != NULL); 3296 3297 if (error_pos == (size_t)-1) 3298 error_pos = wcstombs_errorpos(wstr); 3299 3300 PyMem_Free(wstr); 3301 Py_XDECREF(bytes); 3302 3303 if (errmsg != NULL) { 3304 size_t errlen; 3305 wstr = _Py_char2wchar(errmsg, &errlen); 3306 if (wstr != NULL) { 3307 reason = PyUnicode_FromWideChar(wstr, errlen); 3308 PyMem_RawFree(wstr); 3309 } else 3310 errmsg = NULL; 3311 } 3312 if (errmsg == NULL) 3313 reason = PyUnicode_FromString( 3314 "wcstombs() encountered an unencodable " 3315 "wide character"); 3316 if (reason == NULL) 3317 return NULL; 3318 3319 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3320 "locale", unicode, 3321 (Py_ssize_t)error_pos, 3322 (Py_ssize_t)(error_pos+1), 3323 reason); 3324 Py_DECREF(reason); 3325 if (exc != NULL) { 3326 PyCodec_StrictErrors(exc); 3327 Py_XDECREF(exc); 3328 } 3329 return NULL; 3330} 3331 3332PyObject * 3333PyUnicode_EncodeFSDefault(PyObject *unicode) 3334{ 3335#ifdef HAVE_MBCS 3336 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 3337#elif defined(__APPLE__) 3338 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3339#else 3340 PyInterpreterState *interp = PyThreadState_GET()->interp; 3341 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3342 cannot use it to encode and decode filenames before it is loaded. Load 3343 the Python codec requires to encode at least its own filename. Use the C 3344 version of the locale codec until the codec registry is initialized and 3345 the Python codec is loaded. 3346 3347 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3348 cannot only rely on it: check also interp->fscodec_initialized for 3349 subinterpreters. 
*/ 3350 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3351 return PyUnicode_AsEncodedString(unicode, 3352 Py_FileSystemDefaultEncoding, 3353 "surrogateescape"); 3354 } 3355 else { 3356 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3357 } 3358#endif 3359} 3360 3361PyObject * 3362PyUnicode_AsEncodedString(PyObject *unicode, 3363 const char *encoding, 3364 const char *errors) 3365{ 3366 PyObject *v; 3367 char lower[11]; /* Enough for any encoding shortcut */ 3368 3369 if (!PyUnicode_Check(unicode)) { 3370 PyErr_BadArgument(); 3371 return NULL; 3372 } 3373 3374 /* Shortcuts for common default encodings */ 3375 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 3376 if ((strcmp(lower, "utf-8") == 0) || 3377 (strcmp(lower, "utf8") == 0)) 3378 { 3379 if (errors == NULL || strcmp(errors, "strict") == 0) 3380 return _PyUnicode_AsUTF8String(unicode, NULL); 3381 else 3382 return _PyUnicode_AsUTF8String(unicode, errors); 3383 } 3384 else if ((strcmp(lower, "latin-1") == 0) || 3385 (strcmp(lower, "latin1") == 0) || 3386 (strcmp(lower, "iso-8859-1") == 0) || 3387 (strcmp(lower, "iso8859-1") == 0)) 3388 return _PyUnicode_AsLatin1String(unicode, errors); 3389#ifdef HAVE_MBCS 3390 else if (strcmp(lower, "mbcs") == 0) 3391 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3392#endif 3393 else if (strcmp(lower, "ascii") == 0) 3394 return _PyUnicode_AsASCIIString(unicode, errors); 3395 } 3396 3397 /* Encode via the codec registry */ 3398 v = _PyCodec_EncodeText(unicode, encoding, errors); 3399 if (v == NULL) 3400 return NULL; 3401 3402 /* The normal path */ 3403 if (PyBytes_Check(v)) 3404 return v; 3405 3406 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3407 if (PyByteArray_Check(v)) { 3408 int error; 3409 PyObject *b; 3410 3411 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3412 "encoder %s returned bytearray instead of bytes; " 3413 "use codecs.encode() to encode to arbitrary types", 3414 encoding); 
3415 if (error) { 3416 Py_DECREF(v); 3417 return NULL; 3418 } 3419 3420 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3421 Py_DECREF(v); 3422 return b; 3423 } 3424 3425 PyErr_Format(PyExc_TypeError, 3426 "'%.400s' encoder returned '%.400s' instead of 'bytes'; " 3427 "use codecs.encode() to encode to arbitrary types", 3428 encoding, 3429 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name); 3430 Py_DECREF(v); 3431 return NULL; 3432} 3433 3434PyObject * 3435PyUnicode_AsEncodedUnicode(PyObject *unicode, 3436 const char *encoding, 3437 const char *errors) 3438{ 3439 PyObject *v; 3440 3441 if (!PyUnicode_Check(unicode)) { 3442 PyErr_BadArgument(); 3443 goto onError; 3444 } 3445 3446 if (encoding == NULL) 3447 encoding = PyUnicode_GetDefaultEncoding(); 3448 3449 /* Encode via the codec registry */ 3450 v = PyCodec_Encode(unicode, encoding, errors); 3451 if (v == NULL) 3452 goto onError; 3453 if (!PyUnicode_Check(v)) { 3454 PyErr_Format(PyExc_TypeError, 3455 "'%.400s' encoder returned '%.400s' instead of 'str'; " 3456 "use codecs.encode() to encode to arbitrary types", 3457 encoding, 3458 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name); 3459 Py_DECREF(v); 3460 goto onError; 3461 } 3462 return v; 3463 3464 onError: 3465 return NULL; 3466} 3467 3468static size_t 3469mbstowcs_errorpos(const char *str, size_t len) 3470{ 3471#ifdef HAVE_MBRTOWC 3472 const char *start = str; 3473 mbstate_t mbs; 3474 size_t converted; 3475 wchar_t ch; 3476 3477 memset(&mbs, 0, sizeof mbs); 3478 while (len) 3479 { 3480 converted = mbrtowc(&ch, (char*)str, len, &mbs); 3481 if (converted == 0) 3482 /* Reached end of string */ 3483 break; 3484 if (converted == (size_t)-1 || converted == (size_t)-2) { 3485 /* Conversion error or incomplete character */ 3486 return str - start; 3487 } 3488 else { 3489 str += converted; 3490 len -= converted; 3491 } 3492 } 3493 /* failed to find the undecodable byte sequence */ 3494 return 0; 3495#endif 3496 return 0; 3497} 3498 3499PyObject* 
3500PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3501 const char *errors) 3502{ 3503 wchar_t smallbuf[256]; 3504 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3505 wchar_t *wstr; 3506 size_t wlen, wlen2; 3507 PyObject *unicode; 3508 int surrogateescape; 3509 size_t error_pos; 3510 char *errmsg; 3511 PyObject *reason, *exc; 3512 3513 if (locale_error_handler(errors, &surrogateescape) < 0) 3514 return NULL; 3515 3516 if (str[len] != '\0' || len != strlen(str)) { 3517 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3518 return NULL; 3519 } 3520 3521 if (surrogateescape) { 3522 /* "surrogateescape" error handler */ 3523 wstr = _Py_char2wchar(str, &wlen); 3524 if (wstr == NULL) { 3525 if (wlen == (size_t)-1) 3526 PyErr_NoMemory(); 3527 else 3528 PyErr_SetFromErrno(PyExc_OSError); 3529 return NULL; 3530 } 3531 3532 unicode = PyUnicode_FromWideChar(wstr, wlen); 3533 PyMem_RawFree(wstr); 3534 } 3535 else { 3536 /* strict mode */ 3537#ifndef HAVE_BROKEN_MBSTOWCS 3538 wlen = mbstowcs(NULL, str, 0); 3539#else 3540 wlen = len; 3541#endif 3542 if (wlen == (size_t)-1) 3543 goto decode_error; 3544 if (wlen+1 <= smallbuf_len) { 3545 wstr = smallbuf; 3546 } 3547 else { 3548 wstr = PyMem_New(wchar_t, wlen+1); 3549 if (!wstr) 3550 return PyErr_NoMemory(); 3551 } 3552 3553 wlen2 = mbstowcs(wstr, str, wlen+1); 3554 if (wlen2 == (size_t)-1) { 3555 if (wstr != smallbuf) 3556 PyMem_Free(wstr); 3557 goto decode_error; 3558 } 3559#ifdef HAVE_BROKEN_MBSTOWCS 3560 assert(wlen2 == wlen); 3561#endif 3562 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3563 if (wstr != smallbuf) 3564 PyMem_Free(wstr); 3565 } 3566 return unicode; 3567 3568decode_error: 3569 reason = NULL; 3570 errmsg = strerror(errno); 3571 assert(errmsg != NULL); 3572 3573 error_pos = mbstowcs_errorpos(str, len); 3574 if (errmsg != NULL) { 3575 size_t errlen; 3576 wstr = _Py_char2wchar(errmsg, &errlen); 3577 if (wstr != NULL) { 3578 reason = PyUnicode_FromWideChar(wstr, errlen); 3579 
PyMem_RawFree(wstr); 3580 } 3581 } 3582 if (reason == NULL) 3583 reason = PyUnicode_FromString( 3584 "mbstowcs() encountered an invalid multibyte sequence"); 3585 if (reason == NULL) 3586 return NULL; 3587 3588 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3589 "locale", str, len, 3590 (Py_ssize_t)error_pos, 3591 (Py_ssize_t)(error_pos+1), 3592 reason); 3593 Py_DECREF(reason); 3594 if (exc != NULL) { 3595 PyCodec_StrictErrors(exc); 3596 Py_XDECREF(exc); 3597 } 3598 return NULL; 3599} 3600 3601PyObject* 3602PyUnicode_DecodeLocale(const char *str, const char *errors) 3603{ 3604 Py_ssize_t size = (Py_ssize_t)strlen(str); 3605 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3606} 3607 3608 3609PyObject* 3610PyUnicode_DecodeFSDefault(const char *s) { 3611 Py_ssize_t size = (Py_ssize_t)strlen(s); 3612 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3613} 3614 3615PyObject* 3616PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3617{ 3618#ifdef HAVE_MBCS 3619 return PyUnicode_DecodeMBCS(s, size, NULL); 3620#elif defined(__APPLE__) 3621 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3622#else 3623 PyInterpreterState *interp = PyThreadState_GET()->interp; 3624 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3625 cannot use it to encode and decode filenames before it is loaded. Load 3626 the Python codec requires to encode at least its own filename. Use the C 3627 version of the locale codec until the codec registry is initialized and 3628 the Python codec is loaded. 3629 3630 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3631 cannot only rely on it: check also interp->fscodec_initialized for 3632 subinterpreters. 
*/ 3633 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3634 return PyUnicode_Decode(s, size, 3635 Py_FileSystemDefaultEncoding, 3636 "surrogateescape"); 3637 } 3638 else { 3639 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3640 } 3641#endif 3642} 3643 3644 3645int 3646_PyUnicode_HasNULChars(PyObject* str) 3647{ 3648 Py_ssize_t pos; 3649 3650 if (PyUnicode_READY(str) == -1) 3651 return -1; 3652 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str), 3653 PyUnicode_GET_LENGTH(str), '\0', 1); 3654 if (pos == -1) 3655 return 0; 3656 else 3657 return 1; 3658} 3659 3660int 3661PyUnicode_FSConverter(PyObject* arg, void* addr) 3662{ 3663 PyObject *output = NULL; 3664 Py_ssize_t size; 3665 void *data; 3666 if (arg == NULL) { 3667 Py_DECREF(*(PyObject**)addr); 3668 return 1; 3669 } 3670 if (PyBytes_Check(arg)) { 3671 output = arg; 3672 Py_INCREF(output); 3673 } 3674 else { 3675 arg = PyUnicode_FromObject(arg); 3676 if (!arg) 3677 return 0; 3678 output = PyUnicode_EncodeFSDefault(arg); 3679 Py_DECREF(arg); 3680 if (!output) 3681 return 0; 3682 if (!PyBytes_Check(output)) { 3683 Py_DECREF(output); 3684 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3685 return 0; 3686 } 3687 } 3688 size = PyBytes_GET_SIZE(output); 3689 data = PyBytes_AS_STRING(output); 3690 if (size != strlen(data)) { 3691 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3692 Py_DECREF(output); 3693 return 0; 3694 } 3695 *(PyObject**)addr = output; 3696 return Py_CLEANUP_SUPPORTED; 3697} 3698 3699 3700int 3701PyUnicode_FSDecoder(PyObject* arg, void* addr) 3702{ 3703 PyObject *output = NULL; 3704 if (arg == NULL) { 3705 Py_DECREF(*(PyObject**)addr); 3706 return 1; 3707 } 3708 if (PyUnicode_Check(arg)) { 3709 if (PyUnicode_READY(arg) == -1) 3710 return 0; 3711 output = arg; 3712 Py_INCREF(output); 3713 } 3714 else { 3715 arg = PyBytes_FromObject(arg); 3716 if (!arg) 3717 return 0; 3718 output = 
PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3719 PyBytes_GET_SIZE(arg)); 3720 Py_DECREF(arg); 3721 if (!output) 3722 return 0; 3723 if (!PyUnicode_Check(output)) { 3724 Py_DECREF(output); 3725 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3726 return 0; 3727 } 3728 } 3729 if (PyUnicode_READY(output) == -1) { 3730 Py_DECREF(output); 3731 return 0; 3732 } 3733 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3734 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3735 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3736 Py_DECREF(output); 3737 return 0; 3738 } 3739 *(PyObject**)addr = output; 3740 return Py_CLEANUP_SUPPORTED; 3741} 3742 3743 3744char* 3745PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3746{ 3747 PyObject *bytes; 3748 3749 if (!PyUnicode_Check(unicode)) { 3750 PyErr_BadArgument(); 3751 return NULL; 3752 } 3753 if (PyUnicode_READY(unicode) == -1) 3754 return NULL; 3755 3756 if (PyUnicode_UTF8(unicode) == NULL) { 3757 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3758 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3759 if (bytes == NULL) 3760 return NULL; 3761 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3762 if (_PyUnicode_UTF8(unicode) == NULL) { 3763 PyErr_NoMemory(); 3764 Py_DECREF(bytes); 3765 return NULL; 3766 } 3767 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3768 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3769 PyBytes_AS_STRING(bytes), 3770 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3771 Py_DECREF(bytes); 3772 } 3773 3774 if (psize) 3775 *psize = PyUnicode_UTF8_LENGTH(unicode); 3776 return PyUnicode_UTF8(unicode); 3777} 3778 3779char* 3780PyUnicode_AsUTF8(PyObject *unicode) 3781{ 3782 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3783} 3784 3785Py_UNICODE * 3786PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3787{ 3788 const unsigned char *one_byte; 3789#if SIZEOF_WCHAR_T == 4 3790 const Py_UCS2 *two_bytes; 3791#else 3792 
const Py_UCS4 *four_bytes; 3793 const Py_UCS4 *ucs4_end; 3794 Py_ssize_t num_surrogates; 3795#endif 3796 wchar_t *w; 3797 wchar_t *wchar_end; 3798 3799 if (!PyUnicode_Check(unicode)) { 3800 PyErr_BadArgument(); 3801 return NULL; 3802 } 3803 if (_PyUnicode_WSTR(unicode) == NULL) { 3804 /* Non-ASCII compact unicode object */ 3805 assert(_PyUnicode_KIND(unicode) != 0); 3806 assert(PyUnicode_IS_READY(unicode)); 3807 3808 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3809#if SIZEOF_WCHAR_T == 2 3810 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3811 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3812 num_surrogates = 0; 3813 3814 for (; four_bytes < ucs4_end; ++four_bytes) { 3815 if (*four_bytes > 0xFFFF) 3816 ++num_surrogates; 3817 } 3818 3819 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3820 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3821 if (!_PyUnicode_WSTR(unicode)) { 3822 PyErr_NoMemory(); 3823 return NULL; 3824 } 3825 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3826 3827 w = _PyUnicode_WSTR(unicode); 3828 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3829 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3830 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3831 if (*four_bytes > 0xFFFF) { 3832 assert(*four_bytes <= MAX_UNICODE); 3833 /* encode surrogate pair in this case */ 3834 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3835 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3836 } 3837 else 3838 *w = *four_bytes; 3839 3840 if (w > wchar_end) { 3841 assert(0 && "Miscalculated string end"); 3842 } 3843 } 3844 *w = 0; 3845#else 3846 /* sizeof(wchar_t) == 4 */ 3847 Py_FatalError("Impossible unicode object state, wstr and str " 3848 "should share memory already."); 3849 return NULL; 3850#endif 3851 } 3852 else { 3853 if ((size_t)_PyUnicode_LENGTH(unicode) > 3854 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 3855 PyErr_NoMemory(); 3856 return NULL; 3857 } 3858 _PyUnicode_WSTR(unicode) = 
(wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3859 (_PyUnicode_LENGTH(unicode) + 1)); 3860 if (!_PyUnicode_WSTR(unicode)) { 3861 PyErr_NoMemory(); 3862 return NULL; 3863 } 3864 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3865 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3866 w = _PyUnicode_WSTR(unicode); 3867 wchar_end = w + _PyUnicode_LENGTH(unicode); 3868 3869 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3870 one_byte = PyUnicode_1BYTE_DATA(unicode); 3871 for (; w < wchar_end; ++one_byte, ++w) 3872 *w = *one_byte; 3873 /* null-terminate the wstr */ 3874 *w = 0; 3875 } 3876 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3877#if SIZEOF_WCHAR_T == 4 3878 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3879 for (; w < wchar_end; ++two_bytes, ++w) 3880 *w = *two_bytes; 3881 /* null-terminate the wstr */ 3882 *w = 0; 3883#else 3884 /* sizeof(wchar_t) == 2 */ 3885 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3886 _PyUnicode_WSTR(unicode) = NULL; 3887 Py_FatalError("Impossible unicode object state, wstr " 3888 "and str should share memory already."); 3889 return NULL; 3890#endif 3891 } 3892 else { 3893 assert(0 && "This should never happen."); 3894 } 3895 } 3896 } 3897 if (size != NULL) 3898 *size = PyUnicode_WSTR_LENGTH(unicode); 3899 return _PyUnicode_WSTR(unicode); 3900} 3901 3902Py_UNICODE * 3903PyUnicode_AsUnicode(PyObject *unicode) 3904{ 3905 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3906} 3907 3908 3909Py_ssize_t 3910PyUnicode_GetSize(PyObject *unicode) 3911{ 3912 if (!PyUnicode_Check(unicode)) { 3913 PyErr_BadArgument(); 3914 goto onError; 3915 } 3916 return PyUnicode_GET_SIZE(unicode); 3917 3918 onError: 3919 return -1; 3920} 3921 3922Py_ssize_t 3923PyUnicode_GetLength(PyObject *unicode) 3924{ 3925 if (!PyUnicode_Check(unicode)) { 3926 PyErr_BadArgument(); 3927 return -1; 3928 } 3929 if (PyUnicode_READY(unicode) == -1) 3930 return -1; 3931 return PyUnicode_GET_LENGTH(unicode); 3932} 3933 3934Py_UCS4 
3935PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3936{ 3937 void *data; 3938 int kind; 3939 3940 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3941 PyErr_BadArgument(); 3942 return (Py_UCS4)-1; 3943 } 3944 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3945 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3946 return (Py_UCS4)-1; 3947 } 3948 data = PyUnicode_DATA(unicode); 3949 kind = PyUnicode_KIND(unicode); 3950 return PyUnicode_READ(kind, data, index); 3951} 3952 3953int 3954PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3955{ 3956 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3957 PyErr_BadArgument(); 3958 return -1; 3959 } 3960 assert(PyUnicode_IS_READY(unicode)); 3961 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3962 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3963 return -1; 3964 } 3965 if (unicode_check_modifiable(unicode)) 3966 return -1; 3967 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 3968 PyErr_SetString(PyExc_ValueError, "character out of range"); 3969 return -1; 3970 } 3971 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3972 index, ch); 3973 return 0; 3974} 3975 3976const char * 3977PyUnicode_GetDefaultEncoding(void) 3978{ 3979 return "utf-8"; 3980} 3981 3982/* create or adjust a UnicodeDecodeError */ 3983static void 3984make_decode_exception(PyObject **exceptionObject, 3985 const char *encoding, 3986 const char *input, Py_ssize_t length, 3987 Py_ssize_t startpos, Py_ssize_t endpos, 3988 const char *reason) 3989{ 3990 if (*exceptionObject == NULL) { 3991 *exceptionObject = PyUnicodeDecodeError_Create( 3992 encoding, input, length, startpos, endpos, reason); 3993 } 3994 else { 3995 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3996 goto onError; 3997 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3998 goto onError; 3999 if 
(PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4000 goto onError; 4001 } 4002 return; 4003 4004onError: 4005 Py_CLEAR(*exceptionObject); 4006} 4007 4008#ifdef HAVE_MBCS 4009/* error handling callback helper: 4010 build arguments, call the callback and check the arguments, 4011 if no exception occurred, copy the replacement to the output 4012 and adjust various state variables. 4013 return 0 on success, -1 on error 4014*/ 4015 4016static int 4017unicode_decode_call_errorhandler_wchar( 4018 const char *errors, PyObject **errorHandler, 4019 const char *encoding, const char *reason, 4020 const char **input, const char **inend, Py_ssize_t *startinpos, 4021 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4022 PyObject **output, Py_ssize_t *outpos) 4023{ 4024 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4025 4026 PyObject *restuple = NULL; 4027 PyObject *repunicode = NULL; 4028 Py_ssize_t outsize; 4029 Py_ssize_t insize; 4030 Py_ssize_t requiredsize; 4031 Py_ssize_t newpos; 4032 PyObject *inputobj = NULL; 4033 wchar_t *repwstr; 4034 Py_ssize_t repwlen; 4035 4036 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND); 4037 outsize = _PyUnicode_WSTR_LENGTH(*output); 4038 4039 if (*errorHandler == NULL) { 4040 *errorHandler = PyCodec_LookupError(errors); 4041 if (*errorHandler == NULL) 4042 goto onError; 4043 } 4044 4045 make_decode_exception(exceptionObject, 4046 encoding, 4047 *input, *inend - *input, 4048 *startinpos, *endinpos, 4049 reason); 4050 if (*exceptionObject == NULL) 4051 goto onError; 4052 4053 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4054 if (restuple == NULL) 4055 goto onError; 4056 if (!PyTuple_Check(restuple)) { 4057 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4058 goto onError; 4059 } 4060 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4061 goto onError; 4062 4063 /* Copy back the bytes variables, which 
might have been modified by the 4064 callback */ 4065 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4066 if (!inputobj) 4067 goto onError; 4068 if (!PyBytes_Check(inputobj)) { 4069 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4070 } 4071 *input = PyBytes_AS_STRING(inputobj); 4072 insize = PyBytes_GET_SIZE(inputobj); 4073 *inend = *input + insize; 4074 /* we can DECREF safely, as the exception has another reference, 4075 so the object won't go away. */ 4076 Py_DECREF(inputobj); 4077 4078 if (newpos<0) 4079 newpos = insize+newpos; 4080 if (newpos<0 || newpos>insize) { 4081 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4082 goto onError; 4083 } 4084 4085 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4086 if (repwstr == NULL) 4087 goto onError; 4088 /* need more space? (at least enough for what we 4089 have+the replacement+the rest of the string (starting 4090 at the new input position), so we won't have to check space 4091 when there are no errors in the rest of the string) */ 4092 requiredsize = *outpos; 4093 if (requiredsize > PY_SSIZE_T_MAX - repwlen) 4094 goto overflow; 4095 requiredsize += repwlen; 4096 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 4097 goto overflow; 4098 requiredsize += insize - newpos; 4099 if (requiredsize > outsize) { 4100 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 4101 requiredsize = 2*outsize; 4102 if (unicode_resize(output, requiredsize) < 0) 4103 goto onError; 4104 } 4105 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4106 *outpos += repwlen; 4107 *endinpos = newpos; 4108 *inptr = *input + newpos; 4109 4110 /* we made it! 
*/ 4111 Py_XDECREF(restuple); 4112 return 0; 4113 4114 overflow: 4115 PyErr_SetString(PyExc_OverflowError, 4116 "decoded result is too long for a Python string"); 4117 4118 onError: 4119 Py_XDECREF(restuple); 4120 return -1; 4121} 4122#endif /* HAVE_MBCS */ 4123 4124static int 4125unicode_decode_call_errorhandler_writer( 4126 const char *errors, PyObject **errorHandler, 4127 const char *encoding, const char *reason, 4128 const char **input, const char **inend, Py_ssize_t *startinpos, 4129 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4130 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4131{ 4132 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4133 4134 PyObject *restuple = NULL; 4135 PyObject *repunicode = NULL; 4136 Py_ssize_t insize; 4137 Py_ssize_t newpos; 4138 Py_ssize_t replen; 4139 PyObject *inputobj = NULL; 4140 4141 if (*errorHandler == NULL) { 4142 *errorHandler = PyCodec_LookupError(errors); 4143 if (*errorHandler == NULL) 4144 goto onError; 4145 } 4146 4147 make_decode_exception(exceptionObject, 4148 encoding, 4149 *input, *inend - *input, 4150 *startinpos, *endinpos, 4151 reason); 4152 if (*exceptionObject == NULL) 4153 goto onError; 4154 4155 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4156 if (restuple == NULL) 4157 goto onError; 4158 if (!PyTuple_Check(restuple)) { 4159 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4160 goto onError; 4161 } 4162 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4163 goto onError; 4164 4165 /* Copy back the bytes variables, which might have been modified by the 4166 callback */ 4167 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4168 if (!inputobj) 4169 goto onError; 4170 if (!PyBytes_Check(inputobj)) { 4171 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4172 } 4173 *input = PyBytes_AS_STRING(inputobj); 4174 insize = 
PyBytes_GET_SIZE(inputobj); 4175 *inend = *input + insize; 4176 /* we can DECREF safely, as the exception has another reference, 4177 so the object won't go away. */ 4178 Py_DECREF(inputobj); 4179 4180 if (newpos<0) 4181 newpos = insize+newpos; 4182 if (newpos<0 || newpos>insize) { 4183 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4184 goto onError; 4185 } 4186 4187 if (PyUnicode_READY(repunicode) < 0) 4188 goto onError; 4189 replen = PyUnicode_GET_LENGTH(repunicode); 4190 if (replen > 1) { 4191 writer->min_length += replen - 1; 4192 writer->overallocate = 1; 4193 if (_PyUnicodeWriter_Prepare(writer, writer->min_length, 4194 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1) 4195 goto onError; 4196 } 4197 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4198 goto onError; 4199 4200 *endinpos = newpos; 4201 *inptr = *input + newpos; 4202 4203 /* we made it! */ 4204 Py_XDECREF(restuple); 4205 return 0; 4206 4207 onError: 4208 Py_XDECREF(restuple); 4209 return -1; 4210} 4211 4212/* --- UTF-7 Codec -------------------------------------------------------- */ 4213 4214/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4215 4216/* Three simple macros defining base-64. */ 4217 4218/* Is c a base-64 character? */ 4219 4220#define IS_BASE64(c) \ 4221 (((c) >= 'A' && (c) <= 'Z') || \ 4222 ((c) >= 'a' && (c) <= 'z') || \ 4223 ((c) >= '0' && (c) <= '9') || \ 4224 (c) == '+' || (c) == '/') 4225 4226/* given that c is a base-64 character, what is its base-64 value? */ 4227 4228#define FROM_BASE64(c) \ 4229 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4230 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4231 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4232 (c) == '+' ? 62 : 63) 4233 4234/* What is the base-64 character of the bottom 6 bits of n? 
*/ 4235 4236#define TO_BASE64(n) \ 4237 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4238 4239/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4240 * decoded as itself. We are permissive on decoding; the only ASCII 4241 * byte not decoding to itself is the + which begins a base64 4242 * string. */ 4243 4244#define DECODE_DIRECT(c) \ 4245 ((c) <= 127 && (c) != '+') 4246 4247/* The UTF-7 encoder treats ASCII characters differently according to 4248 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4249 * the above). See RFC2152. This array identifies these different 4250 * sets: 4251 * 0 : "Set D" 4252 * alphanumeric and '(),-./:? 4253 * 1 : "Set O" 4254 * !"#$%&*;<=>@[]^_`{|} 4255 * 2 : "whitespace" 4256 * ht nl cr sp 4257 * 3 : special (must be base64 encoded) 4258 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 4259 */ 4260 4261static 4262char utf7_category[128] = { 4263/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 4264 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4265/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 4266 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4267/* sp ! " # $ % & ' ( ) * + , - . / */ 4268 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 4269/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 4270 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4271/* @ A B C D E F G H I J K L M N O */ 4272 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4273/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4274 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4275/* ` a b c d e f g h i j k l m n o */ 4276 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4277/* p q r s t u v w x y z { | } ~ del */ 4278 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4279}; 4280 4281/* ENCODE_DIRECT: this character should be encoded as itself. 
The 4282 * answer depends on whether we are encoding set O as itself, and also 4283 * on whether we are encoding whitespace as itself. RFC2152 makes it 4284 * clear that the answers to these questions vary between 4285 * applications, so this code needs to be flexible. */ 4286 4287#define ENCODE_DIRECT(c, directO, directWS) \ 4288 ((c) < 128 && (c) > 0 && \ 4289 ((utf7_category[(c)] == 0) || \ 4290 (directWS && (utf7_category[(c)] == 2)) || \ 4291 (directO && (utf7_category[(c)] == 1)))) 4292 4293PyObject * 4294PyUnicode_DecodeUTF7(const char *s, 4295 Py_ssize_t size, 4296 const char *errors) 4297{ 4298 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 4299} 4300 4301/* The decoder. The only state we preserve is our read position, 4302 * i.e. how many characters we have consumed. So if we end in the 4303 * middle of a shift sequence we have to back off the read position 4304 * and the output to the beginning of the sequence, otherwise we lose 4305 * all the shift state (seen bits, number of bits seen, high 4306 * surrogate). */ 4307 4308PyObject * 4309PyUnicode_DecodeUTF7Stateful(const char *s, 4310 Py_ssize_t size, 4311 const char *errors, 4312 Py_ssize_t *consumed) 4313{ 4314 const char *starts = s; 4315 Py_ssize_t startinpos; 4316 Py_ssize_t endinpos; 4317 const char *e; 4318 _PyUnicodeWriter writer; 4319 const char *errmsg = ""; 4320 int inShift = 0; 4321 Py_ssize_t shiftOutStart; 4322 unsigned int base64bits = 0; 4323 unsigned long base64buffer = 0; 4324 Py_UCS4 surrogate = 0; 4325 PyObject *errorHandler = NULL; 4326 PyObject *exc = NULL; 4327 4328 if (size == 0) { 4329 if (consumed) 4330 *consumed = 0; 4331 _Py_RETURN_UNICODE_EMPTY(); 4332 } 4333 4334 /* Start off assuming it's all ASCII. Widen later as necessary. 
*/ 4335 _PyUnicodeWriter_Init(&writer); 4336 writer.min_length = size; 4337 4338 shiftOutStart = 0; 4339 e = s + size; 4340 4341 while (s < e) { 4342 Py_UCS4 ch; 4343 restart: 4344 ch = (unsigned char) *s; 4345 4346 if (inShift) { /* in a base-64 section */ 4347 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4348 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4349 base64bits += 6; 4350 s++; 4351 if (base64bits >= 16) { 4352 /* we have enough bits for a UTF-16 value */ 4353 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4354 base64bits -= 16; 4355 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4356 assert(outCh <= 0xffff); 4357 if (surrogate) { 4358 /* expecting a second surrogate */ 4359 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4360 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4361 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0) 4362 goto onError; 4363 surrogate = 0; 4364 continue; 4365 } 4366 else { 4367 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) 4368 goto onError; 4369 surrogate = 0; 4370 } 4371 } 4372 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4373 /* first surrogate */ 4374 surrogate = outCh; 4375 } 4376 else { 4377 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0) 4378 goto onError; 4379 } 4380 } 4381 } 4382 else { /* now leaving a base-64 section */ 4383 inShift = 0; 4384 s++; 4385 if (surrogate) { 4386 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) 4387 goto onError; 4388 surrogate = 0; 4389 } 4390 if (base64bits > 0) { /* left-over bits */ 4391 if (base64bits >= 6) { 4392 /* We've seen at least one base-64 character */ 4393 errmsg = "partial character in shift sequence"; 4394 goto utf7Error; 4395 } 4396 else { 4397 /* Some bits remain; they should be zero */ 4398 if (base64buffer != 0) { 4399 errmsg = "non-zero padding bits in shift sequence"; 4400 goto utf7Error; 4401 } 4402 } 4403 } 4404 if (ch != '-') { 4405 /* '-' is absorbed; other terminating 
4406 characters are preserved */ 4407 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4408 goto onError; 4409 } 4410 } 4411 } 4412 else if ( ch == '+' ) { 4413 startinpos = s-starts; 4414 s++; /* consume '+' */ 4415 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4416 s++; 4417 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0) 4418 goto onError; 4419 } 4420 else { /* begin base64-encoded section */ 4421 inShift = 1; 4422 shiftOutStart = writer.pos; 4423 base64bits = 0; 4424 base64buffer = 0; 4425 } 4426 } 4427 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4428 s++; 4429 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4430 goto onError; 4431 } 4432 else { 4433 startinpos = s-starts; 4434 s++; 4435 errmsg = "unexpected special character"; 4436 goto utf7Error; 4437 } 4438 continue; 4439utf7Error: 4440 endinpos = s-starts; 4441 if (unicode_decode_call_errorhandler_writer( 4442 errors, &errorHandler, 4443 "utf7", errmsg, 4444 &starts, &e, &startinpos, &endinpos, &exc, &s, 4445 &writer)) 4446 goto onError; 4447 } 4448 4449 /* end of string */ 4450 4451 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4452 /* if we're in an inconsistent state, that's an error */ 4453 if (surrogate || 4454 (base64bits >= 6) || 4455 (base64bits > 0 && base64buffer != 0)) { 4456 endinpos = size; 4457 if (unicode_decode_call_errorhandler_writer( 4458 errors, &errorHandler, 4459 "utf7", "unterminated shift sequence", 4460 &starts, &e, &startinpos, &endinpos, &exc, &s, 4461 &writer)) 4462 goto onError; 4463 if (s < e) 4464 goto restart; 4465 } 4466 } 4467 4468 /* return state */ 4469 if (consumed) { 4470 if (inShift) { 4471 *consumed = startinpos; 4472 if (writer.pos != shiftOutStart && writer.maxchar > 127) { 4473 PyObject *result = PyUnicode_FromKindAndData( 4474 writer.kind, writer.data, shiftOutStart); 4475 Py_XDECREF(errorHandler); 4476 Py_XDECREF(exc); 4477 _PyUnicodeWriter_Dealloc(&writer); 4478 return result; 4479 } 4480 
writer.pos = shiftOutStart; /* back off output */ 4481 } 4482 else { 4483 *consumed = s-starts; 4484 } 4485 } 4486 4487 Py_XDECREF(errorHandler); 4488 Py_XDECREF(exc); 4489 return _PyUnicodeWriter_Finish(&writer); 4490 4491 onError: 4492 Py_XDECREF(errorHandler); 4493 Py_XDECREF(exc); 4494 _PyUnicodeWriter_Dealloc(&writer); 4495 return NULL; 4496} 4497 4498 4499PyObject * 4500_PyUnicode_EncodeUTF7(PyObject *str, 4501 int base64SetO, 4502 int base64WhiteSpace, 4503 const char *errors) 4504{ 4505 int kind; 4506 void *data; 4507 Py_ssize_t len; 4508 PyObject *v; 4509 int inShift = 0; 4510 Py_ssize_t i; 4511 unsigned int base64bits = 0; 4512 unsigned long base64buffer = 0; 4513 char * out; 4514 char * start; 4515 4516 if (PyUnicode_READY(str) == -1) 4517 return NULL; 4518 kind = PyUnicode_KIND(str); 4519 data = PyUnicode_DATA(str); 4520 len = PyUnicode_GET_LENGTH(str); 4521 4522 if (len == 0) 4523 return PyBytes_FromStringAndSize(NULL, 0); 4524 4525 /* It might be possible to tighten this worst case */ 4526 if (len > PY_SSIZE_T_MAX / 8) 4527 return PyErr_NoMemory(); 4528 v = PyBytes_FromStringAndSize(NULL, len * 8); 4529 if (v == NULL) 4530 return NULL; 4531 4532 start = out = PyBytes_AS_STRING(v); 4533 for (i = 0; i < len; ++i) { 4534 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4535 4536 if (inShift) { 4537 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4538 /* shifting out */ 4539 if (base64bits) { /* output remaining bits */ 4540 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4541 base64buffer = 0; 4542 base64bits = 0; 4543 } 4544 inShift = 0; 4545 /* Characters not in the BASE64 set implicitly unshift the sequence 4546 so no '-' is required, except if the character is itself a '-' */ 4547 if (IS_BASE64(ch) || ch == '-') { 4548 *out++ = '-'; 4549 } 4550 *out++ = (char) ch; 4551 } 4552 else { 4553 goto encode_char; 4554 } 4555 } 4556 else { /* not in a shift sequence */ 4557 if (ch == '+') { 4558 *out++ = '+'; 4559 *out++ = '-'; 4560 } 4561 else if 
(ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4562 *out++ = (char) ch; 4563 } 4564 else { 4565 *out++ = '+'; 4566 inShift = 1; 4567 goto encode_char; 4568 } 4569 } 4570 continue; 4571encode_char: 4572 if (ch >= 0x10000) { 4573 assert(ch <= MAX_UNICODE); 4574 4575 /* code first surrogate */ 4576 base64bits += 16; 4577 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch); 4578 while (base64bits >= 6) { 4579 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4580 base64bits -= 6; 4581 } 4582 /* prepare second surrogate */ 4583 ch = Py_UNICODE_LOW_SURROGATE(ch); 4584 } 4585 base64bits += 16; 4586 base64buffer = (base64buffer << 16) | ch; 4587 while (base64bits >= 6) { 4588 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4589 base64bits -= 6; 4590 } 4591 } 4592 if (base64bits) 4593 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4594 if (inShift) 4595 *out++ = '-'; 4596 if (_PyBytes_Resize(&v, out - start) < 0) 4597 return NULL; 4598 return v; 4599} 4600PyObject * 4601PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4602 Py_ssize_t size, 4603 int base64SetO, 4604 int base64WhiteSpace, 4605 const char *errors) 4606{ 4607 PyObject *result; 4608 PyObject *tmp = PyUnicode_FromUnicode(s, size); 4609 if (tmp == NULL) 4610 return NULL; 4611 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4612 base64WhiteSpace, errors); 4613 Py_DECREF(tmp); 4614 return result; 4615} 4616 4617#undef IS_BASE64 4618#undef FROM_BASE64 4619#undef TO_BASE64 4620#undef DECODE_DIRECT 4621#undef ENCODE_DIRECT 4622 4623/* --- UTF-8 Codec -------------------------------------------------------- */ 4624 4625PyObject * 4626PyUnicode_DecodeUTF8(const char *s, 4627 Py_ssize_t size, 4628 const char *errors) 4629{ 4630 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4631} 4632 4633#include "stringlib/asciilib.h" 4634#include "stringlib/codecs.h" 4635#include "stringlib/undef.h" 4636 4637#include "stringlib/ucs1lib.h" 4638#include "stringlib/codecs.h" 4639#include 
"stringlib/undef.h" 4640 4641#include "stringlib/ucs2lib.h" 4642#include "stringlib/codecs.h" 4643#include "stringlib/undef.h" 4644 4645#include "stringlib/ucs4lib.h" 4646#include "stringlib/codecs.h" 4647#include "stringlib/undef.h" 4648 4649/* Mask to quickly check whether a C 'long' contains a 4650 non-ASCII, UTF8-encoded char. */ 4651#if (SIZEOF_LONG == 8) 4652# define ASCII_CHAR_MASK 0x8080808080808080UL 4653#elif (SIZEOF_LONG == 4) 4654# define ASCII_CHAR_MASK 0x80808080UL 4655#else 4656# error C 'long' size should be either 4 or 8! 4657#endif 4658 4659static Py_ssize_t 4660ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 4661{ 4662 const char *p = start; 4663 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); 4664 4665 /* 4666 * Issue #17237: m68k is a bit different from most architectures in 4667 * that objects do not use "natural alignment" - for example, int and 4668 * long are only aligned at 2-byte boundaries. Therefore the assert() 4669 * won't work; also, tests have shown that skipping the "optimised 4670 * version" will even speed up m68k. 4671 */ 4672#if !defined(__m68k__) 4673#if SIZEOF_LONG <= SIZEOF_VOID_P 4674 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); 4675 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4676 /* Fast path, see in STRINGLIB(utf8_decode) for 4677 an explanation. */ 4678 /* Help allocation */ 4679 const char *_p = p; 4680 Py_UCS1 * q = dest; 4681 while (_p < aligned_end) { 4682 unsigned long value = *(const unsigned long *) _p; 4683 if (value & ASCII_CHAR_MASK) 4684 break; 4685 *((unsigned long *)q) = value; 4686 _p += SIZEOF_LONG; 4687 q += SIZEOF_LONG; 4688 } 4689 p = _p; 4690 while (p < end) { 4691 if ((unsigned char)*p & 0x80) 4692 break; 4693 *q++ = *p++; 4694 } 4695 return p - start; 4696 } 4697#endif 4698#endif 4699 while (p < end) { 4700 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4701 for an explanation. 
*/ 4702 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4703 /* Help allocation */ 4704 const char *_p = p; 4705 while (_p < aligned_end) { 4706 unsigned long value = *(unsigned long *) _p; 4707 if (value & ASCII_CHAR_MASK) 4708 break; 4709 _p += SIZEOF_LONG; 4710 } 4711 p = _p; 4712 if (_p == end) 4713 break; 4714 } 4715 if ((unsigned char)*p & 0x80) 4716 break; 4717 ++p; 4718 } 4719 memcpy(dest, start, p - start); 4720 return p - start; 4721} 4722 4723PyObject * 4724PyUnicode_DecodeUTF8Stateful(const char *s, 4725 Py_ssize_t size, 4726 const char *errors, 4727 Py_ssize_t *consumed) 4728{ 4729 _PyUnicodeWriter writer; 4730 const char *starts = s; 4731 const char *end = s + size; 4732 4733 Py_ssize_t startinpos; 4734 Py_ssize_t endinpos; 4735 const char *errmsg = ""; 4736 PyObject *errorHandler = NULL; 4737 PyObject *exc = NULL; 4738 4739 if (size == 0) { 4740 if (consumed) 4741 *consumed = 0; 4742 _Py_RETURN_UNICODE_EMPTY(); 4743 } 4744 4745 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4746 if (size == 1 && (unsigned char)s[0] < 128) { 4747 if (consumed) 4748 *consumed = 1; 4749 return get_latin1_char((unsigned char)s[0]); 4750 } 4751 4752 _PyUnicodeWriter_Init(&writer); 4753 writer.min_length = size; 4754 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 4755 goto onError; 4756 4757 writer.pos = ascii_decode(s, end, writer.data); 4758 s += writer.pos; 4759 while (s < end) { 4760 Py_UCS4 ch; 4761 int kind = writer.kind; 4762 if (kind == PyUnicode_1BYTE_KIND) { 4763 if (PyUnicode_IS_ASCII(writer.buffer)) 4764 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); 4765 else 4766 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos); 4767 } else if (kind == PyUnicode_2BYTE_KIND) { 4768 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos); 4769 } else { 4770 assert(kind == PyUnicode_4BYTE_KIND); 4771 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos); 4772 } 4773 4774 switch (ch) { 4775 case 0: 4776 if (s == end || consumed) 4777 goto End; 4778 errmsg = "unexpected end of data"; 4779 startinpos = s - starts; 4780 endinpos = end - starts; 4781 break; 4782 case 1: 4783 errmsg = "invalid start byte"; 4784 startinpos = s - starts; 4785 endinpos = startinpos + 1; 4786 break; 4787 case 2: 4788 case 3: 4789 case 4: 4790 errmsg = "invalid continuation byte"; 4791 startinpos = s - starts; 4792 endinpos = startinpos + ch - 1; 4793 break; 4794 default: 4795 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4796 goto onError; 4797 continue; 4798 } 4799 4800 if (unicode_decode_call_errorhandler_writer( 4801 errors, &errorHandler, 4802 "utf-8", errmsg, 4803 &starts, &end, &startinpos, &endinpos, &exc, &s, 4804 &writer)) 4805 goto onError; 4806 } 4807 4808End: 4809 if (consumed) 4810 *consumed = s - starts; 4811 4812 Py_XDECREF(errorHandler); 4813 Py_XDECREF(exc); 4814 return _PyUnicodeWriter_Finish(&writer); 4815 4816onError: 4817 Py_XDECREF(errorHandler); 4818 Py_XDECREF(exc); 4819 
_PyUnicodeWriter_Dealloc(&writer); 4820 return NULL; 4821} 4822 4823#ifdef __APPLE__ 4824 4825/* Simplified UTF-8 decoder using surrogateescape error handler, 4826 used to decode the command line arguments on Mac OS X. 4827 4828 Return a pointer to a newly allocated wide character string (use 4829 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */ 4830 4831wchar_t* 4832_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4833{ 4834 const char *e; 4835 wchar_t *unicode; 4836 Py_ssize_t outpos; 4837 4838 /* Note: size will always be longer than the resulting Unicode 4839 character count */ 4840 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) 4841 return NULL; 4842 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); 4843 if (!unicode) 4844 return NULL; 4845 4846 /* Unpack UTF-8 encoded data */ 4847 e = s + size; 4848 outpos = 0; 4849 while (s < e) { 4850 Py_UCS4 ch; 4851#if SIZEOF_WCHAR_T == 4 4852 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4853#else 4854 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 4855#endif 4856 if (ch > 0xFF) { 4857#if SIZEOF_WCHAR_T == 4 4858 assert(0); 4859#else 4860 assert(Py_UNICODE_IS_SURROGATE(ch)); 4861 /* compute and append the two surrogates: */ 4862 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 4863 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 4864#endif 4865 } 4866 else { 4867 if (!ch && s == e) 4868 break; 4869 /* surrogateescape */ 4870 unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 4871 } 4872 } 4873 unicode[outpos] = L'\0'; 4874 return unicode; 4875} 4876 4877#endif /* __APPLE__ */ 4878 4879/* Primary internal function which creates utf8 encoded bytes objects. 4880 4881 Allocation strategy: if the string is short, convert into a stack buffer 4882 and allocate exactly as much space needed at the end. 
Else allocate the 4883 maximum possible needed (4 result bytes per Unicode character), and return 4884 the excess memory at the end. 4885*/ 4886PyObject * 4887_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4888{ 4889 enum PyUnicode_Kind kind; 4890 void *data; 4891 Py_ssize_t size; 4892 4893 if (!PyUnicode_Check(unicode)) { 4894 PyErr_BadArgument(); 4895 return NULL; 4896 } 4897 4898 if (PyUnicode_READY(unicode) == -1) 4899 return NULL; 4900 4901 if (PyUnicode_UTF8(unicode)) 4902 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4903 PyUnicode_UTF8_LENGTH(unicode)); 4904 4905 kind = PyUnicode_KIND(unicode); 4906 data = PyUnicode_DATA(unicode); 4907 size = PyUnicode_GET_LENGTH(unicode); 4908 4909 switch (kind) { 4910 default: 4911 assert(0); 4912 case PyUnicode_1BYTE_KIND: 4913 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4914 assert(!PyUnicode_IS_ASCII(unicode)); 4915 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4916 case PyUnicode_2BYTE_KIND: 4917 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4918 case PyUnicode_4BYTE_KIND: 4919 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4920 } 4921} 4922 4923PyObject * 4924PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4925 Py_ssize_t size, 4926 const char *errors) 4927{ 4928 PyObject *v, *unicode; 4929 4930 unicode = PyUnicode_FromUnicode(s, size); 4931 if (unicode == NULL) 4932 return NULL; 4933 v = _PyUnicode_AsUTF8String(unicode, errors); 4934 Py_DECREF(unicode); 4935 return v; 4936} 4937 4938PyObject * 4939PyUnicode_AsUTF8String(PyObject *unicode) 4940{ 4941 return _PyUnicode_AsUTF8String(unicode, NULL); 4942} 4943 4944/* --- UTF-32 Codec ------------------------------------------------------- */ 4945 4946PyObject * 4947PyUnicode_DecodeUTF32(const char *s, 4948 Py_ssize_t size, 4949 const char *errors, 4950 int *byteorder) 4951{ 4952 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4953} 4954 4955PyObject * 
4956PyUnicode_DecodeUTF32Stateful(const char *s, 4957 Py_ssize_t size, 4958 const char *errors, 4959 int *byteorder, 4960 Py_ssize_t *consumed) 4961{ 4962 const char *starts = s; 4963 Py_ssize_t startinpos; 4964 Py_ssize_t endinpos; 4965 _PyUnicodeWriter writer; 4966 const unsigned char *q, *e; 4967 int le, bo = 0; /* assume native ordering by default */ 4968 const char *encoding; 4969 const char *errmsg = ""; 4970 PyObject *errorHandler = NULL; 4971 PyObject *exc = NULL; 4972 4973 q = (unsigned char *)s; 4974 e = q + size; 4975 4976 if (byteorder) 4977 bo = *byteorder; 4978 4979 /* Check for BOM marks (U+FEFF) in the input and adjust current 4980 byte order setting accordingly. In native mode, the leading BOM 4981 mark is skipped, in all other modes, it is copied to the output 4982 stream as-is (giving a ZWNBSP character). */ 4983 if (bo == 0 && size >= 4) { 4984 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 4985 if (bom == 0x0000FEFF) { 4986 bo = -1; 4987 q += 4; 4988 } 4989 else if (bom == 0xFFFE0000) { 4990 bo = 1; 4991 q += 4; 4992 } 4993 if (byteorder) 4994 *byteorder = bo; 4995 } 4996 4997 if (q == e) { 4998 if (consumed) 4999 *consumed = size; 5000 _Py_RETURN_UNICODE_EMPTY(); 5001 } 5002 5003#ifdef WORDS_BIGENDIAN 5004 le = bo < 0; 5005#else 5006 le = bo <= 0; 5007#endif 5008 encoding = le ? 
"utf-32-le" : "utf-32-be"; 5009 5010 _PyUnicodeWriter_Init(&writer); 5011 writer.min_length = (e - q + 3) / 4; 5012 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5013 goto onError; 5014 5015 while (1) { 5016 Py_UCS4 ch = 0; 5017 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 5018 5019 if (e - q >= 4) { 5020 enum PyUnicode_Kind kind = writer.kind; 5021 void *data = writer.data; 5022 const unsigned char *last = e - 4; 5023 Py_ssize_t pos = writer.pos; 5024 if (le) { 5025 do { 5026 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5027 if (ch > maxch) 5028 break; 5029 if (kind != PyUnicode_1BYTE_KIND && 5030 Py_UNICODE_IS_SURROGATE(ch)) 5031 break; 5032 PyUnicode_WRITE(kind, data, pos++, ch); 5033 q += 4; 5034 } while (q <= last); 5035 } 5036 else { 5037 do { 5038 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 5039 if (ch > maxch) 5040 break; 5041 if (kind != PyUnicode_1BYTE_KIND && 5042 Py_UNICODE_IS_SURROGATE(ch)) 5043 break; 5044 PyUnicode_WRITE(kind, data, pos++, ch); 5045 q += 4; 5046 } while (q <= last); 5047 } 5048 writer.pos = pos; 5049 } 5050 5051 if (Py_UNICODE_IS_SURROGATE(ch)) { 5052 errmsg = "code point in surrogate code point range(0xd800, 0xe000)"; 5053 startinpos = ((const char *)q) - starts; 5054 endinpos = startinpos + 4; 5055 } 5056 else if (ch <= maxch) { 5057 if (q == e || consumed) 5058 break; 5059 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5060 errmsg = "truncated data"; 5061 startinpos = ((const char *)q) - starts; 5062 endinpos = ((const char *)e) - starts; 5063 } 5064 else { 5065 if (ch < 0x110000) { 5066 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5067 goto onError; 5068 q += 4; 5069 continue; 5070 } 5071 errmsg = "code point not in range(0x110000)"; 5072 startinpos = ((const char *)q) - starts; 5073 endinpos = startinpos + 4; 5074 } 5075 5076 /* The remaining input chars are ignored if the callback 5077 chooses to skip the input */ 5078 if (unicode_decode_call_errorhandler_writer( 5079 errors, &errorHandler, 5080 encoding, errmsg, 5081 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5082 &writer)) 5083 goto onError; 5084 } 5085 5086 if (consumed) 5087 *consumed = (const char *)q-starts; 5088 5089 Py_XDECREF(errorHandler); 5090 Py_XDECREF(exc); 5091 return _PyUnicodeWriter_Finish(&writer); 5092 5093 onError: 5094 _PyUnicodeWriter_Dealloc(&writer); 5095 Py_XDECREF(errorHandler); 5096 Py_XDECREF(exc); 5097 return NULL; 5098} 5099 5100PyObject * 5101_PyUnicode_EncodeUTF32(PyObject *str, 5102 const char *errors, 5103 int byteorder) 5104{ 5105 int kind; 5106 void *data; 5107 Py_ssize_t len; 5108 PyObject *v; 5109 unsigned char *p; 5110 Py_ssize_t nsize, i; 5111 /* Offsets from p for storing byte pairs in the right order. 
*/ 5112#if PY_LITTLE_ENDIAN 5113 int iorder[] = {0, 1, 2, 3}; 5114#else 5115 int iorder[] = {3, 2, 1, 0}; 5116#endif 5117 const char *encoding; 5118 PyObject *errorHandler = NULL; 5119 PyObject *exc = NULL; 5120 PyObject *rep = NULL; 5121 5122#define STORECHAR(CH) \ 5123 do { \ 5124 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5125 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5126 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5127 p[iorder[0]] = (CH) & 0xff; \ 5128 p += 4; \ 5129 } while(0) 5130 5131 if (!PyUnicode_Check(str)) { 5132 PyErr_BadArgument(); 5133 return NULL; 5134 } 5135 if (PyUnicode_READY(str) == -1) 5136 return NULL; 5137 kind = PyUnicode_KIND(str); 5138 data = PyUnicode_DATA(str); 5139 len = PyUnicode_GET_LENGTH(str); 5140 5141 nsize = len + (byteorder == 0); 5142 if (nsize > PY_SSIZE_T_MAX / 4) 5143 return PyErr_NoMemory(); 5144 v = PyBytes_FromStringAndSize(NULL, nsize * 4); 5145 if (v == NULL) 5146 return NULL; 5147 5148 p = (unsigned char *)PyBytes_AS_STRING(v); 5149 if (byteorder == 0) 5150 STORECHAR(0xFEFF); 5151 if (len == 0) 5152 return v; 5153 5154 if (byteorder == -1) { 5155 /* force LE */ 5156 iorder[0] = 0; 5157 iorder[1] = 1; 5158 iorder[2] = 2; 5159 iorder[3] = 3; 5160 encoding = "utf-32-le"; 5161 } 5162 else if (byteorder == 1) { 5163 /* force BE */ 5164 iorder[0] = 3; 5165 iorder[1] = 2; 5166 iorder[2] = 1; 5167 iorder[3] = 0; 5168 encoding = "utf-32-be"; 5169 } 5170 else 5171 encoding = "utf-32"; 5172 5173 if (kind == PyUnicode_1BYTE_KIND) { 5174 for (i = 0; i < len; i++) 5175 STORECHAR(PyUnicode_READ(kind, data, i)); 5176 return v; 5177 } 5178 5179 for (i = 0; i < len;) { 5180 Py_ssize_t repsize, moreunits; 5181 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5182 i++; 5183 assert(ch <= MAX_UNICODE); 5184 if (!Py_UNICODE_IS_SURROGATE(ch)) { 5185 STORECHAR(ch); 5186 continue; 5187 } 5188 5189 rep = unicode_encode_call_errorhandler( 5190 errors, &errorHandler, 5191 encoding, "surrogates not allowed", 5192 str, &exc, i-1, i, &i); 5193 5194 if (!rep) 5195 goto 
error; 5196 5197 if (PyBytes_Check(rep)) { 5198 repsize = PyBytes_GET_SIZE(rep); 5199 if (repsize & 3) { 5200 raise_encode_exception(&exc, encoding, 5201 str, i - 1, i, 5202 "surrogates not allowed"); 5203 goto error; 5204 } 5205 moreunits = repsize / 4; 5206 } 5207 else { 5208 assert(PyUnicode_Check(rep)); 5209 if (PyUnicode_READY(rep) < 0) 5210 goto error; 5211 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5212 if (!PyUnicode_IS_ASCII(rep)) { 5213 raise_encode_exception(&exc, encoding, 5214 str, i - 1, i, 5215 "surrogates not allowed"); 5216 goto error; 5217 } 5218 } 5219 5220 /* four bytes are reserved for each surrogate */ 5221 if (moreunits > 1) { 5222 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v); 5223 Py_ssize_t morebytes = 4 * (moreunits - 1); 5224 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5225 /* integer overflow */ 5226 PyErr_NoMemory(); 5227 goto error; 5228 } 5229 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5230 goto error; 5231 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos; 5232 } 5233 5234 if (PyBytes_Check(rep)) { 5235 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize); 5236 p += repsize; 5237 } else /* rep is unicode */ { 5238 const Py_UCS1 *repdata; 5239 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5240 repdata = PyUnicode_1BYTE_DATA(rep); 5241 while (repsize--) { 5242 Py_UCS4 ch = *repdata++; 5243 STORECHAR(ch); 5244 } 5245 } 5246 5247 Py_CLEAR(rep); 5248 } 5249 5250 /* Cut back to size actually needed. This is necessary for, for example, 5251 encoding of a string containing isolated surrogates and the 'ignore' 5252 handler is used. 
*/ 5253 nsize = p - (unsigned char*) PyBytes_AS_STRING(v); 5254 if (nsize != PyBytes_GET_SIZE(v)) 5255 _PyBytes_Resize(&v, nsize); 5256 Py_XDECREF(errorHandler); 5257 Py_XDECREF(exc); 5258 return v; 5259 error: 5260 Py_XDECREF(rep); 5261 Py_XDECREF(errorHandler); 5262 Py_XDECREF(exc); 5263 Py_XDECREF(v); 5264 return NULL; 5265#undef STORECHAR 5266} 5267 5268PyObject * 5269PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5270 Py_ssize_t size, 5271 const char *errors, 5272 int byteorder) 5273{ 5274 PyObject *result; 5275 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5276 if (tmp == NULL) 5277 return NULL; 5278 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5279 Py_DECREF(tmp); 5280 return result; 5281} 5282 5283PyObject * 5284PyUnicode_AsUTF32String(PyObject *unicode) 5285{ 5286 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5287} 5288 5289/* --- UTF-16 Codec ------------------------------------------------------- */ 5290 5291PyObject * 5292PyUnicode_DecodeUTF16(const char *s, 5293 Py_ssize_t size, 5294 const char *errors, 5295 int *byteorder) 5296{ 5297 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5298} 5299 5300PyObject * 5301PyUnicode_DecodeUTF16Stateful(const char *s, 5302 Py_ssize_t size, 5303 const char *errors, 5304 int *byteorder, 5305 Py_ssize_t *consumed) 5306{ 5307 const char *starts = s; 5308 Py_ssize_t startinpos; 5309 Py_ssize_t endinpos; 5310 _PyUnicodeWriter writer; 5311 const unsigned char *q, *e; 5312 int bo = 0; /* assume native ordering by default */ 5313 int native_ordering; 5314 const char *errmsg = ""; 5315 PyObject *errorHandler = NULL; 5316 PyObject *exc = NULL; 5317 const char *encoding; 5318 5319 q = (unsigned char *)s; 5320 e = q + size; 5321 5322 if (byteorder) 5323 bo = *byteorder; 5324 5325 /* Check for BOM marks (U+FEFF) in the input and adjust current 5326 byte order setting accordingly. 
In native mode, the leading BOM 5327 mark is skipped, in all other modes, it is copied to the output 5328 stream as-is (giving a ZWNBSP character). */ 5329 if (bo == 0 && size >= 2) { 5330 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5331 if (bom == 0xFEFF) { 5332 q += 2; 5333 bo = -1; 5334 } 5335 else if (bom == 0xFFFE) { 5336 q += 2; 5337 bo = 1; 5338 } 5339 if (byteorder) 5340 *byteorder = bo; 5341 } 5342 5343 if (q == e) { 5344 if (consumed) 5345 *consumed = size; 5346 _Py_RETURN_UNICODE_EMPTY(); 5347 } 5348 5349#if PY_LITTLE_ENDIAN 5350 native_ordering = bo <= 0; 5351 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; 5352#else 5353 native_ordering = bo >= 0; 5354 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le"; 5355#endif 5356 5357 /* Note: size will always be longer than the resulting Unicode 5358 character count */ 5359 _PyUnicodeWriter_Init(&writer); 5360 writer.min_length = (e - q + 1) / 2; 5361 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5362 goto onError; 5363 5364 while (1) { 5365 Py_UCS4 ch = 0; 5366 if (e - q >= 2) { 5367 int kind = writer.kind; 5368 if (kind == PyUnicode_1BYTE_KIND) { 5369 if (PyUnicode_IS_ASCII(writer.buffer)) 5370 ch = asciilib_utf16_decode(&q, e, 5371 (Py_UCS1*)writer.data, &writer.pos, 5372 native_ordering); 5373 else 5374 ch = ucs1lib_utf16_decode(&q, e, 5375 (Py_UCS1*)writer.data, &writer.pos, 5376 native_ordering); 5377 } else if (kind == PyUnicode_2BYTE_KIND) { 5378 ch = ucs2lib_utf16_decode(&q, e, 5379 (Py_UCS2*)writer.data, &writer.pos, 5380 native_ordering); 5381 } else { 5382 assert(kind == PyUnicode_4BYTE_KIND); 5383 ch = ucs4lib_utf16_decode(&q, e, 5384 (Py_UCS4*)writer.data, &writer.pos, 5385 native_ordering); 5386 } 5387 } 5388 5389 switch (ch) 5390 { 5391 case 0: 5392 /* remaining byte at the end? 
(size should be even) */ 5393 if (q == e || consumed) 5394 goto End; 5395 errmsg = "truncated data"; 5396 startinpos = ((const char *)q) - starts; 5397 endinpos = ((const char *)e) - starts; 5398 break; 5399 /* The remaining input chars are ignored if the callback 5400 chooses to skip the input */ 5401 case 1: 5402 q -= 2; 5403 if (consumed) 5404 goto End; 5405 errmsg = "unexpected end of data"; 5406 startinpos = ((const char *)q) - starts; 5407 endinpos = ((const char *)e) - starts; 5408 break; 5409 case 2: 5410 errmsg = "illegal encoding"; 5411 startinpos = ((const char *)q) - 2 - starts; 5412 endinpos = startinpos + 2; 5413 break; 5414 case 3: 5415 errmsg = "illegal UTF-16 surrogate"; 5416 startinpos = ((const char *)q) - 4 - starts; 5417 endinpos = startinpos + 2; 5418 break; 5419 default: 5420 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5421 goto onError; 5422 continue; 5423 } 5424 5425 if (unicode_decode_call_errorhandler_writer( 5426 errors, 5427 &errorHandler, 5428 encoding, errmsg, 5429 &starts, 5430 (const char **)&e, 5431 &startinpos, 5432 &endinpos, 5433 &exc, 5434 (const char **)&q, 5435 &writer)) 5436 goto onError; 5437 } 5438 5439End: 5440 if (consumed) 5441 *consumed = (const char *)q-starts; 5442 5443 Py_XDECREF(errorHandler); 5444 Py_XDECREF(exc); 5445 return _PyUnicodeWriter_Finish(&writer); 5446 5447 onError: 5448 _PyUnicodeWriter_Dealloc(&writer); 5449 Py_XDECREF(errorHandler); 5450 Py_XDECREF(exc); 5451 return NULL; 5452} 5453 5454PyObject * 5455_PyUnicode_EncodeUTF16(PyObject *str, 5456 const char *errors, 5457 int byteorder) 5458{ 5459 enum PyUnicode_Kind kind; 5460 const void *data; 5461 Py_ssize_t len; 5462 PyObject *v; 5463 unsigned short *out; 5464 Py_ssize_t pairs; 5465#if PY_BIG_ENDIAN 5466 int native_ordering = byteorder >= 0; 5467#else 5468 int native_ordering = byteorder <= 0; 5469#endif 5470 const char *encoding; 5471 Py_ssize_t nsize, pos; 5472 PyObject *errorHandler = NULL; 5473 PyObject *exc = NULL; 5474 PyObject *rep 
= NULL; 5475 5476 if (!PyUnicode_Check(str)) { 5477 PyErr_BadArgument(); 5478 return NULL; 5479 } 5480 if (PyUnicode_READY(str) == -1) 5481 return NULL; 5482 kind = PyUnicode_KIND(str); 5483 data = PyUnicode_DATA(str); 5484 len = PyUnicode_GET_LENGTH(str); 5485 5486 pairs = 0; 5487 if (kind == PyUnicode_4BYTE_KIND) { 5488 const Py_UCS4 *in = (const Py_UCS4 *)data; 5489 const Py_UCS4 *end = in + len; 5490 while (in < end) 5491 if (*in++ >= 0x10000) 5492 pairs++; 5493 } 5494 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5495 return PyErr_NoMemory(); 5496 nsize = len + pairs + (byteorder == 0); 5497 v = PyBytes_FromStringAndSize(NULL, nsize * 2); 5498 if (v == NULL) 5499 return NULL; 5500 5501 /* output buffer is 2-bytes aligned */ 5502 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5503 out = (unsigned short *)PyBytes_AS_STRING(v); 5504 if (byteorder == 0) 5505 *out++ = 0xFEFF; 5506 if (len == 0) 5507 goto done; 5508 5509 if (kind == PyUnicode_1BYTE_KIND) { 5510 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5511 goto done; 5512 } 5513 5514 if (byteorder < 0) 5515 encoding = "utf-16-le"; 5516 else if (byteorder > 0) 5517 encoding = "utf-16-be"; 5518 else 5519 encoding = "utf-16"; 5520 5521 pos = 0; 5522 while (pos < len) { 5523 Py_ssize_t repsize, moreunits; 5524 5525 if (kind == PyUnicode_2BYTE_KIND) { 5526 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, 5527 &out, native_ordering); 5528 } 5529 else { 5530 assert(kind == PyUnicode_4BYTE_KIND); 5531 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, 5532 &out, native_ordering); 5533 } 5534 if (pos == len) 5535 break; 5536 5537 rep = unicode_encode_call_errorhandler( 5538 errors, &errorHandler, 5539 encoding, "surrogates not allowed", 5540 str, &exc, pos, pos + 1, &pos); 5541 if (!rep) 5542 goto error; 5543 5544 if (PyBytes_Check(rep)) { 5545 repsize = PyBytes_GET_SIZE(rep); 5546 if (repsize & 1) { 5547 raise_encode_exception(&exc, encoding, 
5548 str, pos - 1, pos, 5549 "surrogates not allowed"); 5550 goto error; 5551 } 5552 moreunits = repsize / 2; 5553 } 5554 else { 5555 assert(PyUnicode_Check(rep)); 5556 if (PyUnicode_READY(rep) < 0) 5557 goto error; 5558 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5559 if (!PyUnicode_IS_ASCII(rep)) { 5560 raise_encode_exception(&exc, encoding, 5561 str, pos - 1, pos, 5562 "surrogates not allowed"); 5563 goto error; 5564 } 5565 } 5566 5567 /* two bytes are reserved for each surrogate */ 5568 if (moreunits > 1) { 5569 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); 5570 Py_ssize_t morebytes = 2 * (moreunits - 1); 5571 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5572 /* integer overflow */ 5573 PyErr_NoMemory(); 5574 goto error; 5575 } 5576 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5577 goto error; 5578 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; 5579 } 5580 5581 if (PyBytes_Check(rep)) { 5582 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); 5583 out += moreunits; 5584 } else /* rep is unicode */ { 5585 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5586 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5587 &out, native_ordering); 5588 } 5589 5590 Py_CLEAR(rep); 5591 } 5592 5593 /* Cut back to size actually needed. This is necessary for, for example, 5594 encoding of a string containing isolated surrogates and the 'ignore' handler 5595 is used. 
*/ 5596 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5597 if (nsize != PyBytes_GET_SIZE(v)) 5598 _PyBytes_Resize(&v, nsize); 5599 Py_XDECREF(errorHandler); 5600 Py_XDECREF(exc); 5601 done: 5602 return v; 5603 error: 5604 Py_XDECREF(rep); 5605 Py_XDECREF(errorHandler); 5606 Py_XDECREF(exc); 5607 Py_XDECREF(v); 5608 return NULL; 5609#undef STORECHAR 5610} 5611 5612PyObject * 5613PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5614 Py_ssize_t size, 5615 const char *errors, 5616 int byteorder) 5617{ 5618 PyObject *result; 5619 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5620 if (tmp == NULL) 5621 return NULL; 5622 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5623 Py_DECREF(tmp); 5624 return result; 5625} 5626 5627PyObject * 5628PyUnicode_AsUTF16String(PyObject *unicode) 5629{ 5630 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5631} 5632 5633/* --- Unicode Escape Codec ----------------------------------------------- */ 5634 5635/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5636 if all the escapes in the string make it still a valid ASCII string. 5637 Returns -1 if any escapes were found which cause the string to 5638 pop out of ASCII range. Otherwise returns the length of the 5639 required buffer to hold the string. 5640 */ 5641static Py_ssize_t 5642length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5643{ 5644 const unsigned char *p = (const unsigned char *)s; 5645 const unsigned char *end = p + size; 5646 Py_ssize_t length = 0; 5647 5648 if (size < 0) 5649 return -1; 5650 5651 for (; p < end; ++p) { 5652 if (*p > 127) { 5653 /* Non-ASCII */ 5654 return -1; 5655 } 5656 else if (*p != '\\') { 5657 /* Normal character */ 5658 ++length; 5659 } 5660 else { 5661 /* Backslash-escape, check next char */ 5662 ++p; 5663 /* Escape sequence reaches till end of string or 5664 non-ASCII follow-up. 
*/ 5665 if (p >= end || *p > 127) 5666 return -1; 5667 switch (*p) { 5668 case '\n': 5669 /* backslash + \n result in zero characters */ 5670 break; 5671 case '\\': case '\'': case '\"': 5672 case 'b': case 'f': case 't': 5673 case 'n': case 'r': case 'v': case 'a': 5674 ++length; 5675 break; 5676 case '0': case '1': case '2': case '3': 5677 case '4': case '5': case '6': case '7': 5678 case 'x': case 'u': case 'U': case 'N': 5679 /* these do not guarantee ASCII characters */ 5680 return -1; 5681 default: 5682 /* count the backslash + the other character */ 5683 length += 2; 5684 } 5685 } 5686 } 5687 return length; 5688} 5689 5690static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5691 5692PyObject * 5693PyUnicode_DecodeUnicodeEscape(const char *s, 5694 Py_ssize_t size, 5695 const char *errors) 5696{ 5697 const char *starts = s; 5698 Py_ssize_t startinpos; 5699 Py_ssize_t endinpos; 5700 _PyUnicodeWriter writer; 5701 const char *end; 5702 char* message; 5703 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5704 PyObject *errorHandler = NULL; 5705 PyObject *exc = NULL; 5706 Py_ssize_t len; 5707 5708 len = length_of_escaped_ascii_string(s, size); 5709 if (len == 0) 5710 _Py_RETURN_UNICODE_EMPTY(); 5711 5712 /* After length_of_escaped_ascii_string() there are two alternatives, 5713 either the string is pure ASCII with named escapes like \n, etc. 5714 and we determined it's exact size (common case) 5715 or it contains \x, \u, ... escape sequences. then we create a 5716 legacy wchar string and resize it at the end of this function. */ 5717 _PyUnicodeWriter_Init(&writer); 5718 if (len > 0) { 5719 writer.min_length = len; 5720 } 5721 else { 5722 /* Escaped strings will always be longer than the resulting 5723 Unicode string, so we start with size here and then reduce the 5724 length after conversion to the true value. 
5725 (but if the error callback returns a long replacement string 5726 we'll have to allocate more space) */ 5727 writer.min_length = size; 5728 } 5729 5730 if (size == 0) 5731 return _PyUnicodeWriter_Finish(&writer); 5732 end = s + size; 5733 5734 while (s < end) { 5735 unsigned char c; 5736 Py_UCS4 x; 5737 int digits; 5738 5739 /* Non-escape characters are interpreted as Unicode ordinals */ 5740 if (*s != '\\') { 5741 x = (unsigned char)*s; 5742 s++; 5743 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 5744 goto onError; 5745 continue; 5746 } 5747 5748 startinpos = s-starts; 5749 /* \ - Escapes */ 5750 s++; 5751 c = *s++; 5752 if (s > end) 5753 c = '\0'; /* Invalid after \ */ 5754 5755 switch (c) { 5756 5757 /* \x escapes */ 5758#define WRITECHAR(ch) \ 5759 do { \ 5760 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \ 5761 goto onError; \ 5762 } while(0) 5763 5764 case '\n': break; 5765 case '\\': WRITECHAR('\\'); break; 5766 case '\'': WRITECHAR('\''); break; 5767 case '\"': WRITECHAR('\"'); break; 5768 case 'b': WRITECHAR('\b'); break; 5769 /* FF */ 5770 case 'f': WRITECHAR('\014'); break; 5771 case 't': WRITECHAR('\t'); break; 5772 case 'n': WRITECHAR('\n'); break; 5773 case 'r': WRITECHAR('\r'); break; 5774 /* VT */ 5775 case 'v': WRITECHAR('\013'); break; 5776 /* BEL, not classic C */ 5777 case 'a': WRITECHAR('\007'); break; 5778 5779 /* \OOO (octal) escapes */ 5780 case '0': case '1': case '2': case '3': 5781 case '4': case '5': case '6': case '7': 5782 x = s[-1] - '0'; 5783 if (s < end && '0' <= *s && *s <= '7') { 5784 x = (x<<3) + *s++ - '0'; 5785 if (s < end && '0' <= *s && *s <= '7') 5786 x = (x<<3) + *s++ - '0'; 5787 } 5788 WRITECHAR(x); 5789 break; 5790 5791 /* hex escapes */ 5792 /* \xXX */ 5793 case 'x': 5794 digits = 2; 5795 message = "truncated \\xXX escape"; 5796 goto hexescape; 5797 5798 /* \uXXXX */ 5799 case 'u': 5800 digits = 4; 5801 message = "truncated \\uXXXX escape"; 5802 goto hexescape; 5803 5804 /* \UXXXXXXXX */ 5805 case 
'U': 5806 digits = 8; 5807 message = "truncated \\UXXXXXXXX escape"; 5808 hexescape: 5809 chr = 0; 5810 if (end - s < digits) { 5811 /* count only hex digits */ 5812 for (; s < end; ++s) { 5813 c = (unsigned char)*s; 5814 if (!Py_ISXDIGIT(c)) 5815 goto error; 5816 } 5817 goto error; 5818 } 5819 for (; digits--; ++s) { 5820 c = (unsigned char)*s; 5821 if (!Py_ISXDIGIT(c)) 5822 goto error; 5823 chr = (chr<<4) & ~0xF; 5824 if (c >= '0' && c <= '9') 5825 chr += c - '0'; 5826 else if (c >= 'a' && c <= 'f') 5827 chr += 10 + c - 'a'; 5828 else 5829 chr += 10 + c - 'A'; 5830 } 5831 if (chr == 0xffffffff && PyErr_Occurred()) 5832 /* _decoding_error will have already written into the 5833 target buffer. */ 5834 break; 5835 store: 5836 /* when we get here, chr is a 32-bit unicode character */ 5837 message = "illegal Unicode character"; 5838 if (chr > MAX_UNICODE) 5839 goto error; 5840 WRITECHAR(chr); 5841 break; 5842 5843 /* \N{name} */ 5844 case 'N': 5845 message = "malformed \\N character escape"; 5846 if (ucnhash_CAPI == NULL) { 5847 /* load the unicode data module */ 5848 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5849 PyUnicodeData_CAPSULE_NAME, 1); 5850 if (ucnhash_CAPI == NULL) 5851 goto ucnhashError; 5852 } 5853 if (*s == '{') { 5854 const char *start = s+1; 5855 /* look for the closing brace */ 5856 while (*s != '}' && s < end) 5857 s++; 5858 if (s > start && s < end && *s == '}') { 5859 /* found a name. 
look it up in the unicode database */ 5860 message = "unknown Unicode character name"; 5861 s++; 5862 if (s - start - 1 <= INT_MAX && 5863 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5864 &chr, 0)) 5865 goto store; 5866 } 5867 } 5868 goto error; 5869 5870 default: 5871 if (s > end) { 5872 message = "\\ at end of string"; 5873 s--; 5874 goto error; 5875 } 5876 else { 5877 WRITECHAR('\\'); 5878 WRITECHAR((unsigned char)s[-1]); 5879 } 5880 break; 5881 } 5882 continue; 5883 5884 error: 5885 endinpos = s-starts; 5886 if (unicode_decode_call_errorhandler_writer( 5887 errors, &errorHandler, 5888 "unicodeescape", message, 5889 &starts, &end, &startinpos, &endinpos, &exc, &s, 5890 &writer)) 5891 goto onError; 5892 continue; 5893 } 5894#undef WRITECHAR 5895 5896 Py_XDECREF(errorHandler); 5897 Py_XDECREF(exc); 5898 return _PyUnicodeWriter_Finish(&writer); 5899 5900 ucnhashError: 5901 PyErr_SetString( 5902 PyExc_UnicodeError, 5903 "\\N escapes not supported (can't load unicodedata module)" 5904 ); 5905 _PyUnicodeWriter_Dealloc(&writer); 5906 Py_XDECREF(errorHandler); 5907 Py_XDECREF(exc); 5908 return NULL; 5909 5910 onError: 5911 _PyUnicodeWriter_Dealloc(&writer); 5912 Py_XDECREF(errorHandler); 5913 Py_XDECREF(exc); 5914 return NULL; 5915} 5916 5917/* Return a Unicode-Escape string version of the Unicode object. 5918 5919 If quotes is true, the string is enclosed in u"" or u'' quotes as 5920 appropriate. 5921 5922*/ 5923 5924PyObject * 5925PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5926{ 5927 Py_ssize_t i, len; 5928 PyObject *repr; 5929 char *p; 5930 int kind; 5931 void *data; 5932 Py_ssize_t expandsize = 0; 5933 5934 /* Initial allocation is based on the longest-possible character 5935 escape. 5936 5937 For UCS1 strings it's '\xxx', 4 bytes per source character. 5938 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 5939 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 
5940 */ 5941 5942 if (!PyUnicode_Check(unicode)) { 5943 PyErr_BadArgument(); 5944 return NULL; 5945 } 5946 if (PyUnicode_READY(unicode) == -1) 5947 return NULL; 5948 len = PyUnicode_GET_LENGTH(unicode); 5949 kind = PyUnicode_KIND(unicode); 5950 data = PyUnicode_DATA(unicode); 5951 switch (kind) { 5952 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5953 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5954 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5955 } 5956 5957 if (len == 0) 5958 return PyBytes_FromStringAndSize(NULL, 0); 5959 5960 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5961 return PyErr_NoMemory(); 5962 5963 repr = PyBytes_FromStringAndSize(NULL, 5964 2 5965 + expandsize*len 5966 + 1); 5967 if (repr == NULL) 5968 return NULL; 5969 5970 p = PyBytes_AS_STRING(repr); 5971 5972 for (i = 0; i < len; i++) { 5973 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5974 5975 /* Escape backslashes */ 5976 if (ch == '\\') { 5977 *p++ = '\\'; 5978 *p++ = (char) ch; 5979 continue; 5980 } 5981 5982 /* Map 21-bit characters to '\U00xxxxxx' */ 5983 else if (ch >= 0x10000) { 5984 assert(ch <= MAX_UNICODE); 5985 *p++ = '\\'; 5986 *p++ = 'U'; 5987 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5988 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5989 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5990 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5991 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5992 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5993 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5994 *p++ = Py_hexdigits[ch & 0x0000000F]; 5995 continue; 5996 } 5997 5998 /* Map 16-bit characters to '\uxxxx' */ 5999 if (ch >= 256) { 6000 *p++ = '\\'; 6001 *p++ = 'u'; 6002 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 6003 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 6004 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6005 *p++ = Py_hexdigits[ch & 0x000F]; 6006 } 6007 6008 /* Map special whitespace to '\t', \n', '\r' */ 6009 else if (ch == '\t') { 6010 *p++ = '\\'; 6011 *p++ = 't'; 6012 
} 6013 else if (ch == '\n') { 6014 *p++ = '\\'; 6015 *p++ = 'n'; 6016 } 6017 else if (ch == '\r') { 6018 *p++ = '\\'; 6019 *p++ = 'r'; 6020 } 6021 6022 /* Map non-printable US ASCII to '\xhh' */ 6023 else if (ch < ' ' || ch >= 0x7F) { 6024 *p++ = '\\'; 6025 *p++ = 'x'; 6026 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6027 *p++ = Py_hexdigits[ch & 0x000F]; 6028 } 6029 6030 /* Copy everything else as-is */ 6031 else 6032 *p++ = (char) ch; 6033 } 6034 6035 assert(p - PyBytes_AS_STRING(repr) > 0); 6036 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 6037 return NULL; 6038 return repr; 6039} 6040 6041PyObject * 6042PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 6043 Py_ssize_t size) 6044{ 6045 PyObject *result; 6046 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6047 if (tmp == NULL) 6048 return NULL; 6049 result = PyUnicode_AsUnicodeEscapeString(tmp); 6050 Py_DECREF(tmp); 6051 return result; 6052} 6053 6054/* --- Raw Unicode Escape Codec ------------------------------------------- */ 6055 6056PyObject * 6057PyUnicode_DecodeRawUnicodeEscape(const char *s, 6058 Py_ssize_t size, 6059 const char *errors) 6060{ 6061 const char *starts = s; 6062 Py_ssize_t startinpos; 6063 Py_ssize_t endinpos; 6064 _PyUnicodeWriter writer; 6065 const char *end; 6066 const char *bs; 6067 PyObject *errorHandler = NULL; 6068 PyObject *exc = NULL; 6069 6070 if (size == 0) 6071 _Py_RETURN_UNICODE_EMPTY(); 6072 6073 /* Escaped strings will always be longer than the resulting 6074 Unicode string, so we start with size here and then reduce the 6075 length after conversion to the true value. 
(But decoding error 6076 handler might have to resize the string) */ 6077 _PyUnicodeWriter_Init(&writer); 6078 writer.min_length = size; 6079 6080 end = s + size; 6081 while (s < end) { 6082 unsigned char c; 6083 Py_UCS4 x; 6084 int i; 6085 int count; 6086 6087 /* Non-escape characters are interpreted as Unicode ordinals */ 6088 if (*s != '\\') { 6089 x = (unsigned char)*s++; 6090 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6091 goto onError; 6092 continue; 6093 } 6094 startinpos = s-starts; 6095 6096 /* \u-escapes are only interpreted iff the number of leading 6097 backslashes if odd */ 6098 bs = s; 6099 for (;s < end;) { 6100 if (*s != '\\') 6101 break; 6102 x = (unsigned char)*s++; 6103 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6104 goto onError; 6105 } 6106 if (((s - bs) & 1) == 0 || 6107 s >= end || 6108 (*s != 'u' && *s != 'U')) { 6109 continue; 6110 } 6111 writer.pos--; 6112 count = *s=='u' ? 4 : 8; 6113 s++; 6114 6115 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6116 for (x = 0, i = 0; i < count; ++i, ++s) { 6117 c = (unsigned char)*s; 6118 if (!Py_ISXDIGIT(c)) { 6119 endinpos = s-starts; 6120 if (unicode_decode_call_errorhandler_writer( 6121 errors, &errorHandler, 6122 "rawunicodeescape", "truncated \\uXXXX", 6123 &starts, &end, &startinpos, &endinpos, &exc, &s, 6124 &writer)) 6125 goto onError; 6126 goto nextByte; 6127 } 6128 x = (x<<4) & ~0xF; 6129 if (c >= '0' && c <= '9') 6130 x += c - '0'; 6131 else if (c >= 'a' && c <= 'f') 6132 x += 10 + c - 'a'; 6133 else 6134 x += 10 + c - 'A'; 6135 } 6136 if (x <= MAX_UNICODE) { 6137 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6138 goto onError; 6139 } 6140 else { 6141 endinpos = s-starts; 6142 if (unicode_decode_call_errorhandler_writer( 6143 errors, &errorHandler, 6144 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6145 &starts, &end, &startinpos, &endinpos, &exc, &s, 6146 &writer)) 6147 goto onError; 6148 } 6149 nextByte: 6150 ; 6151 } 6152 Py_XDECREF(errorHandler); 6153 
Py_XDECREF(exc); 6154 return _PyUnicodeWriter_Finish(&writer); 6155 6156 onError: 6157 _PyUnicodeWriter_Dealloc(&writer); 6158 Py_XDECREF(errorHandler); 6159 Py_XDECREF(exc); 6160 return NULL; 6161} 6162 6163 6164PyObject * 6165PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6166{ 6167 PyObject *repr; 6168 char *p; 6169 char *q; 6170 Py_ssize_t expandsize, pos; 6171 int kind; 6172 void *data; 6173 Py_ssize_t len; 6174 6175 if (!PyUnicode_Check(unicode)) { 6176 PyErr_BadArgument(); 6177 return NULL; 6178 } 6179 if (PyUnicode_READY(unicode) == -1) 6180 return NULL; 6181 kind = PyUnicode_KIND(unicode); 6182 data = PyUnicode_DATA(unicode); 6183 len = PyUnicode_GET_LENGTH(unicode); 6184 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6185 bytes, and 1 byte characters 4. */ 6186 expandsize = kind * 2 + 2; 6187 6188 if (len > PY_SSIZE_T_MAX / expandsize) 6189 return PyErr_NoMemory(); 6190 6191 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6192 if (repr == NULL) 6193 return NULL; 6194 if (len == 0) 6195 return repr; 6196 6197 p = q = PyBytes_AS_STRING(repr); 6198 for (pos = 0; pos < len; pos++) { 6199 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6200 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6201 if (ch >= 0x10000) { 6202 assert(ch <= MAX_UNICODE); 6203 *p++ = '\\'; 6204 *p++ = 'U'; 6205 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6206 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6207 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6208 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6209 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6210 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6211 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6212 *p++ = Py_hexdigits[ch & 15]; 6213 } 6214 /* Map 16-bit characters to '\uxxxx' */ 6215 else if (ch >= 256) { 6216 *p++ = '\\'; 6217 *p++ = 'u'; 6218 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6219 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6220 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6221 *p++ = Py_hexdigits[ch & 15]; 6222 } 6223 /* Copy everything else 
as-is */ 6224 else 6225 *p++ = (char) ch; 6226 } 6227 6228 assert(p > q); 6229 if (_PyBytes_Resize(&repr, p - q) < 0) 6230 return NULL; 6231 return repr; 6232} 6233 6234PyObject * 6235PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6236 Py_ssize_t size) 6237{ 6238 PyObject *result; 6239 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6240 if (tmp == NULL) 6241 return NULL; 6242 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6243 Py_DECREF(tmp); 6244 return result; 6245} 6246 6247/* --- Unicode Internal Codec ------------------------------------------- */ 6248 6249PyObject * 6250_PyUnicode_DecodeUnicodeInternal(const char *s, 6251 Py_ssize_t size, 6252 const char *errors) 6253{ 6254 const char *starts = s; 6255 Py_ssize_t startinpos; 6256 Py_ssize_t endinpos; 6257 _PyUnicodeWriter writer; 6258 const char *end; 6259 const char *reason; 6260 PyObject *errorHandler = NULL; 6261 PyObject *exc = NULL; 6262 6263 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6264 "unicode_internal codec has been deprecated", 6265 1)) 6266 return NULL; 6267 6268 if (size == 0) 6269 _Py_RETURN_UNICODE_EMPTY(); 6270 6271 _PyUnicodeWriter_Init(&writer); 6272 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { 6273 PyErr_NoMemory(); 6274 goto onError; 6275 } 6276 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; 6277 6278 end = s + size; 6279 while (s < end) { 6280 Py_UNICODE uch; 6281 Py_UCS4 ch; 6282 if (end - s < Py_UNICODE_SIZE) { 6283 endinpos = end-starts; 6284 reason = "truncated input"; 6285 goto error; 6286 } 6287 /* We copy the raw representation one byte at a time because the 6288 pointer may be unaligned (see test_codeccallbacks). */ 6289 ((char *) &uch)[0] = s[0]; 6290 ((char *) &uch)[1] = s[1]; 6291#ifdef Py_UNICODE_WIDE 6292 ((char *) &uch)[2] = s[2]; 6293 ((char *) &uch)[3] = s[3]; 6294#endif 6295 ch = uch; 6296#ifdef Py_UNICODE_WIDE 6297 /* We have to sanity check the raw data, otherwise doom looms for 6298 some malformed UCS-4 data. 
*/ 6299 if (ch > 0x10ffff) { 6300 endinpos = s - starts + Py_UNICODE_SIZE; 6301 reason = "illegal code point (> 0x10FFFF)"; 6302 goto error; 6303 } 6304#endif 6305 s += Py_UNICODE_SIZE; 6306#ifndef Py_UNICODE_WIDE 6307 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE) 6308 { 6309 Py_UNICODE uch2; 6310 ((char *) &uch2)[0] = s[0]; 6311 ((char *) &uch2)[1] = s[1]; 6312 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6313 { 6314 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6315 s += Py_UNICODE_SIZE; 6316 } 6317 } 6318#endif 6319 6320 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 6321 goto onError; 6322 continue; 6323 6324 error: 6325 startinpos = s - starts; 6326 if (unicode_decode_call_errorhandler_writer( 6327 errors, &errorHandler, 6328 "unicode_internal", reason, 6329 &starts, &end, &startinpos, &endinpos, &exc, &s, 6330 &writer)) 6331 goto onError; 6332 } 6333 6334 Py_XDECREF(errorHandler); 6335 Py_XDECREF(exc); 6336 return _PyUnicodeWriter_Finish(&writer); 6337 6338 onError: 6339 _PyUnicodeWriter_Dealloc(&writer); 6340 Py_XDECREF(errorHandler); 6341 Py_XDECREF(exc); 6342 return NULL; 6343} 6344 6345/* --- Latin-1 Codec ------------------------------------------------------ */ 6346 6347PyObject * 6348PyUnicode_DecodeLatin1(const char *s, 6349 Py_ssize_t size, 6350 const char *errors) 6351{ 6352 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6353 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6354} 6355 6356/* create or adjust a UnicodeEncodeError */ 6357static void 6358make_encode_exception(PyObject **exceptionObject, 6359 const char *encoding, 6360 PyObject *unicode, 6361 Py_ssize_t startpos, Py_ssize_t endpos, 6362 const char *reason) 6363{ 6364 if (*exceptionObject == NULL) { 6365 *exceptionObject = PyObject_CallFunction( 6366 PyExc_UnicodeEncodeError, "sOnns", 6367 encoding, unicode, startpos, endpos, reason); 6368 } 6369 else { 6370 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6371 goto onError; 6372 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6373 goto onError; 6374 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6375 goto onError; 6376 return; 6377 onError: 6378 Py_CLEAR(*exceptionObject); 6379 } 6380} 6381 6382/* raises a UnicodeEncodeError */ 6383static void 6384raise_encode_exception(PyObject **exceptionObject, 6385 const char *encoding, 6386 PyObject *unicode, 6387 Py_ssize_t startpos, Py_ssize_t endpos, 6388 const char *reason) 6389{ 6390 make_encode_exception(exceptionObject, 6391 encoding, unicode, startpos, endpos, reason); 6392 if (*exceptionObject != NULL) 6393 PyCodec_StrictErrors(*exceptionObject); 6394} 6395 6396/* error handling callback helper: 6397 build arguments, call the callback and check the arguments, 6398 put the result into newpos and return the replacement string, which 6399 has to be freed by the caller */ 6400static PyObject * 6401unicode_encode_call_errorhandler(const char *errors, 6402 PyObject **errorHandler, 6403 const char *encoding, const char *reason, 6404 PyObject *unicode, PyObject **exceptionObject, 6405 Py_ssize_t startpos, Py_ssize_t endpos, 6406 Py_ssize_t *newpos) 6407{ 6408 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6409 Py_ssize_t len; 6410 PyObject *restuple; 6411 PyObject *resunicode; 6412 6413 if (*errorHandler == NULL) { 6414 *errorHandler = 
PyCodec_LookupError(errors); 6415 if (*errorHandler == NULL) 6416 return NULL; 6417 } 6418 6419 if (PyUnicode_READY(unicode) == -1) 6420 return NULL; 6421 len = PyUnicode_GET_LENGTH(unicode); 6422 6423 make_encode_exception(exceptionObject, 6424 encoding, unicode, startpos, endpos, reason); 6425 if (*exceptionObject == NULL) 6426 return NULL; 6427 6428 restuple = PyObject_CallFunctionObjArgs( 6429 *errorHandler, *exceptionObject, NULL); 6430 if (restuple == NULL) 6431 return NULL; 6432 if (!PyTuple_Check(restuple)) { 6433 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6434 Py_DECREF(restuple); 6435 return NULL; 6436 } 6437 if (!PyArg_ParseTuple(restuple, argparse, 6438 &resunicode, newpos)) { 6439 Py_DECREF(restuple); 6440 return NULL; 6441 } 6442 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6443 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6444 Py_DECREF(restuple); 6445 return NULL; 6446 } 6447 if (*newpos<0) 6448 *newpos = len + *newpos; 6449 if (*newpos<0 || *newpos>len) { 6450 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6451 Py_DECREF(restuple); 6452 return NULL; 6453 } 6454 Py_INCREF(resunicode); 6455 Py_DECREF(restuple); 6456 return resunicode; 6457} 6458 6459static PyObject * 6460unicode_encode_ucs1(PyObject *unicode, 6461 const char *errors, 6462 unsigned int limit) 6463{ 6464 /* input state */ 6465 Py_ssize_t pos=0, size; 6466 int kind; 6467 void *data; 6468 /* output object */ 6469 PyObject *res; 6470 /* pointer into the output */ 6471 char *str; 6472 /* current output position */ 6473 Py_ssize_t ressize; 6474 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6475 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6476 PyObject *errorHandler = NULL; 6477 PyObject *exc = NULL; 6478 /* the following variable is used for caching string comparisons 6479 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6480 int known_errorHandler = -1; 6481 6482 if (PyUnicode_READY(unicode) == -1) 6483 return NULL; 6484 size = PyUnicode_GET_LENGTH(unicode); 6485 kind = PyUnicode_KIND(unicode); 6486 data = PyUnicode_DATA(unicode); 6487 /* allocate enough for a simple encoding without 6488 replacements, if we need more, we'll resize */ 6489 if (size == 0) 6490 return PyBytes_FromStringAndSize(NULL, 0); 6491 res = PyBytes_FromStringAndSize(NULL, size); 6492 if (res == NULL) 6493 return NULL; 6494 str = PyBytes_AS_STRING(res); 6495 ressize = size; 6496 6497 while (pos < size) { 6498 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6499 6500 /* can we encode this? */ 6501 if (c<limit) { 6502 /* no overflow check, because we know that the space is enough */ 6503 *str++ = (char)c; 6504 ++pos; 6505 } 6506 else { 6507 Py_ssize_t requiredsize; 6508 PyObject *repunicode; 6509 Py_ssize_t repsize, newpos, respos, i; 6510 /* startpos for collecting unencodable chars */ 6511 Py_ssize_t collstart = pos; 6512 Py_ssize_t collend = pos; 6513 /* find all unecodable characters */ 6514 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) 6515 ++collend; 6516 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6517 if (known_errorHandler==-1) { 6518 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6519 known_errorHandler = 1; 6520 else if (!strcmp(errors, "replace")) 6521 known_errorHandler = 2; 6522 else if (!strcmp(errors, "ignore")) 6523 known_errorHandler = 3; 6524 else if (!strcmp(errors, "xmlcharrefreplace")) 6525 known_errorHandler = 4; 6526 else 6527 known_errorHandler = 0; 6528 } 6529 switch (known_errorHandler) { 6530 case 1: /* strict */ 6531 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6532 goto onError; 6533 case 2: /* replace */ 6534 while (collstart++ < collend) 6535 *str++ = '?'; /* fall through */ 6536 case 3: /* ignore */ 6537 pos = collend; 6538 break; 6539 case 4: /* xmlcharrefreplace */ 6540 respos = str - PyBytes_AS_STRING(res); 6541 requiredsize = respos; 6542 /* determine replacement size */ 6543 for (i = collstart; i < collend; ++i) { 6544 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6545 Py_ssize_t incr; 6546 if (ch < 10) 6547 incr = 2+1+1; 6548 else if (ch < 100) 6549 incr = 2+2+1; 6550 else if (ch < 1000) 6551 incr = 2+3+1; 6552 else if (ch < 10000) 6553 incr = 2+4+1; 6554 else if (ch < 100000) 6555 incr = 2+5+1; 6556 else if (ch < 1000000) 6557 incr = 2+6+1; 6558 else { 6559 assert(ch <= MAX_UNICODE); 6560 incr = 2+7+1; 6561 } 6562 if (requiredsize > PY_SSIZE_T_MAX - incr) 6563 goto overflow; 6564 requiredsize += incr; 6565 } 6566 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6567 goto overflow; 6568 requiredsize += size - collend; 6569 if (requiredsize > ressize) { 6570 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6571 requiredsize = 2*ressize; 6572 if (_PyBytes_Resize(&res, requiredsize)) 6573 goto onError; 6574 str = PyBytes_AS_STRING(res) + respos; 6575 ressize = requiredsize; 6576 } 6577 /* generate replacement */ 6578 for (i = collstart; i < collend; ++i) { 6579 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6580 } 6581 pos = collend; 6582 
break; 6583 default: 6584 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6585 encoding, reason, unicode, &exc, 6586 collstart, collend, &newpos); 6587 if (repunicode == NULL || (PyUnicode_Check(repunicode) && 6588 PyUnicode_READY(repunicode) == -1)) 6589 goto onError; 6590 if (PyBytes_Check(repunicode)) { 6591 /* Directly copy bytes result to output. */ 6592 repsize = PyBytes_Size(repunicode); 6593 if (repsize > 1) { 6594 /* Make room for all additional bytes. */ 6595 respos = str - PyBytes_AS_STRING(res); 6596 if (ressize > PY_SSIZE_T_MAX - repsize - 1) { 6597 Py_DECREF(repunicode); 6598 goto overflow; 6599 } 6600 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6601 Py_DECREF(repunicode); 6602 goto onError; 6603 } 6604 str = PyBytes_AS_STRING(res) + respos; 6605 ressize += repsize-1; 6606 } 6607 memcpy(str, PyBytes_AsString(repunicode), repsize); 6608 str += repsize; 6609 pos = newpos; 6610 Py_DECREF(repunicode); 6611 break; 6612 } 6613 /* need more space? (at least enough for what we 6614 have+the replacement+the rest of the string, so 6615 we won't have to check space for encodable characters) */ 6616 respos = str - PyBytes_AS_STRING(res); 6617 repsize = PyUnicode_GET_LENGTH(repunicode); 6618 requiredsize = respos; 6619 if (requiredsize > PY_SSIZE_T_MAX - repsize) 6620 goto overflow; 6621 requiredsize += repsize; 6622 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6623 goto overflow; 6624 requiredsize += size - collend; 6625 if (requiredsize > ressize) { 6626 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6627 requiredsize = 2*ressize; 6628 if (_PyBytes_Resize(&res, requiredsize)) { 6629 Py_DECREF(repunicode); 6630 goto onError; 6631 } 6632 str = PyBytes_AS_STRING(res) + respos; 6633 ressize = requiredsize; 6634 } 6635 /* check if there is anything unencodable in the replacement 6636 and copy it to the output */ 6637 for (i = 0; repsize-->0; ++i, ++str) { 6638 c = PyUnicode_READ_CHAR(repunicode, i); 6639 if (c >= limit) 
{ 6640 raise_encode_exception(&exc, encoding, unicode, 6641 pos, pos+1, reason); 6642 Py_DECREF(repunicode); 6643 goto onError; 6644 } 6645 *str = (char)c; 6646 } 6647 pos = newpos; 6648 Py_DECREF(repunicode); 6649 } 6650 } 6651 } 6652 /* Resize if we allocated to much */ 6653 size = str - PyBytes_AS_STRING(res); 6654 if (size < ressize) { /* If this falls res will be NULL */ 6655 assert(size >= 0); 6656 if (_PyBytes_Resize(&res, size) < 0) 6657 goto onError; 6658 } 6659 6660 Py_XDECREF(errorHandler); 6661 Py_XDECREF(exc); 6662 return res; 6663 6664 overflow: 6665 PyErr_SetString(PyExc_OverflowError, 6666 "encoded result is too long for a Python string"); 6667 6668 onError: 6669 Py_XDECREF(res); 6670 Py_XDECREF(errorHandler); 6671 Py_XDECREF(exc); 6672 return NULL; 6673} 6674 6675/* Deprecated */ 6676PyObject * 6677PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6678 Py_ssize_t size, 6679 const char *errors) 6680{ 6681 PyObject *result; 6682 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6683 if (unicode == NULL) 6684 return NULL; 6685 result = unicode_encode_ucs1(unicode, errors, 256); 6686 Py_DECREF(unicode); 6687 return result; 6688} 6689 6690PyObject * 6691_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6692{ 6693 if (!PyUnicode_Check(unicode)) { 6694 PyErr_BadArgument(); 6695 return NULL; 6696 } 6697 if (PyUnicode_READY(unicode) == -1) 6698 return NULL; 6699 /* Fast path: if it is a one-byte string, construct 6700 bytes object directly. */ 6701 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6702 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6703 PyUnicode_GET_LENGTH(unicode)); 6704 /* Non-Latin-1 characters present. Defer to above function to 6705 raise the exception. 
*/ 6706 return unicode_encode_ucs1(unicode, errors, 256); 6707} 6708 6709PyObject* 6710PyUnicode_AsLatin1String(PyObject *unicode) 6711{ 6712 return _PyUnicode_AsLatin1String(unicode, NULL); 6713} 6714 6715/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6716 6717PyObject * 6718PyUnicode_DecodeASCII(const char *s, 6719 Py_ssize_t size, 6720 const char *errors) 6721{ 6722 const char *starts = s; 6723 _PyUnicodeWriter writer; 6724 int kind; 6725 void *data; 6726 Py_ssize_t startinpos; 6727 Py_ssize_t endinpos; 6728 Py_ssize_t outpos; 6729 const char *e; 6730 PyObject *errorHandler = NULL; 6731 PyObject *exc = NULL; 6732 6733 if (size == 0) 6734 _Py_RETURN_UNICODE_EMPTY(); 6735 6736 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6737 if (size == 1 && (unsigned char)s[0] < 128) 6738 return get_latin1_char((unsigned char)s[0]); 6739 6740 _PyUnicodeWriter_Init(&writer); 6741 writer.min_length = size; 6742 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) 6743 return NULL; 6744 6745 e = s + size; 6746 data = writer.data; 6747 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6748 writer.pos = outpos; 6749 if (writer.pos == size) 6750 return _PyUnicodeWriter_Finish(&writer); 6751 6752 s += writer.pos; 6753 kind = writer.kind; 6754 while (s < e) { 6755 unsigned char c = (unsigned char)*s; 6756 if (c < 128) { 6757 PyUnicode_WRITE(kind, data, writer.pos, c); 6758 writer.pos++; 6759 ++s; 6760 } 6761 else { 6762 startinpos = s-starts; 6763 endinpos = startinpos + 1; 6764 if (unicode_decode_call_errorhandler_writer( 6765 errors, &errorHandler, 6766 "ascii", "ordinal not in range(128)", 6767 &starts, &e, &startinpos, &endinpos, &exc, &s, 6768 &writer)) 6769 goto onError; 6770 kind = writer.kind; 6771 data = writer.data; 6772 } 6773 } 6774 Py_XDECREF(errorHandler); 6775 Py_XDECREF(exc); 6776 return _PyUnicodeWriter_Finish(&writer); 6777 6778 onError: 6779 _PyUnicodeWriter_Dealloc(&writer); 6780 Py_XDECREF(errorHandler); 
6781 Py_XDECREF(exc); 6782 return NULL; 6783} 6784 6785/* Deprecated */ 6786PyObject * 6787PyUnicode_EncodeASCII(const Py_UNICODE *p, 6788 Py_ssize_t size, 6789 const char *errors) 6790{ 6791 PyObject *result; 6792 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6793 if (unicode == NULL) 6794 return NULL; 6795 result = unicode_encode_ucs1(unicode, errors, 128); 6796 Py_DECREF(unicode); 6797 return result; 6798} 6799 6800PyObject * 6801_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6802{ 6803 if (!PyUnicode_Check(unicode)) { 6804 PyErr_BadArgument(); 6805 return NULL; 6806 } 6807 if (PyUnicode_READY(unicode) == -1) 6808 return NULL; 6809 /* Fast path: if it is an ASCII-only string, construct bytes object 6810 directly. Else defer to above function to raise the exception. */ 6811 if (PyUnicode_IS_ASCII(unicode)) 6812 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6813 PyUnicode_GET_LENGTH(unicode)); 6814 return unicode_encode_ucs1(unicode, errors, 128); 6815} 6816 6817PyObject * 6818PyUnicode_AsASCIIString(PyObject *unicode) 6819{ 6820 return _PyUnicode_AsASCIIString(unicode, NULL); 6821} 6822 6823#ifdef HAVE_MBCS 6824 6825/* --- MBCS codecs for Windows -------------------------------------------- */ 6826 6827#if SIZEOF_INT < SIZEOF_SIZE_T 6828#define NEED_RETRY 6829#endif 6830 6831#ifndef WC_ERR_INVALID_CHARS 6832# define WC_ERR_INVALID_CHARS 0x0080 6833#endif 6834 6835static char* 6836code_page_name(UINT code_page, PyObject **obj) 6837{ 6838 *obj = NULL; 6839 if (code_page == CP_ACP) 6840 return "mbcs"; 6841 if (code_page == CP_UTF7) 6842 return "CP_UTF7"; 6843 if (code_page == CP_UTF8) 6844 return "CP_UTF8"; 6845 6846 *obj = PyBytes_FromFormat("cp%u", code_page); 6847 if (*obj == NULL) 6848 return NULL; 6849 return PyBytes_AS_STRING(*obj); 6850} 6851 6852static int 6853is_dbcs_lead_byte(UINT code_page, const char *s, int offset) 6854{ 6855 const char *curr = s + offset; 6856 const char *prev; 6857 6858 if 
(!IsDBCSLeadByteEx(code_page, *curr)) 6859 return 0; 6860 6861 prev = CharPrevExA(code_page, s, curr, 0); 6862 if (prev == curr) 6863 return 1; 6864 /* FIXME: This code is limited to "true" double-byte encodings, 6865 as it assumes an incomplete character consists of a single 6866 byte. */ 6867 if (curr - prev == 2) 6868 return 1; 6869 if (!IsDBCSLeadByteEx(code_page, *prev)) 6870 return 1; 6871 return 0; 6872} 6873 6874static DWORD 6875decode_code_page_flags(UINT code_page) 6876{ 6877 if (code_page == CP_UTF7) { 6878 /* The CP_UTF7 decoder only supports flags=0 */ 6879 return 0; 6880 } 6881 else 6882 return MB_ERR_INVALID_CHARS; 6883} 6884 6885/* 6886 * Decode a byte string from a Windows code page into unicode object in strict 6887 * mode. 6888 * 6889 * Returns consumed size if succeed, returns -2 on decode error, or raise an 6890 * OSError and returns -1 on other error. 6891 */ 6892static int 6893decode_code_page_strict(UINT code_page, 6894 PyObject **v, 6895 const char *in, 6896 int insize) 6897{ 6898 const DWORD flags = decode_code_page_flags(code_page); 6899 wchar_t *out; 6900 DWORD outsize; 6901 6902 /* First get the size of the result */ 6903 assert(insize > 0); 6904 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6905 if (outsize <= 0) 6906 goto error; 6907 6908 if (*v == NULL) { 6909 /* Create unicode object */ 6910 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6911 *v = (PyObject*)_PyUnicode_New(outsize); 6912 if (*v == NULL) 6913 return -1; 6914 out = PyUnicode_AS_UNICODE(*v); 6915 } 6916 else { 6917 /* Extend unicode object */ 6918 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6919 if (unicode_resize(v, n + outsize) < 0) 6920 return -1; 6921 out = PyUnicode_AS_UNICODE(*v) + n; 6922 } 6923 6924 /* Do the conversion */ 6925 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6926 if (outsize <= 0) 6927 goto error; 6928 return insize; 6929 6930error: 6931 if (GetLastError() == 
ERROR_NO_UNICODE_TRANSLATION) 6932 return -2; 6933 PyErr_SetFromWindowsErr(0); 6934 return -1; 6935} 6936 6937/* 6938 * Decode a byte string from a code page into unicode object with an error 6939 * handler. 6940 * 6941 * Returns consumed size if succeed, or raise an OSError or 6942 * UnicodeDecodeError exception and returns -1 on error. 6943 */ 6944static int 6945decode_code_page_errors(UINT code_page, 6946 PyObject **v, 6947 const char *in, const int size, 6948 const char *errors) 6949{ 6950 const char *startin = in; 6951 const char *endin = in + size; 6952 const DWORD flags = decode_code_page_flags(code_page); 6953 /* Ideally, we should get reason from FormatMessage. This is the Windows 6954 2000 English version of the message. */ 6955 const char *reason = "No mapping for the Unicode character exists " 6956 "in the target code page."; 6957 /* each step cannot decode more than 1 character, but a character can be 6958 represented as a surrogate pair */ 6959 wchar_t buffer[2], *startout, *out; 6960 int insize; 6961 Py_ssize_t outsize; 6962 PyObject *errorHandler = NULL; 6963 PyObject *exc = NULL; 6964 PyObject *encoding_obj = NULL; 6965 char *encoding; 6966 DWORD err; 6967 int ret = -1; 6968 6969 assert(size > 0); 6970 6971 encoding = code_page_name(code_page, &encoding_obj); 6972 if (encoding == NULL) 6973 return -1; 6974 6975 if (errors == NULL || strcmp(errors, "strict") == 0) { 6976 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6977 UnicodeDecodeError. 
*/ 6978 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6979 if (exc != NULL) { 6980 PyCodec_StrictErrors(exc); 6981 Py_CLEAR(exc); 6982 } 6983 goto error; 6984 } 6985 6986 if (*v == NULL) { 6987 /* Create unicode object */ 6988 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6989 PyErr_NoMemory(); 6990 goto error; 6991 } 6992 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6993 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6994 if (*v == NULL) 6995 goto error; 6996 startout = PyUnicode_AS_UNICODE(*v); 6997 } 6998 else { 6999 /* Extend unicode object */ 7000 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 7001 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7002 PyErr_NoMemory(); 7003 goto error; 7004 } 7005 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 7006 goto error; 7007 startout = PyUnicode_AS_UNICODE(*v) + n; 7008 } 7009 7010 /* Decode the byte string character per character */ 7011 out = startout; 7012 while (in < endin) 7013 { 7014 /* Decode a character */ 7015 insize = 1; 7016 do 7017 { 7018 outsize = MultiByteToWideChar(code_page, flags, 7019 in, insize, 7020 buffer, Py_ARRAY_LENGTH(buffer)); 7021 if (outsize > 0) 7022 break; 7023 err = GetLastError(); 7024 if (err != ERROR_NO_UNICODE_TRANSLATION 7025 && err != ERROR_INSUFFICIENT_BUFFER) 7026 { 7027 PyErr_SetFromWindowsErr(0); 7028 goto error; 7029 } 7030 insize++; 7031 } 7032 /* 4=maximum length of a UTF-8 sequence */ 7033 while (insize <= 4 && (in + insize) <= endin); 7034 7035 if (outsize <= 0) { 7036 Py_ssize_t startinpos, endinpos, outpos; 7037 7038 startinpos = in - startin; 7039 endinpos = startinpos + 1; 7040 outpos = out - PyUnicode_AS_UNICODE(*v); 7041 if (unicode_decode_call_errorhandler_wchar( 7042 errors, &errorHandler, 7043 encoding, reason, 7044 &startin, &endin, &startinpos, &endinpos, &exc, &in, 7045 v, &outpos)) 7046 { 7047 goto error; 7048 } 7049 out = PyUnicode_AS_UNICODE(*v) + 
outpos; 7050 } 7051 else { 7052 in += insize; 7053 memcpy(out, buffer, outsize * sizeof(wchar_t)); 7054 out += outsize; 7055 } 7056 } 7057 7058 /* write a NUL character at the end */ 7059 *out = 0; 7060 7061 /* Extend unicode object */ 7062 outsize = out - startout; 7063 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7064 if (unicode_resize(v, outsize) < 0) 7065 goto error; 7066 ret = size; 7067 7068error: 7069 Py_XDECREF(encoding_obj); 7070 Py_XDECREF(errorHandler); 7071 Py_XDECREF(exc); 7072 return ret; 7073} 7074 7075static PyObject * 7076decode_code_page_stateful(int code_page, 7077 const char *s, Py_ssize_t size, 7078 const char *errors, Py_ssize_t *consumed) 7079{ 7080 PyObject *v = NULL; 7081 int chunk_size, final, converted, done; 7082 7083 if (code_page < 0) { 7084 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7085 return NULL; 7086 } 7087 7088 if (consumed) 7089 *consumed = 0; 7090 7091 do 7092 { 7093#ifdef NEED_RETRY 7094 if (size > INT_MAX) { 7095 chunk_size = INT_MAX; 7096 final = 0; 7097 done = 0; 7098 } 7099 else 7100#endif 7101 { 7102 chunk_size = (int)size; 7103 final = (consumed == NULL); 7104 done = 1; 7105 } 7106 7107 /* Skip trailing lead-byte unless 'final' is set */ 7108 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) 7109 --chunk_size; 7110 7111 if (chunk_size == 0 && done) { 7112 if (v != NULL) 7113 break; 7114 _Py_RETURN_UNICODE_EMPTY(); 7115 } 7116 7117 7118 converted = decode_code_page_strict(code_page, &v, 7119 s, chunk_size); 7120 if (converted == -2) 7121 converted = decode_code_page_errors(code_page, &v, 7122 s, chunk_size, 7123 errors); 7124 assert(converted != 0); 7125 7126 if (converted < 0) { 7127 Py_XDECREF(v); 7128 return NULL; 7129 } 7130 7131 if (consumed) 7132 *consumed += converted; 7133 7134 s += converted; 7135 size -= converted; 7136 } while (!done); 7137 7138 return unicode_result(v); 7139} 7140 7141PyObject * 7142PyUnicode_DecodeCodePageStateful(int code_page, 7143 const char *s, 7144 
Py_ssize_t size, 7145 const char *errors, 7146 Py_ssize_t *consumed) 7147{ 7148 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7149} 7150 7151PyObject * 7152PyUnicode_DecodeMBCSStateful(const char *s, 7153 Py_ssize_t size, 7154 const char *errors, 7155 Py_ssize_t *consumed) 7156{ 7157 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7158} 7159 7160PyObject * 7161PyUnicode_DecodeMBCS(const char *s, 7162 Py_ssize_t size, 7163 const char *errors) 7164{ 7165 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7166} 7167 7168static DWORD 7169encode_code_page_flags(UINT code_page, const char *errors) 7170{ 7171 if (code_page == CP_UTF8) { 7172 if (winver.dwMajorVersion >= 6) 7173 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 7174 and later */ 7175 return WC_ERR_INVALID_CHARS; 7176 else 7177 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 7178 return 0; 7179 } 7180 else if (code_page == CP_UTF7) { 7181 /* CP_UTF7 only supports flags=0 */ 7182 return 0; 7183 } 7184 else { 7185 if (errors != NULL && strcmp(errors, "replace") == 0) 7186 return 0; 7187 else 7188 return WC_NO_BEST_FIT_CHARS; 7189 } 7190} 7191 7192/* 7193 * Encode a Unicode string to a Windows code page into a byte string in strict 7194 * mode. 7195 * 7196 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7197 * an OSError and returns -1 on other error. 7198 */ 7199static int 7200encode_code_page_strict(UINT code_page, PyObject **outbytes, 7201 PyObject *unicode, Py_ssize_t offset, int len, 7202 const char* errors) 7203{ 7204 BOOL usedDefaultChar = FALSE; 7205 BOOL *pusedDefaultChar = &usedDefaultChar; 7206 int outsize; 7207 PyObject *exc = NULL; 7208 wchar_t *p; 7209 Py_ssize_t size; 7210 const DWORD flags = encode_code_page_flags(code_page, NULL); 7211 char *out; 7212 /* Create a substring so that we can get the UTF-16 representation 7213 of just the slice under consideration. 
*/ 7214 PyObject *substring; 7215 7216 assert(len > 0); 7217 7218 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7219 pusedDefaultChar = &usedDefaultChar; 7220 else 7221 pusedDefaultChar = NULL; 7222 7223 substring = PyUnicode_Substring(unicode, offset, offset+len); 7224 if (substring == NULL) 7225 return -1; 7226 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7227 if (p == NULL) { 7228 Py_DECREF(substring); 7229 return -1; 7230 } 7231 assert(size <= INT_MAX); 7232 7233 /* First get the size of the result */ 7234 outsize = WideCharToMultiByte(code_page, flags, 7235 p, (int)size, 7236 NULL, 0, 7237 NULL, pusedDefaultChar); 7238 if (outsize <= 0) 7239 goto error; 7240 /* If we used a default char, then we failed! */ 7241 if (pusedDefaultChar && *pusedDefaultChar) { 7242 Py_DECREF(substring); 7243 return -2; 7244 } 7245 7246 if (*outbytes == NULL) { 7247 /* Create string object */ 7248 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7249 if (*outbytes == NULL) { 7250 Py_DECREF(substring); 7251 return -1; 7252 } 7253 out = PyBytes_AS_STRING(*outbytes); 7254 } 7255 else { 7256 /* Extend string object */ 7257 const Py_ssize_t n = PyBytes_Size(*outbytes); 7258 if (outsize > PY_SSIZE_T_MAX - n) { 7259 PyErr_NoMemory(); 7260 Py_DECREF(substring); 7261 return -1; 7262 } 7263 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7264 Py_DECREF(substring); 7265 return -1; 7266 } 7267 out = PyBytes_AS_STRING(*outbytes) + n; 7268 } 7269 7270 /* Do the conversion */ 7271 outsize = WideCharToMultiByte(code_page, flags, 7272 p, (int)size, 7273 out, outsize, 7274 NULL, pusedDefaultChar); 7275 Py_CLEAR(substring); 7276 if (outsize <= 0) 7277 goto error; 7278 if (pusedDefaultChar && *pusedDefaultChar) 7279 return -2; 7280 return 0; 7281 7282error: 7283 Py_XDECREF(substring); 7284 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7285 return -2; 7286 PyErr_SetFromWindowsErr(0); 7287 return -1; 7288} 7289 7290/* 7291 * Encode a Unicode string to a Windows code page into a byte 
string using a 7292 * error handler. 7293 * 7294 * Returns consumed characters if succeed, or raise an OSError and returns 7295 * -1 on other error. 7296 */ 7297static int 7298encode_code_page_errors(UINT code_page, PyObject **outbytes, 7299 PyObject *unicode, Py_ssize_t unicode_offset, 7300 Py_ssize_t insize, const char* errors) 7301{ 7302 const DWORD flags = encode_code_page_flags(code_page, errors); 7303 Py_ssize_t pos = unicode_offset; 7304 Py_ssize_t endin = unicode_offset + insize; 7305 /* Ideally, we should get reason from FormatMessage. This is the Windows 7306 2000 English version of the message. */ 7307 const char *reason = "invalid character"; 7308 /* 4=maximum length of a UTF-8 sequence */ 7309 char buffer[4]; 7310 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7311 Py_ssize_t outsize; 7312 char *out; 7313 PyObject *errorHandler = NULL; 7314 PyObject *exc = NULL; 7315 PyObject *encoding_obj = NULL; 7316 char *encoding; 7317 Py_ssize_t newpos, newoutsize; 7318 PyObject *rep; 7319 int ret = -1; 7320 7321 assert(insize > 0); 7322 7323 encoding = code_page_name(code_page, &encoding_obj); 7324 if (encoding == NULL) 7325 return -1; 7326 7327 if (errors == NULL || strcmp(errors, "strict") == 0) { 7328 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7329 then we raise a UnicodeEncodeError. 
*/ 7330 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7331 if (exc != NULL) { 7332 PyCodec_StrictErrors(exc); 7333 Py_DECREF(exc); 7334 } 7335 Py_XDECREF(encoding_obj); 7336 return -1; 7337 } 7338 7339 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7340 pusedDefaultChar = &usedDefaultChar; 7341 else 7342 pusedDefaultChar = NULL; 7343 7344 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7345 PyErr_NoMemory(); 7346 goto error; 7347 } 7348 outsize = insize * Py_ARRAY_LENGTH(buffer); 7349 7350 if (*outbytes == NULL) { 7351 /* Create string object */ 7352 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7353 if (*outbytes == NULL) 7354 goto error; 7355 out = PyBytes_AS_STRING(*outbytes); 7356 } 7357 else { 7358 /* Extend string object */ 7359 Py_ssize_t n = PyBytes_Size(*outbytes); 7360 if (n > PY_SSIZE_T_MAX - outsize) { 7361 PyErr_NoMemory(); 7362 goto error; 7363 } 7364 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7365 goto error; 7366 out = PyBytes_AS_STRING(*outbytes) + n; 7367 } 7368 7369 /* Encode the string character per character */ 7370 while (pos < endin) 7371 { 7372 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7373 wchar_t chars[2]; 7374 int charsize; 7375 if (ch < 0x10000) { 7376 chars[0] = (wchar_t)ch; 7377 charsize = 1; 7378 } 7379 else { 7380 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7381 chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7382 charsize = 2; 7383 } 7384 7385 outsize = WideCharToMultiByte(code_page, flags, 7386 chars, charsize, 7387 buffer, Py_ARRAY_LENGTH(buffer), 7388 NULL, pusedDefaultChar); 7389 if (outsize > 0) { 7390 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7391 { 7392 pos++; 7393 memcpy(out, buffer, outsize); 7394 out += outsize; 7395 continue; 7396 } 7397 } 7398 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7399 PyErr_SetFromWindowsErr(0); 7400 goto error; 7401 } 7402 7403 rep = unicode_encode_call_errorhandler( 7404 errors, &errorHandler, encoding, reason, 7405 unicode, &exc, 
7406 pos, pos + 1, &newpos); 7407 if (rep == NULL) 7408 goto error; 7409 pos = newpos; 7410 7411 if (PyBytes_Check(rep)) { 7412 outsize = PyBytes_GET_SIZE(rep); 7413 if (outsize != 1) { 7414 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7415 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7416 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7417 Py_DECREF(rep); 7418 goto error; 7419 } 7420 out = PyBytes_AS_STRING(*outbytes) + offset; 7421 } 7422 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7423 out += outsize; 7424 } 7425 else { 7426 Py_ssize_t i; 7427 enum PyUnicode_Kind kind; 7428 void *data; 7429 7430 if (PyUnicode_READY(rep) == -1) { 7431 Py_DECREF(rep); 7432 goto error; 7433 } 7434 7435 outsize = PyUnicode_GET_LENGTH(rep); 7436 if (outsize != 1) { 7437 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7438 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7439 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7440 Py_DECREF(rep); 7441 goto error; 7442 } 7443 out = PyBytes_AS_STRING(*outbytes) + offset; 7444 } 7445 kind = PyUnicode_KIND(rep); 7446 data = PyUnicode_DATA(rep); 7447 for (i=0; i < outsize; i++) { 7448 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7449 if (ch > 127) { 7450 raise_encode_exception(&exc, 7451 encoding, unicode, 7452 pos, pos + 1, 7453 "unable to encode error handler result to ASCII"); 7454 Py_DECREF(rep); 7455 goto error; 7456 } 7457 *out = (unsigned char)ch; 7458 out++; 7459 } 7460 } 7461 Py_DECREF(rep); 7462 } 7463 /* write a NUL byte */ 7464 *out = 0; 7465 outsize = out - PyBytes_AS_STRING(*outbytes); 7466 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7467 if (_PyBytes_Resize(outbytes, outsize) < 0) 7468 goto error; 7469 ret = 0; 7470 7471error: 7472 Py_XDECREF(encoding_obj); 7473 Py_XDECREF(errorHandler); 7474 Py_XDECREF(exc); 7475 return ret; 7476} 7477 7478static PyObject * 7479encode_code_page(int code_page, 7480 PyObject *unicode, 7481 const char *errors) 7482{ 7483 Py_ssize_t len; 7484 
PyObject *outbytes = NULL; 7485 Py_ssize_t offset; 7486 int chunk_len, ret, done; 7487 7488 if (PyUnicode_READY(unicode) == -1) 7489 return NULL; 7490 len = PyUnicode_GET_LENGTH(unicode); 7491 7492 if (code_page < 0) { 7493 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7494 return NULL; 7495 } 7496 7497 if (len == 0) 7498 return PyBytes_FromStringAndSize(NULL, 0); 7499 7500 offset = 0; 7501 do 7502 { 7503#ifdef NEED_RETRY 7504 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7505 chunks. */ 7506 if (len > INT_MAX/2) { 7507 chunk_len = INT_MAX/2; 7508 done = 0; 7509 } 7510 else 7511#endif 7512 { 7513 chunk_len = (int)len; 7514 done = 1; 7515 } 7516 7517 ret = encode_code_page_strict(code_page, &outbytes, 7518 unicode, offset, chunk_len, 7519 errors); 7520 if (ret == -2) 7521 ret = encode_code_page_errors(code_page, &outbytes, 7522 unicode, offset, 7523 chunk_len, errors); 7524 if (ret < 0) { 7525 Py_XDECREF(outbytes); 7526 return NULL; 7527 } 7528 7529 offset += chunk_len; 7530 len -= chunk_len; 7531 } while (!done); 7532 7533 return outbytes; 7534} 7535 7536PyObject * 7537PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7538 Py_ssize_t size, 7539 const char *errors) 7540{ 7541 PyObject *unicode, *res; 7542 unicode = PyUnicode_FromUnicode(p, size); 7543 if (unicode == NULL) 7544 return NULL; 7545 res = encode_code_page(CP_ACP, unicode, errors); 7546 Py_DECREF(unicode); 7547 return res; 7548} 7549 7550PyObject * 7551PyUnicode_EncodeCodePage(int code_page, 7552 PyObject *unicode, 7553 const char *errors) 7554{ 7555 return encode_code_page(code_page, unicode, errors); 7556} 7557 7558PyObject * 7559PyUnicode_AsMBCSString(PyObject *unicode) 7560{ 7561 if (!PyUnicode_Check(unicode)) { 7562 PyErr_BadArgument(); 7563 return NULL; 7564 } 7565 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7566} 7567 7568#undef NEED_RETRY 7569 7570#endif /* HAVE_MBCS */ 7571 7572/* --- Character Mapping Codec -------------------------------------------- */ 7573 
7574static int 7575charmap_decode_string(const char *s, 7576 Py_ssize_t size, 7577 PyObject *mapping, 7578 const char *errors, 7579 _PyUnicodeWriter *writer) 7580{ 7581 const char *starts = s; 7582 const char *e; 7583 Py_ssize_t startinpos, endinpos; 7584 PyObject *errorHandler = NULL, *exc = NULL; 7585 Py_ssize_t maplen; 7586 enum PyUnicode_Kind mapkind; 7587 void *mapdata; 7588 Py_UCS4 x; 7589 unsigned char ch; 7590 7591 if (PyUnicode_READY(mapping) == -1) 7592 return -1; 7593 7594 maplen = PyUnicode_GET_LENGTH(mapping); 7595 mapdata = PyUnicode_DATA(mapping); 7596 mapkind = PyUnicode_KIND(mapping); 7597 7598 e = s + size; 7599 7600 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { 7601 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 7602 * is disabled in encoding aliases, latin1 is preferred because 7603 * its implementation is faster. */ 7604 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata; 7605 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7606 Py_UCS4 maxchar = writer->maxchar; 7607 7608 assert (writer->kind == PyUnicode_1BYTE_KIND); 7609 while (s < e) { 7610 ch = *s; 7611 x = mapdata_ucs1[ch]; 7612 if (x > maxchar) { 7613 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) 7614 goto onError; 7615 maxchar = writer->maxchar; 7616 outdata = (Py_UCS1 *)writer->data; 7617 } 7618 outdata[writer->pos] = x; 7619 writer->pos++; 7620 ++s; 7621 } 7622 return 0; 7623 } 7624 7625 while (s < e) { 7626 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7627 enum PyUnicode_Kind outkind = writer->kind; 7628 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata; 7629 if (outkind == PyUnicode_1BYTE_KIND) { 7630 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7631 Py_UCS4 maxchar = writer->maxchar; 7632 while (s < e) { 7633 ch = *s; 7634 x = mapdata_ucs2[ch]; 7635 if (x > maxchar) 7636 goto Error; 7637 outdata[writer->pos] = x; 7638 writer->pos++; 7639 ++s; 7640 } 7641 break; 7642 } 7643 else if (outkind == PyUnicode_2BYTE_KIND) { 7644 Py_UCS2 *outdata = (Py_UCS2 
*)writer->data; 7645 while (s < e) { 7646 ch = *s; 7647 x = mapdata_ucs2[ch]; 7648 if (x == 0xFFFE) 7649 goto Error; 7650 outdata[writer->pos] = x; 7651 writer->pos++; 7652 ++s; 7653 } 7654 break; 7655 } 7656 } 7657 ch = *s; 7658 7659 if (ch < maplen) 7660 x = PyUnicode_READ(mapkind, mapdata, ch); 7661 else 7662 x = 0xfffe; /* invalid value */ 7663Error: 7664 if (x == 0xfffe) 7665 { 7666 /* undefined mapping */ 7667 startinpos = s-starts; 7668 endinpos = startinpos+1; 7669 if (unicode_decode_call_errorhandler_writer( 7670 errors, &errorHandler, 7671 "charmap", "character maps to <undefined>", 7672 &starts, &e, &startinpos, &endinpos, &exc, &s, 7673 writer)) { 7674 goto onError; 7675 } 7676 continue; 7677 } 7678 7679 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0) 7680 goto onError; 7681 ++s; 7682 } 7683 Py_XDECREF(errorHandler); 7684 Py_XDECREF(exc); 7685 return 0; 7686 7687onError: 7688 Py_XDECREF(errorHandler); 7689 Py_XDECREF(exc); 7690 return -1; 7691} 7692 7693static int 7694charmap_decode_mapping(const char *s, 7695 Py_ssize_t size, 7696 PyObject *mapping, 7697 const char *errors, 7698 _PyUnicodeWriter *writer) 7699{ 7700 const char *starts = s; 7701 const char *e; 7702 Py_ssize_t startinpos, endinpos; 7703 PyObject *errorHandler = NULL, *exc = NULL; 7704 unsigned char ch; 7705 PyObject *key, *item = NULL; 7706 7707 e = s + size; 7708 7709 while (s < e) { 7710 ch = *s; 7711 7712 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7713 key = PyLong_FromLong((long)ch); 7714 if (key == NULL) 7715 goto onError; 7716 7717 item = PyObject_GetItem(mapping, key); 7718 Py_DECREF(key); 7719 if (item == NULL) { 7720 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7721 /* No mapping found means: mapping is undefined. 
*/ 7722 PyErr_Clear(); 7723 goto Undefined; 7724 } else 7725 goto onError; 7726 } 7727 7728 /* Apply mapping */ 7729 if (item == Py_None) 7730 goto Undefined; 7731 if (PyLong_Check(item)) { 7732 long value = PyLong_AS_LONG(item); 7733 if (value == 0xFFFE) 7734 goto Undefined; 7735 if (value < 0 || value > MAX_UNICODE) { 7736 PyErr_Format(PyExc_TypeError, 7737 "character mapping must be in range(0x%lx)", 7738 (unsigned long)MAX_UNICODE + 1); 7739 goto onError; 7740 } 7741 7742 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7743 goto onError; 7744 } 7745 else if (PyUnicode_Check(item)) { 7746 if (PyUnicode_READY(item) == -1) 7747 goto onError; 7748 if (PyUnicode_GET_LENGTH(item) == 1) { 7749 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 7750 if (value == 0xFFFE) 7751 goto Undefined; 7752 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7753 goto onError; 7754 } 7755 else { 7756 writer->overallocate = 1; 7757 if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 7758 goto onError; 7759 } 7760 } 7761 else { 7762 /* wrong return value */ 7763 PyErr_SetString(PyExc_TypeError, 7764 "character mapping must return integer, None or str"); 7765 goto onError; 7766 } 7767 Py_CLEAR(item); 7768 ++s; 7769 continue; 7770 7771Undefined: 7772 /* undefined mapping */ 7773 Py_CLEAR(item); 7774 startinpos = s-starts; 7775 endinpos = startinpos+1; 7776 if (unicode_decode_call_errorhandler_writer( 7777 errors, &errorHandler, 7778 "charmap", "character maps to <undefined>", 7779 &starts, &e, &startinpos, &endinpos, &exc, &s, 7780 writer)) { 7781 goto onError; 7782 } 7783 } 7784 Py_XDECREF(errorHandler); 7785 Py_XDECREF(exc); 7786 return 0; 7787 7788onError: 7789 Py_XDECREF(item); 7790 Py_XDECREF(errorHandler); 7791 Py_XDECREF(exc); 7792 return -1; 7793} 7794 7795PyObject * 7796PyUnicode_DecodeCharmap(const char *s, 7797 Py_ssize_t size, 7798 PyObject *mapping, 7799 const char *errors) 7800{ 7801 _PyUnicodeWriter writer; 7802 7803 /* Default to Latin-1 */ 7804 if (mapping == 
NULL) 7805 return PyUnicode_DecodeLatin1(s, size, errors); 7806 7807 if (size == 0) 7808 _Py_RETURN_UNICODE_EMPTY(); 7809 _PyUnicodeWriter_Init(&writer); 7810 writer.min_length = size; 7811 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 7812 goto onError; 7813 7814 if (PyUnicode_CheckExact(mapping)) { 7815 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 7816 goto onError; 7817 } 7818 else { 7819 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 7820 goto onError; 7821 } 7822 return _PyUnicodeWriter_Finish(&writer); 7823 7824 onError: 7825 _PyUnicodeWriter_Dealloc(&writer); 7826 return NULL; 7827} 7828 7829/* Charmap encoding: the lookup table */ 7830 7831struct encoding_map { 7832 PyObject_HEAD 7833 unsigned char level1[32]; 7834 int count2, count3; 7835 unsigned char level23[1]; 7836}; 7837 7838static PyObject* 7839encoding_map_size(PyObject *obj, PyObject* args) 7840{ 7841 struct encoding_map *map = (struct encoding_map*)obj; 7842 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7843 128*map->count3); 7844} 7845 7846static PyMethodDef encoding_map_methods[] = { 7847 {"size", encoding_map_size, METH_NOARGS, 7848 PyDoc_STR("Return the size (in bytes) of this object") }, 7849 { 0 } 7850}; 7851 7852static void 7853encoding_map_dealloc(PyObject* o) 7854{ 7855 PyObject_FREE(o); 7856} 7857 7858static PyTypeObject EncodingMapType = { 7859 PyVarObject_HEAD_INIT(NULL, 0) 7860 "EncodingMap", /*tp_name*/ 7861 sizeof(struct encoding_map), /*tp_basicsize*/ 7862 0, /*tp_itemsize*/ 7863 /* methods */ 7864 encoding_map_dealloc, /*tp_dealloc*/ 7865 0, /*tp_print*/ 7866 0, /*tp_getattr*/ 7867 0, /*tp_setattr*/ 7868 0, /*tp_reserved*/ 7869 0, /*tp_repr*/ 7870 0, /*tp_as_number*/ 7871 0, /*tp_as_sequence*/ 7872 0, /*tp_as_mapping*/ 7873 0, /*tp_hash*/ 7874 0, /*tp_call*/ 7875 0, /*tp_str*/ 7876 0, /*tp_getattro*/ 7877 0, /*tp_setattro*/ 7878 0, /*tp_as_buffer*/ 7879 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7880 0, /*tp_doc*/ 
7881 0, /*tp_traverse*/ 7882 0, /*tp_clear*/ 7883 0, /*tp_richcompare*/ 7884 0, /*tp_weaklistoffset*/ 7885 0, /*tp_iter*/ 7886 0, /*tp_iternext*/ 7887 encoding_map_methods, /*tp_methods*/ 7888 0, /*tp_members*/ 7889 0, /*tp_getset*/ 7890 0, /*tp_base*/ 7891 0, /*tp_dict*/ 7892 0, /*tp_descr_get*/ 7893 0, /*tp_descr_set*/ 7894 0, /*tp_dictoffset*/ 7895 0, /*tp_init*/ 7896 0, /*tp_alloc*/ 7897 0, /*tp_new*/ 7898 0, /*tp_free*/ 7899 0, /*tp_is_gc*/ 7900}; 7901 7902PyObject* 7903PyUnicode_BuildEncodingMap(PyObject* string) 7904{ 7905 PyObject *result; 7906 struct encoding_map *mresult; 7907 int i; 7908 int need_dict = 0; 7909 unsigned char level1[32]; 7910 unsigned char level2[512]; 7911 unsigned char *mlevel1, *mlevel2, *mlevel3; 7912 int count2 = 0, count3 = 0; 7913 int kind; 7914 void *data; 7915 Py_ssize_t length; 7916 Py_UCS4 ch; 7917 7918 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 7919 PyErr_BadArgument(); 7920 return NULL; 7921 } 7922 kind = PyUnicode_KIND(string); 7923 data = PyUnicode_DATA(string); 7924 length = PyUnicode_GET_LENGTH(string); 7925 length = Py_MIN(length, 256); 7926 memset(level1, 0xFF, sizeof level1); 7927 memset(level2, 0xFF, sizeof level2); 7928 7929 /* If there isn't a one-to-one mapping of NULL to \0, 7930 or if there are non-BMP characters, we need to use 7931 a mapping dictionary. 
*/ 7932 if (PyUnicode_READ(kind, data, 0) != 0) 7933 need_dict = 1; 7934 for (i = 1; i < length; i++) { 7935 int l1, l2; 7936 ch = PyUnicode_READ(kind, data, i); 7937 if (ch == 0 || ch > 0xFFFF) { 7938 need_dict = 1; 7939 break; 7940 } 7941 if (ch == 0xFFFE) 7942 /* unmapped character */ 7943 continue; 7944 l1 = ch >> 11; 7945 l2 = ch >> 7; 7946 if (level1[l1] == 0xFF) 7947 level1[l1] = count2++; 7948 if (level2[l2] == 0xFF) 7949 level2[l2] = count3++; 7950 } 7951 7952 if (count2 >= 0xFF || count3 >= 0xFF) 7953 need_dict = 1; 7954 7955 if (need_dict) { 7956 PyObject *result = PyDict_New(); 7957 PyObject *key, *value; 7958 if (!result) 7959 return NULL; 7960 for (i = 0; i < length; i++) { 7961 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7962 value = PyLong_FromLong(i); 7963 if (!key || !value) 7964 goto failed1; 7965 if (PyDict_SetItem(result, key, value) == -1) 7966 goto failed1; 7967 Py_DECREF(key); 7968 Py_DECREF(value); 7969 } 7970 return result; 7971 failed1: 7972 Py_XDECREF(key); 7973 Py_XDECREF(value); 7974 Py_DECREF(result); 7975 return NULL; 7976 } 7977 7978 /* Create a three-level trie */ 7979 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7980 16*count2 + 128*count3 - 1); 7981 if (!result) 7982 return PyErr_NoMemory(); 7983 PyObject_Init(result, &EncodingMapType); 7984 mresult = (struct encoding_map*)result; 7985 mresult->count2 = count2; 7986 mresult->count3 = count3; 7987 mlevel1 = mresult->level1; 7988 mlevel2 = mresult->level23; 7989 mlevel3 = mresult->level23 + 16*count2; 7990 memcpy(mlevel1, level1, 32); 7991 memset(mlevel2, 0xFF, 16*count2); 7992 memset(mlevel3, 0, 128*count3); 7993 count3 = 0; 7994 for (i = 1; i < length; i++) { 7995 int o1, o2, o3, i2, i3; 7996 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7997 if (ch == 0xFFFE) 7998 /* unmapped character */ 7999 continue; 8000 o1 = ch>>11; 8001 o2 = (ch>>7) & 0xF; 8002 i2 = 16*mlevel1[o1] + o2; 8003 if (mlevel2[i2] == 0xFF) 8004 mlevel2[i2] = count3++; 8005 o3 = ch & 0x7F; 8006 
i3 = 128*mlevel2[i2] + o3; 8007 mlevel3[i3] = i; 8008 } 8009 return result; 8010} 8011 8012static int 8013encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 8014{ 8015 struct encoding_map *map = (struct encoding_map*)mapping; 8016 int l1 = c>>11; 8017 int l2 = (c>>7) & 0xF; 8018 int l3 = c & 0x7F; 8019 int i; 8020 8021 if (c > 0xFFFF) 8022 return -1; 8023 if (c == 0) 8024 return 0; 8025 /* level 1*/ 8026 i = map->level1[l1]; 8027 if (i == 0xFF) { 8028 return -1; 8029 } 8030 /* level 2*/ 8031 i = map->level23[16*i+l2]; 8032 if (i == 0xFF) { 8033 return -1; 8034 } 8035 /* level 3 */ 8036 i = map->level23[16*map->count2 + 128*i + l3]; 8037 if (i == 0) { 8038 return -1; 8039 } 8040 return i; 8041} 8042 8043/* Lookup the character ch in the mapping. If the character 8044 can't be found, Py_None is returned (or NULL, if another 8045 error occurred). */ 8046static PyObject * 8047charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 8048{ 8049 PyObject *w = PyLong_FromLong((long)c); 8050 PyObject *x; 8051 8052 if (w == NULL) 8053 return NULL; 8054 x = PyObject_GetItem(mapping, w); 8055 Py_DECREF(w); 8056 if (x == NULL) { 8057 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8058 /* No mapping found means: mapping is undefined. 
*/ 8059 PyErr_Clear(); 8060 x = Py_None; 8061 Py_INCREF(x); 8062 return x; 8063 } else 8064 return NULL; 8065 } 8066 else if (x == Py_None) 8067 return x; 8068 else if (PyLong_Check(x)) { 8069 long value = PyLong_AS_LONG(x); 8070 if (value < 0 || value > 255) { 8071 PyErr_SetString(PyExc_TypeError, 8072 "character mapping must be in range(256)"); 8073 Py_DECREF(x); 8074 return NULL; 8075 } 8076 return x; 8077 } 8078 else if (PyBytes_Check(x)) 8079 return x; 8080 else { 8081 /* wrong return value */ 8082 PyErr_Format(PyExc_TypeError, 8083 "character mapping must return integer, bytes or None, not %.400s", 8084 x->ob_type->tp_name); 8085 Py_DECREF(x); 8086 return NULL; 8087 } 8088} 8089 8090static int 8091charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8092{ 8093 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8094 /* exponentially overallocate to minimize reallocations */ 8095 if (requiredsize < 2*outsize) 8096 requiredsize = 2*outsize; 8097 if (_PyBytes_Resize(outobj, requiredsize)) 8098 return -1; 8099 return 0; 8100} 8101 8102typedef enum charmapencode_result { 8103 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8104} charmapencode_result; 8105/* lookup the character, put the result in the output string and adjust 8106 various state variables. Resize the output bytes object if not enough 8107 space is available. Return a new reference to the object that 8108 was put in the output buffer, or Py_None, if the mapping was undefined 8109 (in which case no character was written) or NULL, if a 8110 reallocation error occurred. 
The caller must decref the result */ 8111static charmapencode_result 8112charmapencode_output(Py_UCS4 c, PyObject *mapping, 8113 PyObject **outobj, Py_ssize_t *outpos) 8114{ 8115 PyObject *rep; 8116 char *outstart; 8117 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8118 8119 if (Py_TYPE(mapping) == &EncodingMapType) { 8120 int res = encoding_map_lookup(c, mapping); 8121 Py_ssize_t requiredsize = *outpos+1; 8122 if (res == -1) 8123 return enc_FAILED; 8124 if (outsize<requiredsize) 8125 if (charmapencode_resize(outobj, outpos, requiredsize)) 8126 return enc_EXCEPTION; 8127 outstart = PyBytes_AS_STRING(*outobj); 8128 outstart[(*outpos)++] = (char)res; 8129 return enc_SUCCESS; 8130 } 8131 8132 rep = charmapencode_lookup(c, mapping); 8133 if (rep==NULL) 8134 return enc_EXCEPTION; 8135 else if (rep==Py_None) { 8136 Py_DECREF(rep); 8137 return enc_FAILED; 8138 } else { 8139 if (PyLong_Check(rep)) { 8140 Py_ssize_t requiredsize = *outpos+1; 8141 if (outsize<requiredsize) 8142 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8143 Py_DECREF(rep); 8144 return enc_EXCEPTION; 8145 } 8146 outstart = PyBytes_AS_STRING(*outobj); 8147 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8148 } 8149 else { 8150 const char *repchars = PyBytes_AS_STRING(rep); 8151 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8152 Py_ssize_t requiredsize = *outpos+repsize; 8153 if (outsize<requiredsize) 8154 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8155 Py_DECREF(rep); 8156 return enc_EXCEPTION; 8157 } 8158 outstart = PyBytes_AS_STRING(*outobj); 8159 memcpy(outstart + *outpos, repchars, repsize); 8160 *outpos += repsize; 8161 } 8162 } 8163 Py_DECREF(rep); 8164 return enc_SUCCESS; 8165} 8166 8167/* handle an error in PyUnicode_EncodeCharmap 8168 Return 0 on success, -1 on error */ 8169static int 8170charmap_encoding_error( 8171 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8172 PyObject **exceptionObject, 8173 int *known_errorHandler, PyObject **errorHandler, const 
char *errors, 8174 PyObject **res, Py_ssize_t *respos) 8175{ 8176 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8177 Py_ssize_t size, repsize; 8178 Py_ssize_t newpos; 8179 enum PyUnicode_Kind kind; 8180 void *data; 8181 Py_ssize_t index; 8182 /* startpos for collecting unencodable chars */ 8183 Py_ssize_t collstartpos = *inpos; 8184 Py_ssize_t collendpos = *inpos+1; 8185 Py_ssize_t collpos; 8186 char *encoding = "charmap"; 8187 char *reason = "character maps to <undefined>"; 8188 charmapencode_result x; 8189 Py_UCS4 ch; 8190 int val; 8191 8192 if (PyUnicode_READY(unicode) == -1) 8193 return -1; 8194 size = PyUnicode_GET_LENGTH(unicode); 8195 /* find all unencodable characters */ 8196 while (collendpos < size) { 8197 PyObject *rep; 8198 if (Py_TYPE(mapping) == &EncodingMapType) { 8199 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8200 val = encoding_map_lookup(ch, mapping); 8201 if (val != -1) 8202 break; 8203 ++collendpos; 8204 continue; 8205 } 8206 8207 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8208 rep = charmapencode_lookup(ch, mapping); 8209 if (rep==NULL) 8210 return -1; 8211 else if (rep!=Py_None) { 8212 Py_DECREF(rep); 8213 break; 8214 } 8215 Py_DECREF(rep); 8216 ++collendpos; 8217 } 8218 /* cache callback name lookup 8219 * (if not done yet, i.e. 
it's the first error) */ 8220 if (*known_errorHandler==-1) { 8221 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8222 *known_errorHandler = 1; 8223 else if (!strcmp(errors, "replace")) 8224 *known_errorHandler = 2; 8225 else if (!strcmp(errors, "ignore")) 8226 *known_errorHandler = 3; 8227 else if (!strcmp(errors, "xmlcharrefreplace")) 8228 *known_errorHandler = 4; 8229 else 8230 *known_errorHandler = 0; 8231 } 8232 switch (*known_errorHandler) { 8233 case 1: /* strict */ 8234 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8235 return -1; 8236 case 2: /* replace */ 8237 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8238 x = charmapencode_output('?', mapping, res, respos); 8239 if (x==enc_EXCEPTION) { 8240 return -1; 8241 } 8242 else if (x==enc_FAILED) { 8243 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8244 return -1; 8245 } 8246 } 8247 /* fall through */ 8248 case 3: /* ignore */ 8249 *inpos = collendpos; 8250 break; 8251 case 4: /* xmlcharrefreplace */ 8252 /* generate replacement (temporarily (mis)uses p) */ 8253 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8254 char buffer[2+29+1+1]; 8255 char *cp; 8256 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8257 for (cp = buffer; *cp; ++cp) { 8258 x = charmapencode_output(*cp, mapping, res, respos); 8259 if (x==enc_EXCEPTION) 8260 return -1; 8261 else if (x==enc_FAILED) { 8262 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8263 return -1; 8264 } 8265 } 8266 } 8267 *inpos = collendpos; 8268 break; 8269 default: 8270 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 8271 encoding, reason, unicode, exceptionObject, 8272 collstartpos, collendpos, &newpos); 8273 if (repunicode == NULL) 8274 return -1; 8275 if (PyBytes_Check(repunicode)) { 8276 /* Directly copy bytes result to output. 
*/ 8277 Py_ssize_t outsize = PyBytes_Size(*res); 8278 Py_ssize_t requiredsize; 8279 repsize = PyBytes_Size(repunicode); 8280 requiredsize = *respos + repsize; 8281 if (requiredsize > outsize) 8282 /* Make room for all additional bytes. */ 8283 if (charmapencode_resize(res, respos, requiredsize)) { 8284 Py_DECREF(repunicode); 8285 return -1; 8286 } 8287 memcpy(PyBytes_AsString(*res) + *respos, 8288 PyBytes_AsString(repunicode), repsize); 8289 *respos += repsize; 8290 *inpos = newpos; 8291 Py_DECREF(repunicode); 8292 break; 8293 } 8294 /* generate replacement */ 8295 if (PyUnicode_READY(repunicode) == -1) { 8296 Py_DECREF(repunicode); 8297 return -1; 8298 } 8299 repsize = PyUnicode_GET_LENGTH(repunicode); 8300 data = PyUnicode_DATA(repunicode); 8301 kind = PyUnicode_KIND(repunicode); 8302 for (index = 0; index < repsize; index++) { 8303 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8304 x = charmapencode_output(repch, mapping, res, respos); 8305 if (x==enc_EXCEPTION) { 8306 Py_DECREF(repunicode); 8307 return -1; 8308 } 8309 else if (x==enc_FAILED) { 8310 Py_DECREF(repunicode); 8311 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8312 return -1; 8313 } 8314 } 8315 *inpos = newpos; 8316 Py_DECREF(repunicode); 8317 } 8318 return 0; 8319} 8320 8321PyObject * 8322_PyUnicode_EncodeCharmap(PyObject *unicode, 8323 PyObject *mapping, 8324 const char *errors) 8325{ 8326 /* output object */ 8327 PyObject *res = NULL; 8328 /* current input position */ 8329 Py_ssize_t inpos = 0; 8330 Py_ssize_t size; 8331 /* current output position */ 8332 Py_ssize_t respos = 0; 8333 PyObject *errorHandler = NULL; 8334 PyObject *exc = NULL; 8335 /* the following variable is used for caching string comparisons 8336 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8337 * 3=ignore, 4=xmlcharrefreplace */ 8338 int known_errorHandler = -1; 8339 void *data; 8340 int kind; 8341 8342 if (PyUnicode_READY(unicode) == -1) 8343 return NULL; 8344 size = 
PyUnicode_GET_LENGTH(unicode); 8345 data = PyUnicode_DATA(unicode); 8346 kind = PyUnicode_KIND(unicode); 8347 8348 /* Default to Latin-1 */ 8349 if (mapping == NULL) 8350 return unicode_encode_ucs1(unicode, errors, 256); 8351 8352 /* allocate enough for a simple encoding without 8353 replacements, if we need more, we'll resize */ 8354 res = PyBytes_FromStringAndSize(NULL, size); 8355 if (res == NULL) 8356 goto onError; 8357 if (size == 0) 8358 return res; 8359 8360 while (inpos<size) { 8361 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8362 /* try to encode it */ 8363 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8364 if (x==enc_EXCEPTION) /* error */ 8365 goto onError; 8366 if (x==enc_FAILED) { /* unencodable character */ 8367 if (charmap_encoding_error(unicode, &inpos, mapping, 8368 &exc, 8369 &known_errorHandler, &errorHandler, errors, 8370 &res, &respos)) { 8371 goto onError; 8372 } 8373 } 8374 else 8375 /* done with this character => adjust input position */ 8376 ++inpos; 8377 } 8378 8379 /* Resize if we allocated to much */ 8380 if (respos<PyBytes_GET_SIZE(res)) 8381 if (_PyBytes_Resize(&res, respos) < 0) 8382 goto onError; 8383 8384 Py_XDECREF(exc); 8385 Py_XDECREF(errorHandler); 8386 return res; 8387 8388 onError: 8389 Py_XDECREF(res); 8390 Py_XDECREF(exc); 8391 Py_XDECREF(errorHandler); 8392 return NULL; 8393} 8394 8395/* Deprecated */ 8396PyObject * 8397PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8398 Py_ssize_t size, 8399 PyObject *mapping, 8400 const char *errors) 8401{ 8402 PyObject *result; 8403 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8404 if (unicode == NULL) 8405 return NULL; 8406 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8407 Py_DECREF(unicode); 8408 return result; 8409} 8410 8411PyObject * 8412PyUnicode_AsCharmapString(PyObject *unicode, 8413 PyObject *mapping) 8414{ 8415 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8416 PyErr_BadArgument(); 8417 return NULL; 8418 } 8419 return 
_PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8420} 8421 8422/* create or adjust a UnicodeTranslateError */ 8423static void 8424make_translate_exception(PyObject **exceptionObject, 8425 PyObject *unicode, 8426 Py_ssize_t startpos, Py_ssize_t endpos, 8427 const char *reason) 8428{ 8429 if (*exceptionObject == NULL) { 8430 *exceptionObject = _PyUnicodeTranslateError_Create( 8431 unicode, startpos, endpos, reason); 8432 } 8433 else { 8434 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8435 goto onError; 8436 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8437 goto onError; 8438 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8439 goto onError; 8440 return; 8441 onError: 8442 Py_CLEAR(*exceptionObject); 8443 } 8444} 8445 8446/* error handling callback helper: 8447 build arguments, call the callback and check the arguments, 8448 put the result into newpos and return the replacement string, which 8449 has to be freed by the caller */ 8450static PyObject * 8451unicode_translate_call_errorhandler(const char *errors, 8452 PyObject **errorHandler, 8453 const char *reason, 8454 PyObject *unicode, PyObject **exceptionObject, 8455 Py_ssize_t startpos, Py_ssize_t endpos, 8456 Py_ssize_t *newpos) 8457{ 8458 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8459 8460 Py_ssize_t i_newpos; 8461 PyObject *restuple; 8462 PyObject *resunicode; 8463 8464 if (*errorHandler == NULL) { 8465 *errorHandler = PyCodec_LookupError(errors); 8466 if (*errorHandler == NULL) 8467 return NULL; 8468 } 8469 8470 make_translate_exception(exceptionObject, 8471 unicode, startpos, endpos, reason); 8472 if (*exceptionObject == NULL) 8473 return NULL; 8474 8475 restuple = PyObject_CallFunctionObjArgs( 8476 *errorHandler, *exceptionObject, NULL); 8477 if (restuple == NULL) 8478 return NULL; 8479 if (!PyTuple_Check(restuple)) { 8480 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8481 Py_DECREF(restuple); 8482 return 
NULL; 8483 } 8484 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8485 &resunicode, &i_newpos)) { 8486 Py_DECREF(restuple); 8487 return NULL; 8488 } 8489 if (i_newpos<0) 8490 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8491 else 8492 *newpos = i_newpos; 8493 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8494 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8495 Py_DECREF(restuple); 8496 return NULL; 8497 } 8498 Py_INCREF(resunicode); 8499 Py_DECREF(restuple); 8500 return resunicode; 8501} 8502 8503/* Lookup the character ch in the mapping and put the result in result, 8504 which must be decrefed by the caller. 8505 Return 0 on success, -1 on error */ 8506static int 8507charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8508{ 8509 PyObject *w = PyLong_FromLong((long)c); 8510 PyObject *x; 8511 8512 if (w == NULL) 8513 return -1; 8514 x = PyObject_GetItem(mapping, w); 8515 Py_DECREF(w); 8516 if (x == NULL) { 8517 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8518 /* No mapping found means: use 1:1 mapping. */ 8519 PyErr_Clear(); 8520 *result = NULL; 8521 return 0; 8522 } else 8523 return -1; 8524 } 8525 else if (x == Py_None) { 8526 *result = x; 8527 return 0; 8528 } 8529 else if (PyLong_Check(x)) { 8530 long value = PyLong_AS_LONG(x); 8531 long max = PyUnicode_GetMax(); 8532 if (value < 0 || value > max) { 8533 PyErr_Format(PyExc_TypeError, 8534 "character mapping must be in range(0x%x)", max+1); 8535 Py_DECREF(x); 8536 return -1; 8537 } 8538 *result = x; 8539 return 0; 8540 } 8541 else if (PyUnicode_Check(x)) { 8542 *result = x; 8543 return 0; 8544 } 8545 else { 8546 /* wrong return value */ 8547 PyErr_SetString(PyExc_TypeError, 8548 "character mapping must return integer, None or str"); 8549 Py_DECREF(x); 8550 return -1; 8551 } 8552} 8553/* ensure that *outobj is at least requiredsize characters long, 8554 if not reallocate and adjust various state variables. 
8555 Return 0 on success, -1 on error */ 8556static int 8557charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8558 Py_ssize_t requiredsize) 8559{ 8560 Py_ssize_t oldsize = *psize; 8561 Py_UCS4 *new_outobj; 8562 if (requiredsize > oldsize) { 8563 /* exponentially overallocate to minimize reallocations */ 8564 if (requiredsize < 2 * oldsize) 8565 requiredsize = 2 * oldsize; 8566 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8567 if (new_outobj == 0) 8568 return -1; 8569 *outobj = new_outobj; 8570 *psize = requiredsize; 8571 } 8572 return 0; 8573} 8574/* lookup the character, put the result in the output string and adjust 8575 various state variables. Return a new reference to the object that 8576 was put in the output buffer in *result, or Py_None, if the mapping was 8577 undefined (in which case no character was written). 8578 The called must decref result. 8579 Return 0 on success, -1 on error. */ 8580static int 8581charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8582 PyObject *mapping, Py_UCS4 **output, 8583 Py_ssize_t *osize, Py_ssize_t *opos, 8584 PyObject **res) 8585{ 8586 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8587 if (charmaptranslate_lookup(curinp, mapping, res)) 8588 return -1; 8589 if (*res==NULL) { 8590 /* not found => default to 1:1 mapping */ 8591 (*output)[(*opos)++] = curinp; 8592 } 8593 else if (*res==Py_None) 8594 ; 8595 else if (PyLong_Check(*res)) { 8596 /* no overflow check, because we know that the space is enough */ 8597 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8598 } 8599 else if (PyUnicode_Check(*res)) { 8600 Py_ssize_t repsize; 8601 if (PyUnicode_READY(*res) == -1) 8602 return -1; 8603 repsize = PyUnicode_GET_LENGTH(*res); 8604 if (repsize==1) { 8605 /* no overflow check, because we know that the space is enough */ 8606 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8607 } 8608 else if (repsize!=0) { 8609 /* more than one character */ 8610 Py_ssize_t requiredsize = 
*opos + 8611 (PyUnicode_GET_LENGTH(input) - ipos) + 8612 repsize - 1; 8613 Py_ssize_t i; 8614 if (charmaptranslate_makespace(output, osize, requiredsize)) 8615 return -1; 8616 for(i = 0; i < repsize; i++) 8617 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8618 } 8619 } 8620 else 8621 return -1; 8622 return 0; 8623} 8624 8625PyObject * 8626_PyUnicode_TranslateCharmap(PyObject *input, 8627 PyObject *mapping, 8628 const char *errors) 8629{ 8630 /* input object */ 8631 char *idata; 8632 Py_ssize_t size, i; 8633 int kind; 8634 /* output buffer */ 8635 Py_UCS4 *output = NULL; 8636 Py_ssize_t osize; 8637 PyObject *res; 8638 /* current output position */ 8639 Py_ssize_t opos; 8640 char *reason = "character maps to <undefined>"; 8641 PyObject *errorHandler = NULL; 8642 PyObject *exc = NULL; 8643 /* the following variable is used for caching string comparisons 8644 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8645 * 3=ignore, 4=xmlcharrefreplace */ 8646 int known_errorHandler = -1; 8647 8648 if (mapping == NULL) { 8649 PyErr_BadArgument(); 8650 return NULL; 8651 } 8652 8653 if (PyUnicode_READY(input) == -1) 8654 return NULL; 8655 idata = (char*)PyUnicode_DATA(input); 8656 kind = PyUnicode_KIND(input); 8657 size = PyUnicode_GET_LENGTH(input); 8658 i = 0; 8659 8660 if (size == 0) { 8661 Py_INCREF(input); 8662 return input; 8663 } 8664 8665 /* allocate enough for a simple 1:1 translation without 8666 replacements, if we need more, we'll resize */ 8667 osize = size; 8668 output = PyMem_NEW(Py_UCS4, osize); 8669 opos = 0; 8670 if (output == NULL) { 8671 PyErr_NoMemory(); 8672 goto onError; 8673 } 8674 8675 while (i<size) { 8676 /* try to encode it */ 8677 PyObject *x = NULL; 8678 if (charmaptranslate_output(input, i, mapping, 8679 &output, &osize, &opos, &x)) { 8680 Py_XDECREF(x); 8681 goto onError; 8682 } 8683 Py_XDECREF(x); 8684 if (x!=Py_None) /* it worked => adjust input pointer */ 8685 ++i; 8686 else { /* untranslatable character */ 8687 PyObject *repunicode 
= NULL; /* initialize to prevent gcc warning */ 8688 Py_ssize_t repsize; 8689 Py_ssize_t newpos; 8690 Py_ssize_t uni2; 8691 /* startpos for collecting untranslatable chars */ 8692 Py_ssize_t collstart = i; 8693 Py_ssize_t collend = i+1; 8694 Py_ssize_t coll; 8695 8696 /* find all untranslatable characters */ 8697 while (collend < size) { 8698 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8699 goto onError; 8700 Py_XDECREF(x); 8701 if (x!=Py_None) 8702 break; 8703 ++collend; 8704 } 8705 /* cache callback name lookup 8706 * (if not done yet, i.e. it's the first error) */ 8707 if (known_errorHandler==-1) { 8708 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8709 known_errorHandler = 1; 8710 else if (!strcmp(errors, "replace")) 8711 known_errorHandler = 2; 8712 else if (!strcmp(errors, "ignore")) 8713 known_errorHandler = 3; 8714 else if (!strcmp(errors, "xmlcharrefreplace")) 8715 known_errorHandler = 4; 8716 else 8717 known_errorHandler = 0; 8718 } 8719 switch (known_errorHandler) { 8720 case 1: /* strict */ 8721 make_translate_exception(&exc, 8722 input, collstart, collend, reason); 8723 if (exc != NULL) 8724 PyCodec_StrictErrors(exc); 8725 goto onError; 8726 case 2: /* replace */ 8727 /* No need to check for space, this is a 1:1 replacement */ 8728 for (coll = collstart; coll<collend; coll++) 8729 output[opos++] = '?'; 8730 /* fall through */ 8731 case 3: /* ignore */ 8732 i = collend; 8733 break; 8734 case 4: /* xmlcharrefreplace */ 8735 /* generate replacement (temporarily (mis)uses i) */ 8736 for (i = collstart; i < collend; ++i) { 8737 char buffer[2+29+1+1]; 8738 char *cp; 8739 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8740 if (charmaptranslate_makespace(&output, &osize, 8741 opos+strlen(buffer)+(size-collend))) 8742 goto onError; 8743 for (cp = buffer; *cp; ++cp) 8744 output[opos++] = *cp; 8745 } 8746 i = collend; 8747 break; 8748 default: 8749 repunicode = unicode_translate_call_errorhandler(errors, 
&errorHandler, 8750 reason, input, &exc, 8751 collstart, collend, &newpos); 8752 if (repunicode == NULL) 8753 goto onError; 8754 if (PyUnicode_READY(repunicode) == -1) { 8755 Py_DECREF(repunicode); 8756 goto onError; 8757 } 8758 /* generate replacement */ 8759 repsize = PyUnicode_GET_LENGTH(repunicode); 8760 if (charmaptranslate_makespace(&output, &osize, 8761 opos+repsize+(size-collend))) { 8762 Py_DECREF(repunicode); 8763 goto onError; 8764 } 8765 for (uni2 = 0; repsize-->0; ++uni2) 8766 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8767 i = newpos; 8768 Py_DECREF(repunicode); 8769 } 8770 } 8771 } 8772 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8773 if (!res) 8774 goto onError; 8775 PyMem_Free(output); 8776 Py_XDECREF(exc); 8777 Py_XDECREF(errorHandler); 8778 return res; 8779 8780 onError: 8781 PyMem_Free(output); 8782 Py_XDECREF(exc); 8783 Py_XDECREF(errorHandler); 8784 return NULL; 8785} 8786 8787/* Deprecated. Use PyUnicode_Translate instead. */ 8788PyObject * 8789PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8790 Py_ssize_t size, 8791 PyObject *mapping, 8792 const char *errors) 8793{ 8794 PyObject *result; 8795 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8796 if (!unicode) 8797 return NULL; 8798 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8799 Py_DECREF(unicode); 8800 return result; 8801} 8802 8803PyObject * 8804PyUnicode_Translate(PyObject *str, 8805 PyObject *mapping, 8806 const char *errors) 8807{ 8808 PyObject *result; 8809 8810 str = PyUnicode_FromObject(str); 8811 if (str == NULL) 8812 return NULL; 8813 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8814 Py_DECREF(str); 8815 return result; 8816} 8817 8818static Py_UCS4 8819fix_decimal_and_space_to_ascii(PyObject *self) 8820{ 8821 /* No need to call PyUnicode_READY(self) because this function is only 8822 called as a callback from fixup() which does it already. 
*/ 8823 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8824 const int kind = PyUnicode_KIND(self); 8825 void *data = PyUnicode_DATA(self); 8826 Py_UCS4 maxchar = 127, ch, fixed; 8827 int modified = 0; 8828 Py_ssize_t i; 8829 8830 for (i = 0; i < len; ++i) { 8831 ch = PyUnicode_READ(kind, data, i); 8832 fixed = 0; 8833 if (ch > 127) { 8834 if (Py_UNICODE_ISSPACE(ch)) 8835 fixed = ' '; 8836 else { 8837 const int decimal = Py_UNICODE_TODECIMAL(ch); 8838 if (decimal >= 0) 8839 fixed = '0' + decimal; 8840 } 8841 if (fixed != 0) { 8842 modified = 1; 8843 maxchar = Py_MAX(maxchar, fixed); 8844 PyUnicode_WRITE(kind, data, i, fixed); 8845 } 8846 else 8847 maxchar = Py_MAX(maxchar, ch); 8848 } 8849 } 8850 8851 return (modified) ? maxchar : 0; 8852} 8853 8854PyObject * 8855_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8856{ 8857 if (!PyUnicode_Check(unicode)) { 8858 PyErr_BadInternalCall(); 8859 return NULL; 8860 } 8861 if (PyUnicode_READY(unicode) == -1) 8862 return NULL; 8863 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8864 /* If the string is already ASCII, just return the same string */ 8865 Py_INCREF(unicode); 8866 return unicode; 8867 } 8868 return fixup(unicode, fix_decimal_and_space_to_ascii); 8869} 8870 8871PyObject * 8872PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8873 Py_ssize_t length) 8874{ 8875 PyObject *decimal; 8876 Py_ssize_t i; 8877 Py_UCS4 maxchar; 8878 enum PyUnicode_Kind kind; 8879 void *data; 8880 8881 maxchar = 127; 8882 for (i = 0; i < length; i++) { 8883 Py_UNICODE ch = s[i]; 8884 if (ch > 127) { 8885 int decimal = Py_UNICODE_TODECIMAL(ch); 8886 if (decimal >= 0) 8887 ch = '0' + decimal; 8888 maxchar = Py_MAX(maxchar, ch); 8889 } 8890 } 8891 8892 /* Copy to a new string */ 8893 decimal = PyUnicode_New(length, maxchar); 8894 if (decimal == NULL) 8895 return decimal; 8896 kind = PyUnicode_KIND(decimal); 8897 data = PyUnicode_DATA(decimal); 8898 /* Iterate over code points */ 8899 for (i = 0; i < length; i++) { 8900 
Py_UNICODE ch = s[i]; 8901 if (ch > 127) { 8902 int decimal = Py_UNICODE_TODECIMAL(ch); 8903 if (decimal >= 0) 8904 ch = '0' + decimal; 8905 } 8906 PyUnicode_WRITE(kind, data, i, ch); 8907 } 8908 return unicode_result(decimal); 8909} 8910/* --- Decimal Encoder ---------------------------------------------------- */ 8911 8912int 8913PyUnicode_EncodeDecimal(Py_UNICODE *s, 8914 Py_ssize_t length, 8915 char *output, 8916 const char *errors) 8917{ 8918 PyObject *unicode; 8919 Py_ssize_t i; 8920 enum PyUnicode_Kind kind; 8921 void *data; 8922 8923 if (output == NULL) { 8924 PyErr_BadArgument(); 8925 return -1; 8926 } 8927 8928 unicode = PyUnicode_FromUnicode(s, length); 8929 if (unicode == NULL) 8930 return -1; 8931 8932 if (PyUnicode_READY(unicode) == -1) { 8933 Py_DECREF(unicode); 8934 return -1; 8935 } 8936 kind = PyUnicode_KIND(unicode); 8937 data = PyUnicode_DATA(unicode); 8938 8939 for (i=0; i < length; ) { 8940 PyObject *exc; 8941 Py_UCS4 ch; 8942 int decimal; 8943 Py_ssize_t startpos; 8944 8945 ch = PyUnicode_READ(kind, data, i); 8946 8947 if (Py_UNICODE_ISSPACE(ch)) { 8948 *output++ = ' '; 8949 i++; 8950 continue; 8951 } 8952 decimal = Py_UNICODE_TODECIMAL(ch); 8953 if (decimal >= 0) { 8954 *output++ = '0' + decimal; 8955 i++; 8956 continue; 8957 } 8958 if (0 < ch && ch < 256) { 8959 *output++ = (char)ch; 8960 i++; 8961 continue; 8962 } 8963 8964 startpos = i; 8965 exc = NULL; 8966 raise_encode_exception(&exc, "decimal", unicode, 8967 startpos, startpos+1, 8968 "invalid decimal Unicode string"); 8969 Py_XDECREF(exc); 8970 Py_DECREF(unicode); 8971 return -1; 8972 } 8973 /* 0-terminate the output string */ 8974 *output++ = '\0'; 8975 Py_DECREF(unicode); 8976 return 0; 8977} 8978 8979/* --- Helpers ------------------------------------------------------------ */ 8980 8981static Py_ssize_t 8982any_find_slice(int direction, PyObject* s1, PyObject* s2, 8983 Py_ssize_t start, 8984 Py_ssize_t end) 8985{ 8986 int kind1, kind2, kind; 8987 void *buf1, *buf2; 8988 Py_ssize_t 
len1, len2, result; 8989 8990 kind1 = PyUnicode_KIND(s1); 8991 kind2 = PyUnicode_KIND(s2); 8992 kind = kind1 > kind2 ? kind1 : kind2; 8993 buf1 = PyUnicode_DATA(s1); 8994 buf2 = PyUnicode_DATA(s2); 8995 if (kind1 != kind) 8996 buf1 = _PyUnicode_AsKind(s1, kind); 8997 if (!buf1) 8998 return -2; 8999 if (kind2 != kind) 9000 buf2 = _PyUnicode_AsKind(s2, kind); 9001 if (!buf2) { 9002 if (kind1 != kind) PyMem_Free(buf1); 9003 return -2; 9004 } 9005 len1 = PyUnicode_GET_LENGTH(s1); 9006 len2 = PyUnicode_GET_LENGTH(s2); 9007 9008 if (direction > 0) { 9009 switch (kind) { 9010 case PyUnicode_1BYTE_KIND: 9011 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9012 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 9013 else 9014 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 9015 break; 9016 case PyUnicode_2BYTE_KIND: 9017 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 9018 break; 9019 case PyUnicode_4BYTE_KIND: 9020 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 9021 break; 9022 default: 9023 assert(0); result = -2; 9024 } 9025 } 9026 else { 9027 switch (kind) { 9028 case PyUnicode_1BYTE_KIND: 9029 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9030 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 9031 else 9032 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9033 break; 9034 case PyUnicode_2BYTE_KIND: 9035 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9036 break; 9037 case PyUnicode_4BYTE_KIND: 9038 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9039 break; 9040 default: 9041 assert(0); result = -2; 9042 } 9043 } 9044 9045 if (kind1 != kind) 9046 PyMem_Free(buf1); 9047 if (kind2 != kind) 9048 PyMem_Free(buf2); 9049 9050 return result; 9051} 9052 9053Py_ssize_t 9054_PyUnicode_InsertThousandsGrouping( 9055 PyObject *unicode, Py_ssize_t index, 9056 Py_ssize_t n_buffer, 9057 void *digits, Py_ssize_t n_digits, 9058 Py_ssize_t 
min_width, 9059 const char *grouping, PyObject *thousands_sep, 9060 Py_UCS4 *maxchar) 9061{ 9062 unsigned int kind, thousands_sep_kind; 9063 char *data, *thousands_sep_data; 9064 Py_ssize_t thousands_sep_len; 9065 Py_ssize_t len; 9066 9067 if (unicode != NULL) { 9068 kind = PyUnicode_KIND(unicode); 9069 data = (char *) PyUnicode_DATA(unicode) + index * kind; 9070 } 9071 else { 9072 kind = PyUnicode_1BYTE_KIND; 9073 data = NULL; 9074 } 9075 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 9076 thousands_sep_data = PyUnicode_DATA(thousands_sep); 9077 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 9078 if (unicode != NULL && thousands_sep_kind != kind) { 9079 if (thousands_sep_kind < kind) { 9080 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 9081 if (!thousands_sep_data) 9082 return -1; 9083 } 9084 else { 9085 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 9086 if (!data) 9087 return -1; 9088 } 9089 } 9090 9091 switch (kind) { 9092 case PyUnicode_1BYTE_KIND: 9093 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 9094 len = asciilib_InsertThousandsGrouping( 9095 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 9096 min_width, grouping, 9097 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9098 else 9099 len = ucs1lib_InsertThousandsGrouping( 9100 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 9101 min_width, grouping, 9102 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9103 break; 9104 case PyUnicode_2BYTE_KIND: 9105 len = ucs2lib_InsertThousandsGrouping( 9106 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 9107 min_width, grouping, 9108 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 9109 break; 9110 case PyUnicode_4BYTE_KIND: 9111 len = ucs4lib_InsertThousandsGrouping( 9112 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 9113 min_width, grouping, 9114 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 9115 break; 9116 default: 9117 assert(0); 9118 return -1; 9119 } 9120 if (unicode 
!= NULL && thousands_sep_kind != kind) { 9121 if (thousands_sep_kind < kind) 9122 PyMem_Free(thousands_sep_data); 9123 else 9124 PyMem_Free(data); 9125 } 9126 if (unicode == NULL) { 9127 *maxchar = 127; 9128 if (len != n_digits) { 9129 *maxchar = Py_MAX(*maxchar, 9130 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 9131 } 9132 } 9133 return len; 9134} 9135 9136 9137/* helper macro to fixup start/end slice values */ 9138#define ADJUST_INDICES(start, end, len) \ 9139 if (end > len) \ 9140 end = len; \ 9141 else if (end < 0) { \ 9142 end += len; \ 9143 if (end < 0) \ 9144 end = 0; \ 9145 } \ 9146 if (start < 0) { \ 9147 start += len; \ 9148 if (start < 0) \ 9149 start = 0; \ 9150 } 9151 9152Py_ssize_t 9153PyUnicode_Count(PyObject *str, 9154 PyObject *substr, 9155 Py_ssize_t start, 9156 Py_ssize_t end) 9157{ 9158 Py_ssize_t result; 9159 PyObject* str_obj; 9160 PyObject* sub_obj; 9161 int kind1, kind2, kind; 9162 void *buf1 = NULL, *buf2 = NULL; 9163 Py_ssize_t len1, len2; 9164 9165 str_obj = PyUnicode_FromObject(str); 9166 if (!str_obj) 9167 return -1; 9168 sub_obj = PyUnicode_FromObject(substr); 9169 if (!sub_obj) { 9170 Py_DECREF(str_obj); 9171 return -1; 9172 } 9173 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 9174 Py_DECREF(sub_obj); 9175 Py_DECREF(str_obj); 9176 return -1; 9177 } 9178 9179 kind1 = PyUnicode_KIND(str_obj); 9180 kind2 = PyUnicode_KIND(sub_obj); 9181 kind = kind1; 9182 buf1 = PyUnicode_DATA(str_obj); 9183 buf2 = PyUnicode_DATA(sub_obj); 9184 if (kind2 != kind) { 9185 if (kind2 > kind) { 9186 Py_DECREF(sub_obj); 9187 Py_DECREF(str_obj); 9188 return 0; 9189 } 9190 buf2 = _PyUnicode_AsKind(sub_obj, kind); 9191 } 9192 if (!buf2) 9193 goto onError; 9194 len1 = PyUnicode_GET_LENGTH(str_obj); 9195 len2 = PyUnicode_GET_LENGTH(sub_obj); 9196 9197 ADJUST_INDICES(start, end, len1); 9198 switch (kind) { 9199 case PyUnicode_1BYTE_KIND: 9200 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 9201 result = asciilib_count( 9202 
((Py_UCS1*)buf1) + start, end - start, 9203 buf2, len2, PY_SSIZE_T_MAX 9204 ); 9205 else 9206 result = ucs1lib_count( 9207 ((Py_UCS1*)buf1) + start, end - start, 9208 buf2, len2, PY_SSIZE_T_MAX 9209 ); 9210 break; 9211 case PyUnicode_2BYTE_KIND: 9212 result = ucs2lib_count( 9213 ((Py_UCS2*)buf1) + start, end - start, 9214 buf2, len2, PY_SSIZE_T_MAX 9215 ); 9216 break; 9217 case PyUnicode_4BYTE_KIND: 9218 result = ucs4lib_count( 9219 ((Py_UCS4*)buf1) + start, end - start, 9220 buf2, len2, PY_SSIZE_T_MAX 9221 ); 9222 break; 9223 default: 9224 assert(0); result = 0; 9225 } 9226 9227 Py_DECREF(sub_obj); 9228 Py_DECREF(str_obj); 9229 9230 if (kind2 != kind) 9231 PyMem_Free(buf2); 9232 9233 return result; 9234 onError: 9235 Py_DECREF(sub_obj); 9236 Py_DECREF(str_obj); 9237 if (kind2 != kind && buf2) 9238 PyMem_Free(buf2); 9239 return -1; 9240} 9241 9242Py_ssize_t 9243PyUnicode_Find(PyObject *str, 9244 PyObject *sub, 9245 Py_ssize_t start, 9246 Py_ssize_t end, 9247 int direction) 9248{ 9249 Py_ssize_t result; 9250 9251 str = PyUnicode_FromObject(str); 9252 if (!str) 9253 return -2; 9254 sub = PyUnicode_FromObject(sub); 9255 if (!sub) { 9256 Py_DECREF(str); 9257 return -2; 9258 } 9259 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 9260 Py_DECREF(sub); 9261 Py_DECREF(str); 9262 return -2; 9263 } 9264 9265 result = any_find_slice(direction, 9266 str, sub, start, end 9267 ); 9268 9269 Py_DECREF(str); 9270 Py_DECREF(sub); 9271 9272 return result; 9273} 9274 9275Py_ssize_t 9276PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9277 Py_ssize_t start, Py_ssize_t end, 9278 int direction) 9279{ 9280 int kind; 9281 Py_ssize_t result; 9282 if (PyUnicode_READY(str) == -1) 9283 return -2; 9284 if (start < 0 || end < 0) { 9285 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9286 return -2; 9287 } 9288 if (end > PyUnicode_GET_LENGTH(str)) 9289 end = PyUnicode_GET_LENGTH(str); 9290 kind = PyUnicode_KIND(str); 9291 result = findchar(PyUnicode_1BYTE_DATA(str) + 
kind*start, 9292 kind, end-start, ch, direction); 9293 if (result == -1) 9294 return -1; 9295 else 9296 return start + result; 9297} 9298 9299static int 9300tailmatch(PyObject *self, 9301 PyObject *substring, 9302 Py_ssize_t start, 9303 Py_ssize_t end, 9304 int direction) 9305{ 9306 int kind_self; 9307 int kind_sub; 9308 void *data_self; 9309 void *data_sub; 9310 Py_ssize_t offset; 9311 Py_ssize_t i; 9312 Py_ssize_t end_sub; 9313 9314 if (PyUnicode_READY(self) == -1 || 9315 PyUnicode_READY(substring) == -1) 9316 return -1; 9317 9318 if (PyUnicode_GET_LENGTH(substring) == 0) 9319 return 1; 9320 9321 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9322 end -= PyUnicode_GET_LENGTH(substring); 9323 if (end < start) 9324 return 0; 9325 9326 kind_self = PyUnicode_KIND(self); 9327 data_self = PyUnicode_DATA(self); 9328 kind_sub = PyUnicode_KIND(substring); 9329 data_sub = PyUnicode_DATA(substring); 9330 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9331 9332 if (direction > 0) 9333 offset = end; 9334 else 9335 offset = start; 9336 9337 if (PyUnicode_READ(kind_self, data_self, offset) == 9338 PyUnicode_READ(kind_sub, data_sub, 0) && 9339 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9340 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9341 /* If both are of the same kind, memcmp is sufficient */ 9342 if (kind_self == kind_sub) { 9343 return ! memcmp((char *)data_self + 9344 (offset * PyUnicode_KIND(substring)), 9345 data_sub, 9346 PyUnicode_GET_LENGTH(substring) * 9347 PyUnicode_KIND(substring)); 9348 } 9349 /* otherwise we have to compare each character by first accesing it */ 9350 else { 9351 /* We do not need to compare 0 and len(substring)-1 because 9352 the if statement above ensured already that they are equal 9353 when we end up here. 
*/ 9354 for (i = 1; i < end_sub; ++i) { 9355 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9356 PyUnicode_READ(kind_sub, data_sub, i)) 9357 return 0; 9358 } 9359 return 1; 9360 } 9361 } 9362 9363 return 0; 9364} 9365 9366Py_ssize_t 9367PyUnicode_Tailmatch(PyObject *str, 9368 PyObject *substr, 9369 Py_ssize_t start, 9370 Py_ssize_t end, 9371 int direction) 9372{ 9373 Py_ssize_t result; 9374 9375 str = PyUnicode_FromObject(str); 9376 if (str == NULL) 9377 return -1; 9378 substr = PyUnicode_FromObject(substr); 9379 if (substr == NULL) { 9380 Py_DECREF(str); 9381 return -1; 9382 } 9383 9384 result = tailmatch(str, substr, 9385 start, end, direction); 9386 Py_DECREF(str); 9387 Py_DECREF(substr); 9388 return result; 9389} 9390 9391/* Apply fixfct filter to the Unicode object self and return a 9392 reference to the modified object */ 9393 9394static PyObject * 9395fixup(PyObject *self, 9396 Py_UCS4 (*fixfct)(PyObject *s)) 9397{ 9398 PyObject *u; 9399 Py_UCS4 maxchar_old, maxchar_new = 0; 9400 PyObject *v; 9401 9402 u = _PyUnicode_Copy(self); 9403 if (u == NULL) 9404 return NULL; 9405 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9406 9407 /* fix functions return the new maximum character in a string, 9408 if the kind of the resulting unicode object does not change, 9409 everything is fine. Otherwise we need to change the string kind 9410 and re-run the fix function. */ 9411 maxchar_new = fixfct(u); 9412 9413 if (maxchar_new == 0) { 9414 /* no changes */; 9415 if (PyUnicode_CheckExact(self)) { 9416 Py_DECREF(u); 9417 Py_INCREF(self); 9418 return self; 9419 } 9420 else 9421 return u; 9422 } 9423 9424 maxchar_new = align_maxchar(maxchar_new); 9425 9426 if (maxchar_new == maxchar_old) 9427 return u; 9428 9429 /* In case the maximum character changed, we need to 9430 convert the string to the new category. 
*/ 9431 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9432 if (v == NULL) { 9433 Py_DECREF(u); 9434 return NULL; 9435 } 9436 if (maxchar_new > maxchar_old) { 9437 /* If the maxchar increased so that the kind changed, not all 9438 characters are representable anymore and we need to fix the 9439 string again. This only happens in very few cases. */ 9440 _PyUnicode_FastCopyCharacters(v, 0, 9441 self, 0, PyUnicode_GET_LENGTH(self)); 9442 maxchar_old = fixfct(v); 9443 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9444 } 9445 else { 9446 _PyUnicode_FastCopyCharacters(v, 0, 9447 u, 0, PyUnicode_GET_LENGTH(self)); 9448 } 9449 Py_DECREF(u); 9450 assert(_PyUnicode_CheckConsistency(v, 1)); 9451 return v; 9452} 9453 9454static PyObject * 9455ascii_upper_or_lower(PyObject *self, int lower) 9456{ 9457 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9458 char *resdata, *data = PyUnicode_DATA(self); 9459 PyObject *res; 9460 9461 res = PyUnicode_New(len, 127); 9462 if (res == NULL) 9463 return NULL; 9464 resdata = PyUnicode_DATA(res); 9465 if (lower) 9466 _Py_bytes_lower(resdata, data, len); 9467 else 9468 _Py_bytes_upper(resdata, data, len); 9469 return res; 9470} 9471 9472static Py_UCS4 9473handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9474{ 9475 Py_ssize_t j; 9476 int final_sigma; 9477 Py_UCS4 c = 0; 9478 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9479 9480 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9481 9482 where ! is a negation and \p{xxx} is a character with property xxx. 
9483 */ 9484 for (j = i - 1; j >= 0; j--) { 9485 c = PyUnicode_READ(kind, data, j); 9486 if (!_PyUnicode_IsCaseIgnorable(c)) 9487 break; 9488 } 9489 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9490 if (final_sigma) { 9491 for (j = i + 1; j < length; j++) { 9492 c = PyUnicode_READ(kind, data, j); 9493 if (!_PyUnicode_IsCaseIgnorable(c)) 9494 break; 9495 } 9496 final_sigma = j == length || !_PyUnicode_IsCased(c); 9497 } 9498 return (final_sigma) ? 0x3C2 : 0x3C3; 9499} 9500 9501static int 9502lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9503 Py_UCS4 c, Py_UCS4 *mapped) 9504{ 9505 /* Obscure special case. */ 9506 if (c == 0x3A3) { 9507 mapped[0] = handle_capital_sigma(kind, data, length, i); 9508 return 1; 9509 } 9510 return _PyUnicode_ToLowerFull(c, mapped); 9511} 9512 9513static Py_ssize_t 9514do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9515{ 9516 Py_ssize_t i, k = 0; 9517 int n_res, j; 9518 Py_UCS4 c, mapped[3]; 9519 9520 c = PyUnicode_READ(kind, data, 0); 9521 n_res = _PyUnicode_ToUpperFull(c, mapped); 9522 for (j = 0; j < n_res; j++) { 9523 *maxchar = Py_MAX(*maxchar, mapped[j]); 9524 res[k++] = mapped[j]; 9525 } 9526 for (i = 1; i < length; i++) { 9527 c = PyUnicode_READ(kind, data, i); 9528 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9529 for (j = 0; j < n_res; j++) { 9530 *maxchar = Py_MAX(*maxchar, mapped[j]); 9531 res[k++] = mapped[j]; 9532 } 9533 } 9534 return k; 9535} 9536 9537static Py_ssize_t 9538do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9539 Py_ssize_t i, k = 0; 9540 9541 for (i = 0; i < length; i++) { 9542 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9543 int n_res, j; 9544 if (Py_UNICODE_ISUPPER(c)) { 9545 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9546 } 9547 else if (Py_UNICODE_ISLOWER(c)) { 9548 n_res = _PyUnicode_ToUpperFull(c, mapped); 9549 } 9550 else { 9551 n_res = 1; 9552 mapped[0] = c; 9553 } 9554 for (j 
= 0; j < n_res; j++) { 9555 *maxchar = Py_MAX(*maxchar, mapped[j]); 9556 res[k++] = mapped[j]; 9557 } 9558 } 9559 return k; 9560} 9561 9562static Py_ssize_t 9563do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9564 Py_UCS4 *maxchar, int lower) 9565{ 9566 Py_ssize_t i, k = 0; 9567 9568 for (i = 0; i < length; i++) { 9569 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9570 int n_res, j; 9571 if (lower) 9572 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9573 else 9574 n_res = _PyUnicode_ToUpperFull(c, mapped); 9575 for (j = 0; j < n_res; j++) { 9576 *maxchar = Py_MAX(*maxchar, mapped[j]); 9577 res[k++] = mapped[j]; 9578 } 9579 } 9580 return k; 9581} 9582 9583static Py_ssize_t 9584do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9585{ 9586 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9587} 9588 9589static Py_ssize_t 9590do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9591{ 9592 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9593} 9594 9595static Py_ssize_t 9596do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9597{ 9598 Py_ssize_t i, k = 0; 9599 9600 for (i = 0; i < length; i++) { 9601 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9602 Py_UCS4 mapped[3]; 9603 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9604 for (j = 0; j < n_res; j++) { 9605 *maxchar = Py_MAX(*maxchar, mapped[j]); 9606 res[k++] = mapped[j]; 9607 } 9608 } 9609 return k; 9610} 9611 9612static Py_ssize_t 9613do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9614{ 9615 Py_ssize_t i, k = 0; 9616 int previous_is_cased; 9617 9618 previous_is_cased = 0; 9619 for (i = 0; i < length; i++) { 9620 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9621 Py_UCS4 mapped[3]; 9622 int n_res, j; 9623 9624 if (previous_is_cased) 9625 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9626 else 9627 n_res = 
_PyUnicode_ToTitleFull(c, mapped); 9628 9629 for (j = 0; j < n_res; j++) { 9630 *maxchar = Py_MAX(*maxchar, mapped[j]); 9631 res[k++] = mapped[j]; 9632 } 9633 9634 previous_is_cased = _PyUnicode_IsCased(c); 9635 } 9636 return k; 9637} 9638 9639static PyObject * 9640case_operation(PyObject *self, 9641 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9642{ 9643 PyObject *res = NULL; 9644 Py_ssize_t length, newlength = 0; 9645 int kind, outkind; 9646 void *data, *outdata; 9647 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9648 9649 assert(PyUnicode_IS_READY(self)); 9650 9651 kind = PyUnicode_KIND(self); 9652 data = PyUnicode_DATA(self); 9653 length = PyUnicode_GET_LENGTH(self); 9654 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) { 9655 PyErr_SetString(PyExc_OverflowError, "string is too long"); 9656 return NULL; 9657 } 9658 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9659 if (tmp == NULL) 9660 return PyErr_NoMemory(); 9661 newlength = perform(kind, data, length, tmp, &maxchar); 9662 res = PyUnicode_New(newlength, maxchar); 9663 if (res == NULL) 9664 goto leave; 9665 tmpend = tmp + newlength; 9666 outdata = PyUnicode_DATA(res); 9667 outkind = PyUnicode_KIND(res); 9668 switch (outkind) { 9669 case PyUnicode_1BYTE_KIND: 9670 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9671 break; 9672 case PyUnicode_2BYTE_KIND: 9673 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9674 break; 9675 case PyUnicode_4BYTE_KIND: 9676 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9677 break; 9678 default: 9679 assert(0); 9680 break; 9681 } 9682 leave: 9683 PyMem_FREE(tmp); 9684 return res; 9685} 9686 9687PyObject * 9688PyUnicode_Join(PyObject *separator, PyObject *seq) 9689{ 9690 PyObject *sep = NULL; 9691 Py_ssize_t seplen; 9692 PyObject *res = NULL; /* the result */ 9693 PyObject *fseq; /* PySequence_Fast(seq) */ 9694 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9695 PyObject **items; 9696 
PyObject *item; 9697 Py_ssize_t sz, i, res_offset; 9698 Py_UCS4 maxchar; 9699 Py_UCS4 item_maxchar; 9700 int use_memcpy; 9701 unsigned char *res_data = NULL, *sep_data = NULL; 9702 PyObject *last_obj; 9703 unsigned int kind = 0; 9704 9705 fseq = PySequence_Fast(seq, "can only join an iterable"); 9706 if (fseq == NULL) { 9707 return NULL; 9708 } 9709 9710 /* NOTE: the following code can't call back into Python code, 9711 * so we are sure that fseq won't be mutated. 9712 */ 9713 9714 seqlen = PySequence_Fast_GET_SIZE(fseq); 9715 /* If empty sequence, return u"". */ 9716 if (seqlen == 0) { 9717 Py_DECREF(fseq); 9718 _Py_RETURN_UNICODE_EMPTY(); 9719 } 9720 9721 /* If singleton sequence with an exact Unicode, return that. */ 9722 last_obj = NULL; 9723 items = PySequence_Fast_ITEMS(fseq); 9724 if (seqlen == 1) { 9725 if (PyUnicode_CheckExact(items[0])) { 9726 res = items[0]; 9727 Py_INCREF(res); 9728 Py_DECREF(fseq); 9729 return res; 9730 } 9731 seplen = 0; 9732 maxchar = 0; 9733 } 9734 else { 9735 /* Set up sep and seplen */ 9736 if (separator == NULL) { 9737 /* fall back to a blank space separator */ 9738 sep = PyUnicode_FromOrdinal(' '); 9739 if (!sep) 9740 goto onError; 9741 seplen = 1; 9742 maxchar = 32; 9743 } 9744 else { 9745 if (!PyUnicode_Check(separator)) { 9746 PyErr_Format(PyExc_TypeError, 9747 "separator: expected str instance," 9748 " %.80s found", 9749 Py_TYPE(separator)->tp_name); 9750 goto onError; 9751 } 9752 if (PyUnicode_READY(separator)) 9753 goto onError; 9754 sep = separator; 9755 seplen = PyUnicode_GET_LENGTH(separator); 9756 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9757 /* inc refcount to keep this code path symmetric with the 9758 above case of a blank separator */ 9759 Py_INCREF(sep); 9760 } 9761 last_obj = sep; 9762 } 9763 9764 /* There are at least two things to join, or else we have a subclass 9765 * of str in the sequence. 
9766 * Do a pre-pass to figure out the total amount of space we'll 9767 * need (sz), and see whether all argument are strings. 9768 */ 9769 sz = 0; 9770#ifdef Py_DEBUG 9771 use_memcpy = 0; 9772#else 9773 use_memcpy = 1; 9774#endif 9775 for (i = 0; i < seqlen; i++) { 9776 const Py_ssize_t old_sz = sz; 9777 item = items[i]; 9778 if (!PyUnicode_Check(item)) { 9779 PyErr_Format(PyExc_TypeError, 9780 "sequence item %zd: expected str instance," 9781 " %.80s found", 9782 i, Py_TYPE(item)->tp_name); 9783 goto onError; 9784 } 9785 if (PyUnicode_READY(item) == -1) 9786 goto onError; 9787 sz += PyUnicode_GET_LENGTH(item); 9788 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9789 maxchar = Py_MAX(maxchar, item_maxchar); 9790 if (i != 0) 9791 sz += seplen; 9792 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9793 PyErr_SetString(PyExc_OverflowError, 9794 "join() result is too long for a Python string"); 9795 goto onError; 9796 } 9797 if (use_memcpy && last_obj != NULL) { 9798 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9799 use_memcpy = 0; 9800 } 9801 last_obj = item; 9802 } 9803 9804 res = PyUnicode_New(sz, maxchar); 9805 if (res == NULL) 9806 goto onError; 9807 9808 /* Catenate everything. */ 9809#ifdef Py_DEBUG 9810 use_memcpy = 0; 9811#else 9812 if (use_memcpy) { 9813 res_data = PyUnicode_1BYTE_DATA(res); 9814 kind = PyUnicode_KIND(res); 9815 if (seplen != 0) 9816 sep_data = PyUnicode_1BYTE_DATA(sep); 9817 } 9818#endif 9819 if (use_memcpy) { 9820 for (i = 0; i < seqlen; ++i) { 9821 Py_ssize_t itemlen; 9822 item = items[i]; 9823 9824 /* Copy item, and maybe the separator. 
*/ 9825 if (i && seplen != 0) { 9826 Py_MEMCPY(res_data, 9827 sep_data, 9828 kind * seplen); 9829 res_data += kind * seplen; 9830 } 9831 9832 itemlen = PyUnicode_GET_LENGTH(item); 9833 if (itemlen != 0) { 9834 Py_MEMCPY(res_data, 9835 PyUnicode_DATA(item), 9836 kind * itemlen); 9837 res_data += kind * itemlen; 9838 } 9839 } 9840 assert(res_data == PyUnicode_1BYTE_DATA(res) 9841 + kind * PyUnicode_GET_LENGTH(res)); 9842 } 9843 else { 9844 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9845 Py_ssize_t itemlen; 9846 item = items[i]; 9847 9848 /* Copy item, and maybe the separator. */ 9849 if (i && seplen != 0) { 9850 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9851 res_offset += seplen; 9852 } 9853 9854 itemlen = PyUnicode_GET_LENGTH(item); 9855 if (itemlen != 0) { 9856 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9857 res_offset += itemlen; 9858 } 9859 } 9860 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9861 } 9862 9863 Py_DECREF(fseq); 9864 Py_XDECREF(sep); 9865 assert(_PyUnicode_CheckConsistency(res, 1)); 9866 return res; 9867 9868 onError: 9869 Py_DECREF(fseq); 9870 Py_XDECREF(sep); 9871 Py_XDECREF(res); 9872 return NULL; 9873} 9874 9875#define FILL(kind, data, value, start, length) \ 9876 do { \ 9877 Py_ssize_t i_ = 0; \ 9878 assert(kind != PyUnicode_WCHAR_KIND); \ 9879 switch ((kind)) { \ 9880 case PyUnicode_1BYTE_KIND: { \ 9881 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9882 memset(to_, (unsigned char)value, (length)); \ 9883 break; \ 9884 } \ 9885 case PyUnicode_2BYTE_KIND: { \ 9886 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9887 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9888 break; \ 9889 } \ 9890 case PyUnicode_4BYTE_KIND: { \ 9891 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9892 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9893 break; \ 9894 } \ 9895 default: assert(0); \ 9896 } \ 9897 } while (0) 9898 9899void 9900_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t 
start, Py_ssize_t length, 9901 Py_UCS4 fill_char) 9902{ 9903 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9904 const void *data = PyUnicode_DATA(unicode); 9905 assert(PyUnicode_IS_READY(unicode)); 9906 assert(unicode_modifiable(unicode)); 9907 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9908 assert(start >= 0); 9909 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9910 FILL(kind, data, fill_char, start, length); 9911} 9912 9913Py_ssize_t 9914PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9915 Py_UCS4 fill_char) 9916{ 9917 Py_ssize_t maxlen; 9918 9919 if (!PyUnicode_Check(unicode)) { 9920 PyErr_BadInternalCall(); 9921 return -1; 9922 } 9923 if (PyUnicode_READY(unicode) == -1) 9924 return -1; 9925 if (unicode_check_modifiable(unicode)) 9926 return -1; 9927 9928 if (start < 0) { 9929 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9930 return -1; 9931 } 9932 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9933 PyErr_SetString(PyExc_ValueError, 9934 "fill character is bigger than " 9935 "the string maximum character"); 9936 return -1; 9937 } 9938 9939 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9940 length = Py_MIN(maxlen, length); 9941 if (length <= 0) 9942 return 0; 9943 9944 _PyUnicode_FastFill(unicode, start, length, fill_char); 9945 return length; 9946} 9947 9948static PyObject * 9949pad(PyObject *self, 9950 Py_ssize_t left, 9951 Py_ssize_t right, 9952 Py_UCS4 fill) 9953{ 9954 PyObject *u; 9955 Py_UCS4 maxchar; 9956 int kind; 9957 void *data; 9958 9959 if (left < 0) 9960 left = 0; 9961 if (right < 0) 9962 right = 0; 9963 9964 if (left == 0 && right == 0) 9965 return unicode_result_unchanged(self); 9966 9967 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9968 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9969 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9970 return NULL; 9971 } 9972 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9973 maxchar = 
Py_MAX(maxchar, fill); 9974 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9975 if (!u) 9976 return NULL; 9977 9978 kind = PyUnicode_KIND(u); 9979 data = PyUnicode_DATA(u); 9980 if (left) 9981 FILL(kind, data, fill, 0, left); 9982 if (right) 9983 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9984 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9985 assert(_PyUnicode_CheckConsistency(u, 1)); 9986 return u; 9987} 9988 9989PyObject * 9990PyUnicode_Splitlines(PyObject *string, int keepends) 9991{ 9992 PyObject *list; 9993 9994 string = PyUnicode_FromObject(string); 9995 if (string == NULL) 9996 return NULL; 9997 if (PyUnicode_READY(string) == -1) { 9998 Py_DECREF(string); 9999 return NULL; 10000 } 10001 10002 switch (PyUnicode_KIND(string)) { 10003 case PyUnicode_1BYTE_KIND: 10004 if (PyUnicode_IS_ASCII(string)) 10005 list = asciilib_splitlines( 10006 string, PyUnicode_1BYTE_DATA(string), 10007 PyUnicode_GET_LENGTH(string), keepends); 10008 else 10009 list = ucs1lib_splitlines( 10010 string, PyUnicode_1BYTE_DATA(string), 10011 PyUnicode_GET_LENGTH(string), keepends); 10012 break; 10013 case PyUnicode_2BYTE_KIND: 10014 list = ucs2lib_splitlines( 10015 string, PyUnicode_2BYTE_DATA(string), 10016 PyUnicode_GET_LENGTH(string), keepends); 10017 break; 10018 case PyUnicode_4BYTE_KIND: 10019 list = ucs4lib_splitlines( 10020 string, PyUnicode_4BYTE_DATA(string), 10021 PyUnicode_GET_LENGTH(string), keepends); 10022 break; 10023 default: 10024 assert(0); 10025 list = 0; 10026 } 10027 Py_DECREF(string); 10028 return list; 10029} 10030 10031static PyObject * 10032split(PyObject *self, 10033 PyObject *substring, 10034 Py_ssize_t maxcount) 10035{ 10036 int kind1, kind2, kind; 10037 void *buf1, *buf2; 10038 Py_ssize_t len1, len2; 10039 PyObject* out; 10040 10041 if (maxcount < 0) 10042 maxcount = PY_SSIZE_T_MAX; 10043 10044 if (PyUnicode_READY(self) == -1) 10045 return NULL; 10046 10047 if (substring == NULL) 10048 
switch (PyUnicode_KIND(self)) { 10049 case PyUnicode_1BYTE_KIND: 10050 if (PyUnicode_IS_ASCII(self)) 10051 return asciilib_split_whitespace( 10052 self, PyUnicode_1BYTE_DATA(self), 10053 PyUnicode_GET_LENGTH(self), maxcount 10054 ); 10055 else 10056 return ucs1lib_split_whitespace( 10057 self, PyUnicode_1BYTE_DATA(self), 10058 PyUnicode_GET_LENGTH(self), maxcount 10059 ); 10060 case PyUnicode_2BYTE_KIND: 10061 return ucs2lib_split_whitespace( 10062 self, PyUnicode_2BYTE_DATA(self), 10063 PyUnicode_GET_LENGTH(self), maxcount 10064 ); 10065 case PyUnicode_4BYTE_KIND: 10066 return ucs4lib_split_whitespace( 10067 self, PyUnicode_4BYTE_DATA(self), 10068 PyUnicode_GET_LENGTH(self), maxcount 10069 ); 10070 default: 10071 assert(0); 10072 return NULL; 10073 } 10074 10075 if (PyUnicode_READY(substring) == -1) 10076 return NULL; 10077 10078 kind1 = PyUnicode_KIND(self); 10079 kind2 = PyUnicode_KIND(substring); 10080 kind = kind1 > kind2 ? kind1 : kind2; 10081 buf1 = PyUnicode_DATA(self); 10082 buf2 = PyUnicode_DATA(substring); 10083 if (kind1 != kind) 10084 buf1 = _PyUnicode_AsKind(self, kind); 10085 if (!buf1) 10086 return NULL; 10087 if (kind2 != kind) 10088 buf2 = _PyUnicode_AsKind(substring, kind); 10089 if (!buf2) { 10090 if (kind1 != kind) PyMem_Free(buf1); 10091 return NULL; 10092 } 10093 len1 = PyUnicode_GET_LENGTH(self); 10094 len2 = PyUnicode_GET_LENGTH(substring); 10095 10096 switch (kind) { 10097 case PyUnicode_1BYTE_KIND: 10098 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10099 out = asciilib_split( 10100 self, buf1, len1, buf2, len2, maxcount); 10101 else 10102 out = ucs1lib_split( 10103 self, buf1, len1, buf2, len2, maxcount); 10104 break; 10105 case PyUnicode_2BYTE_KIND: 10106 out = ucs2lib_split( 10107 self, buf1, len1, buf2, len2, maxcount); 10108 break; 10109 case PyUnicode_4BYTE_KIND: 10110 out = ucs4lib_split( 10111 self, buf1, len1, buf2, len2, maxcount); 10112 break; 10113 default: 10114 out = NULL; 10115 } 10116 if (kind1 != kind) 
10117 PyMem_Free(buf1); 10118 if (kind2 != kind) 10119 PyMem_Free(buf2); 10120 return out; 10121} 10122 10123static PyObject * 10124rsplit(PyObject *self, 10125 PyObject *substring, 10126 Py_ssize_t maxcount) 10127{ 10128 int kind1, kind2, kind; 10129 void *buf1, *buf2; 10130 Py_ssize_t len1, len2; 10131 PyObject* out; 10132 10133 if (maxcount < 0) 10134 maxcount = PY_SSIZE_T_MAX; 10135 10136 if (PyUnicode_READY(self) == -1) 10137 return NULL; 10138 10139 if (substring == NULL) 10140 switch (PyUnicode_KIND(self)) { 10141 case PyUnicode_1BYTE_KIND: 10142 if (PyUnicode_IS_ASCII(self)) 10143 return asciilib_rsplit_whitespace( 10144 self, PyUnicode_1BYTE_DATA(self), 10145 PyUnicode_GET_LENGTH(self), maxcount 10146 ); 10147 else 10148 return ucs1lib_rsplit_whitespace( 10149 self, PyUnicode_1BYTE_DATA(self), 10150 PyUnicode_GET_LENGTH(self), maxcount 10151 ); 10152 case PyUnicode_2BYTE_KIND: 10153 return ucs2lib_rsplit_whitespace( 10154 self, PyUnicode_2BYTE_DATA(self), 10155 PyUnicode_GET_LENGTH(self), maxcount 10156 ); 10157 case PyUnicode_4BYTE_KIND: 10158 return ucs4lib_rsplit_whitespace( 10159 self, PyUnicode_4BYTE_DATA(self), 10160 PyUnicode_GET_LENGTH(self), maxcount 10161 ); 10162 default: 10163 assert(0); 10164 return NULL; 10165 } 10166 10167 if (PyUnicode_READY(substring) == -1) 10168 return NULL; 10169 10170 kind1 = PyUnicode_KIND(self); 10171 kind2 = PyUnicode_KIND(substring); 10172 kind = kind1 > kind2 ? 
kind1 : kind2; 10173 buf1 = PyUnicode_DATA(self); 10174 buf2 = PyUnicode_DATA(substring); 10175 if (kind1 != kind) 10176 buf1 = _PyUnicode_AsKind(self, kind); 10177 if (!buf1) 10178 return NULL; 10179 if (kind2 != kind) 10180 buf2 = _PyUnicode_AsKind(substring, kind); 10181 if (!buf2) { 10182 if (kind1 != kind) PyMem_Free(buf1); 10183 return NULL; 10184 } 10185 len1 = PyUnicode_GET_LENGTH(self); 10186 len2 = PyUnicode_GET_LENGTH(substring); 10187 10188 switch (kind) { 10189 case PyUnicode_1BYTE_KIND: 10190 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10191 out = asciilib_rsplit( 10192 self, buf1, len1, buf2, len2, maxcount); 10193 else 10194 out = ucs1lib_rsplit( 10195 self, buf1, len1, buf2, len2, maxcount); 10196 break; 10197 case PyUnicode_2BYTE_KIND: 10198 out = ucs2lib_rsplit( 10199 self, buf1, len1, buf2, len2, maxcount); 10200 break; 10201 case PyUnicode_4BYTE_KIND: 10202 out = ucs4lib_rsplit( 10203 self, buf1, len1, buf2, len2, maxcount); 10204 break; 10205 default: 10206 out = NULL; 10207 } 10208 if (kind1 != kind) 10209 PyMem_Free(buf1); 10210 if (kind2 != kind) 10211 PyMem_Free(buf2); 10212 return out; 10213} 10214 10215static Py_ssize_t 10216anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10217 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10218{ 10219 switch (kind) { 10220 case PyUnicode_1BYTE_KIND: 10221 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10222 return asciilib_find(buf1, len1, buf2, len2, offset); 10223 else 10224 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10225 case PyUnicode_2BYTE_KIND: 10226 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10227 case PyUnicode_4BYTE_KIND: 10228 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10229 } 10230 assert(0); 10231 return -1; 10232} 10233 10234static Py_ssize_t 10235anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10236 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10237{ 
10238 switch (kind) { 10239 case PyUnicode_1BYTE_KIND: 10240 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10241 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10242 else 10243 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10244 case PyUnicode_2BYTE_KIND: 10245 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10246 case PyUnicode_4BYTE_KIND: 10247 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10248 } 10249 assert(0); 10250 return 0; 10251} 10252 10253static void 10254replace_1char_inplace(PyObject *u, Py_ssize_t pos, 10255 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) 10256{ 10257 int kind = PyUnicode_KIND(u); 10258 void *data = PyUnicode_DATA(u); 10259 Py_ssize_t len = PyUnicode_GET_LENGTH(u); 10260 if (kind == PyUnicode_1BYTE_KIND) { 10261 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos, 10262 (Py_UCS1 *)data + len, 10263 u1, u2, maxcount); 10264 } 10265 else if (kind == PyUnicode_2BYTE_KIND) { 10266 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos, 10267 (Py_UCS2 *)data + len, 10268 u1, u2, maxcount); 10269 } 10270 else { 10271 assert(kind == PyUnicode_4BYTE_KIND); 10272 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos, 10273 (Py_UCS4 *)data + len, 10274 u1, u2, maxcount); 10275 } 10276} 10277 10278static PyObject * 10279replace(PyObject *self, PyObject *str1, 10280 PyObject *str2, Py_ssize_t maxcount) 10281{ 10282 PyObject *u; 10283 char *sbuf = PyUnicode_DATA(self); 10284 char *buf1 = PyUnicode_DATA(str1); 10285 char *buf2 = PyUnicode_DATA(str2); 10286 int srelease = 0, release1 = 0, release2 = 0; 10287 int skind = PyUnicode_KIND(self); 10288 int kind1 = PyUnicode_KIND(str1); 10289 int kind2 = PyUnicode_KIND(str2); 10290 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10291 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10292 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10293 int mayshrink; 10294 Py_UCS4 maxchar, maxchar_str1, maxchar_str2; 10295 10296 if (maxcount < 0) 10297 maxcount = PY_SSIZE_T_MAX; 
10298 else if (maxcount == 0 || slen == 0) 10299 goto nothing; 10300 10301 if (str1 == str2) 10302 goto nothing; 10303 10304 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10305 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1); 10306 if (maxchar < maxchar_str1) 10307 /* substring too wide to be present */ 10308 goto nothing; 10309 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10310 /* Replacing str1 with str2 may cause a maxchar reduction in the 10311 result string. */ 10312 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1); 10313 maxchar = Py_MAX(maxchar, maxchar_str2); 10314 10315 if (len1 == len2) { 10316 /* same length */ 10317 if (len1 == 0) 10318 goto nothing; 10319 if (len1 == 1) { 10320 /* replace characters */ 10321 Py_UCS4 u1, u2; 10322 Py_ssize_t pos; 10323 10324 u1 = PyUnicode_READ(kind1, buf1, 0); 10325 pos = findchar(sbuf, skind, slen, u1, 1); 10326 if (pos < 0) 10327 goto nothing; 10328 u2 = PyUnicode_READ(kind2, buf2, 0); 10329 u = PyUnicode_New(slen, maxchar); 10330 if (!u) 10331 goto error; 10332 10333 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10334 replace_1char_inplace(u, pos, u1, u2, maxcount); 10335 } 10336 else { 10337 int rkind = skind; 10338 char *res; 10339 Py_ssize_t i; 10340 10341 if (kind1 < rkind) { 10342 /* widen substring */ 10343 buf1 = _PyUnicode_AsKind(str1, rkind); 10344 if (!buf1) goto error; 10345 release1 = 1; 10346 } 10347 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10348 if (i < 0) 10349 goto nothing; 10350 if (rkind > kind2) { 10351 /* widen replacement */ 10352 buf2 = _PyUnicode_AsKind(str2, rkind); 10353 if (!buf2) goto error; 10354 release2 = 1; 10355 } 10356 else if (rkind < kind2) { 10357 /* widen self and buf1 */ 10358 rkind = kind2; 10359 if (release1) PyMem_Free(buf1); 10360 release1 = 0; 10361 sbuf = _PyUnicode_AsKind(self, rkind); 10362 if (!sbuf) goto error; 10363 srelease = 1; 10364 buf1 = _PyUnicode_AsKind(str1, rkind); 10365 if (!buf1) goto error; 10366 release1 = 1; 
10367 } 10368 u = PyUnicode_New(slen, maxchar); 10369 if (!u) 10370 goto error; 10371 assert(PyUnicode_KIND(u) == rkind); 10372 res = PyUnicode_DATA(u); 10373 10374 memcpy(res, sbuf, rkind * slen); 10375 /* change everything in-place, starting with this one */ 10376 memcpy(res + rkind * i, 10377 buf2, 10378 rkind * len2); 10379 i += len1; 10380 10381 while ( --maxcount > 0) { 10382 i = anylib_find(rkind, self, 10383 sbuf+rkind*i, slen-i, 10384 str1, buf1, len1, i); 10385 if (i == -1) 10386 break; 10387 memcpy(res + rkind * i, 10388 buf2, 10389 rkind * len2); 10390 i += len1; 10391 } 10392 } 10393 } 10394 else { 10395 Py_ssize_t n, i, j, ires; 10396 Py_ssize_t new_size; 10397 int rkind = skind; 10398 char *res; 10399 10400 if (kind1 < rkind) { 10401 /* widen substring */ 10402 buf1 = _PyUnicode_AsKind(str1, rkind); 10403 if (!buf1) goto error; 10404 release1 = 1; 10405 } 10406 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10407 if (n == 0) 10408 goto nothing; 10409 if (kind2 < rkind) { 10410 /* widen replacement */ 10411 buf2 = _PyUnicode_AsKind(str2, rkind); 10412 if (!buf2) goto error; 10413 release2 = 1; 10414 } 10415 else if (kind2 > rkind) { 10416 /* widen self and buf1 */ 10417 rkind = kind2; 10418 sbuf = _PyUnicode_AsKind(self, rkind); 10419 if (!sbuf) goto error; 10420 srelease = 1; 10421 if (release1) PyMem_Free(buf1); 10422 release1 = 0; 10423 buf1 = _PyUnicode_AsKind(str1, rkind); 10424 if (!buf1) goto error; 10425 release1 = 1; 10426 } 10427 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10428 PyUnicode_GET_LENGTH(str1))); */ 10429 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10430 PyErr_SetString(PyExc_OverflowError, 10431 "replace string is too long"); 10432 goto error; 10433 } 10434 new_size = slen + n * (len2 - len1); 10435 if (new_size == 0) { 10436 _Py_INCREF_UNICODE_EMPTY(); 10437 if (!unicode_empty) 10438 goto error; 10439 u = unicode_empty; 10440 goto done; 10441 } 10442 
if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10443 PyErr_SetString(PyExc_OverflowError, 10444 "replace string is too long"); 10445 goto error; 10446 } 10447 u = PyUnicode_New(new_size, maxchar); 10448 if (!u) 10449 goto error; 10450 assert(PyUnicode_KIND(u) == rkind); 10451 res = PyUnicode_DATA(u); 10452 ires = i = 0; 10453 if (len1 > 0) { 10454 while (n-- > 0) { 10455 /* look for next match */ 10456 j = anylib_find(rkind, self, 10457 sbuf + rkind * i, slen-i, 10458 str1, buf1, len1, i); 10459 if (j == -1) 10460 break; 10461 else if (j > i) { 10462 /* copy unchanged part [i:j] */ 10463 memcpy(res + rkind * ires, 10464 sbuf + rkind * i, 10465 rkind * (j-i)); 10466 ires += j - i; 10467 } 10468 /* copy substitution string */ 10469 if (len2 > 0) { 10470 memcpy(res + rkind * ires, 10471 buf2, 10472 rkind * len2); 10473 ires += len2; 10474 } 10475 i = j + len1; 10476 } 10477 if (i < slen) 10478 /* copy tail [i:] */ 10479 memcpy(res + rkind * ires, 10480 sbuf + rkind * i, 10481 rkind * (slen-i)); 10482 } 10483 else { 10484 /* interleave */ 10485 while (n > 0) { 10486 memcpy(res + rkind * ires, 10487 buf2, 10488 rkind * len2); 10489 ires += len2; 10490 if (--n <= 0) 10491 break; 10492 memcpy(res + rkind * ires, 10493 sbuf + rkind * i, 10494 rkind); 10495 ires++; 10496 i++; 10497 } 10498 memcpy(res + rkind * ires, 10499 sbuf + rkind * i, 10500 rkind * (slen-i)); 10501 } 10502 } 10503 10504 if (mayshrink) { 10505 unicode_adjust_maxchar(&u); 10506 if (u == NULL) 10507 goto error; 10508 } 10509 10510 done: 10511 if (srelease) 10512 PyMem_FREE(sbuf); 10513 if (release1) 10514 PyMem_FREE(buf1); 10515 if (release2) 10516 PyMem_FREE(buf2); 10517 assert(_PyUnicode_CheckConsistency(u, 1)); 10518 return u; 10519 10520 nothing: 10521 /* nothing to replace; return original string (when possible) */ 10522 if (srelease) 10523 PyMem_FREE(sbuf); 10524 if (release1) 10525 PyMem_FREE(buf1); 10526 if (release2) 10527 PyMem_FREE(buf2); 10528 return unicode_result_unchanged(self); 10529 
10530 error: 10531 if (srelease && sbuf) 10532 PyMem_FREE(sbuf); 10533 if (release1 && buf1) 10534 PyMem_FREE(buf1); 10535 if (release2 && buf2) 10536 PyMem_FREE(buf2); 10537 return NULL; 10538} 10539 10540/* --- Unicode Object Methods --------------------------------------------- */ 10541 10542PyDoc_STRVAR(title__doc__, 10543 "S.title() -> str\n\ 10544\n\ 10545Return a titlecased version of S, i.e. words start with title case\n\ 10546characters, all remaining cased characters have lower case."); 10547 10548static PyObject* 10549unicode_title(PyObject *self) 10550{ 10551 if (PyUnicode_READY(self) == -1) 10552 return NULL; 10553 return case_operation(self, do_title); 10554} 10555 10556PyDoc_STRVAR(capitalize__doc__, 10557 "S.capitalize() -> str\n\ 10558\n\ 10559Return a capitalized version of S, i.e. make the first character\n\ 10560have upper case and the rest lower case."); 10561 10562static PyObject* 10563unicode_capitalize(PyObject *self) 10564{ 10565 if (PyUnicode_READY(self) == -1) 10566 return NULL; 10567 if (PyUnicode_GET_LENGTH(self) == 0) 10568 return unicode_result_unchanged(self); 10569 return case_operation(self, do_capitalize); 10570} 10571 10572PyDoc_STRVAR(casefold__doc__, 10573 "S.casefold() -> str\n\ 10574\n\ 10575Return a version of S suitable for caseless comparisons."); 10576 10577static PyObject * 10578unicode_casefold(PyObject *self) 10579{ 10580 if (PyUnicode_READY(self) == -1) 10581 return NULL; 10582 if (PyUnicode_IS_ASCII(self)) 10583 return ascii_upper_or_lower(self, 1); 10584 return case_operation(self, do_casefold); 10585} 10586 10587 10588/* Argument converter. 
Coerces to a single unicode character */ 10589 10590static int 10591convert_uc(PyObject *obj, void *addr) 10592{ 10593 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10594 PyObject *uniobj; 10595 10596 uniobj = PyUnicode_FromObject(obj); 10597 if (uniobj == NULL) { 10598 PyErr_SetString(PyExc_TypeError, 10599 "The fill character cannot be converted to Unicode"); 10600 return 0; 10601 } 10602 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10603 PyErr_SetString(PyExc_TypeError, 10604 "The fill character must be exactly one character long"); 10605 Py_DECREF(uniobj); 10606 return 0; 10607 } 10608 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10609 Py_DECREF(uniobj); 10610 return 1; 10611} 10612 10613PyDoc_STRVAR(center__doc__, 10614 "S.center(width[, fillchar]) -> str\n\ 10615\n\ 10616Return S centered in a string of length width. Padding is\n\ 10617done using the specified fill character (default is a space)"); 10618 10619static PyObject * 10620unicode_center(PyObject *self, PyObject *args) 10621{ 10622 Py_ssize_t marg, left; 10623 Py_ssize_t width; 10624 Py_UCS4 fillchar = ' '; 10625 10626 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10627 return NULL; 10628 10629 if (PyUnicode_READY(self) == -1) 10630 return NULL; 10631 10632 if (PyUnicode_GET_LENGTH(self) >= width) 10633 return unicode_result_unchanged(self); 10634 10635 marg = width - PyUnicode_GET_LENGTH(self); 10636 left = marg / 2 + (marg & width & 1); 10637 10638 return pad(self, left, marg - left, fillchar); 10639} 10640 10641/* This function assumes that str1 and str2 are readied by the caller. */ 10642 10643static int 10644unicode_compare(PyObject *str1, PyObject *str2) 10645{ 10646#define COMPARE(TYPE1, TYPE2) \ 10647 do { \ 10648 TYPE1* p1 = (TYPE1 *)data1; \ 10649 TYPE2* p2 = (TYPE2 *)data2; \ 10650 TYPE1* end = p1 + len; \ 10651 Py_UCS4 c1, c2; \ 10652 for (; p1 != end; p1++, p2++) { \ 10653 c1 = *p1; \ 10654 c2 = *p2; \ 10655 if (c1 != c2) \ 10656 return (c1 < c2) ? 
-1 : 1; \ 10657 } \ 10658 } \ 10659 while (0) 10660 10661 int kind1, kind2; 10662 void *data1, *data2; 10663 Py_ssize_t len1, len2, len; 10664 10665 kind1 = PyUnicode_KIND(str1); 10666 kind2 = PyUnicode_KIND(str2); 10667 data1 = PyUnicode_DATA(str1); 10668 data2 = PyUnicode_DATA(str2); 10669 len1 = PyUnicode_GET_LENGTH(str1); 10670 len2 = PyUnicode_GET_LENGTH(str2); 10671 len = Py_MIN(len1, len2); 10672 10673 switch(kind1) { 10674 case PyUnicode_1BYTE_KIND: 10675 { 10676 switch(kind2) { 10677 case PyUnicode_1BYTE_KIND: 10678 { 10679 int cmp = memcmp(data1, data2, len); 10680 /* normalize result of memcmp() into the range [-1; 1] */ 10681 if (cmp < 0) 10682 return -1; 10683 if (cmp > 0) 10684 return 1; 10685 break; 10686 } 10687 case PyUnicode_2BYTE_KIND: 10688 COMPARE(Py_UCS1, Py_UCS2); 10689 break; 10690 case PyUnicode_4BYTE_KIND: 10691 COMPARE(Py_UCS1, Py_UCS4); 10692 break; 10693 default: 10694 assert(0); 10695 } 10696 break; 10697 } 10698 case PyUnicode_2BYTE_KIND: 10699 { 10700 switch(kind2) { 10701 case PyUnicode_1BYTE_KIND: 10702 COMPARE(Py_UCS2, Py_UCS1); 10703 break; 10704 case PyUnicode_2BYTE_KIND: 10705 { 10706 COMPARE(Py_UCS2, Py_UCS2); 10707 break; 10708 } 10709 case PyUnicode_4BYTE_KIND: 10710 COMPARE(Py_UCS2, Py_UCS4); 10711 break; 10712 default: 10713 assert(0); 10714 } 10715 break; 10716 } 10717 case PyUnicode_4BYTE_KIND: 10718 { 10719 switch(kind2) { 10720 case PyUnicode_1BYTE_KIND: 10721 COMPARE(Py_UCS4, Py_UCS1); 10722 break; 10723 case PyUnicode_2BYTE_KIND: 10724 COMPARE(Py_UCS4, Py_UCS2); 10725 break; 10726 case PyUnicode_4BYTE_KIND: 10727 { 10728#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4 10729 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len); 10730 /* normalize result of wmemcmp() into the range [-1; 1] */ 10731 if (cmp < 0) 10732 return -1; 10733 if (cmp > 0) 10734 return 1; 10735#else 10736 COMPARE(Py_UCS4, Py_UCS4); 10737#endif 10738 break; 10739 } 10740 default: 10741 assert(0); 10742 } 10743 break; 10744 } 10745 default: 
10746 assert(0); 10747 } 10748 10749 if (len1 == len2) 10750 return 0; 10751 if (len1 < len2) 10752 return -1; 10753 else 10754 return 1; 10755 10756#undef COMPARE 10757} 10758 10759Py_LOCAL(int) 10760unicode_compare_eq(PyObject *str1, PyObject *str2) 10761{ 10762 int kind; 10763 void *data1, *data2; 10764 Py_ssize_t len; 10765 int cmp; 10766 10767 len = PyUnicode_GET_LENGTH(str1); 10768 if (PyUnicode_GET_LENGTH(str2) != len) 10769 return 0; 10770 kind = PyUnicode_KIND(str1); 10771 if (PyUnicode_KIND(str2) != kind) 10772 return 0; 10773 data1 = PyUnicode_DATA(str1); 10774 data2 = PyUnicode_DATA(str2); 10775 10776 cmp = memcmp(data1, data2, len * kind); 10777 return (cmp == 0); 10778} 10779 10780 10781int 10782PyUnicode_Compare(PyObject *left, PyObject *right) 10783{ 10784 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10785 if (PyUnicode_READY(left) == -1 || 10786 PyUnicode_READY(right) == -1) 10787 return -1; 10788 10789 /* a string is equal to itself */ 10790 if (left == right) 10791 return 0; 10792 10793 return unicode_compare(left, right); 10794 } 10795 PyErr_Format(PyExc_TypeError, 10796 "Can't compare %.100s and %.100s", 10797 left->ob_type->tp_name, 10798 right->ob_type->tp_name); 10799 return -1; 10800} 10801 10802int 10803_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right) 10804{ 10805 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */ 10806 if (right_str == NULL) 10807 return -1; 10808 return PyUnicode_Compare(left, right_str); 10809} 10810 10811int 10812PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10813{ 10814 Py_ssize_t i; 10815 int kind; 10816 Py_UCS4 chr; 10817 10818 assert(_PyUnicode_CHECK(uni)); 10819 if (PyUnicode_READY(uni) == -1) 10820 return -1; 10821 kind = PyUnicode_KIND(uni); 10822 if (kind == PyUnicode_1BYTE_KIND) { 10823 const void *data = PyUnicode_1BYTE_DATA(uni); 10824 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni); 10825 size_t len, len2 = strlen(str); 10826 int cmp; 10827 10828 len = 
Py_MIN(len1, len2); 10829 cmp = memcmp(data, str, len); 10830 if (cmp != 0) { 10831 if (cmp < 0) 10832 return -1; 10833 else 10834 return 1; 10835 } 10836 if (len1 > len2) 10837 return 1; /* uni is longer */ 10838 if (len2 > len1) 10839 return -1; /* str is longer */ 10840 return 0; 10841 } 10842 else { 10843 void *data = PyUnicode_DATA(uni); 10844 /* Compare Unicode string and source character set string */ 10845 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10846 if (chr != str[i]) 10847 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10848 /* This check keeps Python strings that end in '\0' from comparing equal 10849 to C strings identical up to that point. */ 10850 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10851 return 1; /* uni is longer */ 10852 if (str[i]) 10853 return -1; /* str is longer */ 10854 return 0; 10855 } 10856} 10857 10858 10859#define TEST_COND(cond) \ 10860 ((cond) ? Py_True : Py_False) 10861 10862PyObject * 10863PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10864{ 10865 int result; 10866 PyObject *v; 10867 10868 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 10869 Py_RETURN_NOTIMPLEMENTED; 10870 10871 if (PyUnicode_READY(left) == -1 || 10872 PyUnicode_READY(right) == -1) 10873 return NULL; 10874 10875 if (left == right) { 10876 switch (op) { 10877 case Py_EQ: 10878 case Py_LE: 10879 case Py_GE: 10880 /* a string is equal to itself */ 10881 v = Py_True; 10882 break; 10883 case Py_NE: 10884 case Py_LT: 10885 case Py_GT: 10886 v = Py_False; 10887 break; 10888 default: 10889 PyErr_BadArgument(); 10890 return NULL; 10891 } 10892 } 10893 else if (op == Py_EQ || op == Py_NE) { 10894 result = unicode_compare_eq(left, right); 10895 result ^= (op == Py_NE); 10896 v = TEST_COND(result); 10897 } 10898 else { 10899 result = unicode_compare(left, right); 10900 10901 /* Convert the return value to a Boolean */ 10902 switch (op) { 10903 case Py_LE: 10904 v = TEST_COND(result <= 0); 10905 break; 10906 case Py_GE: 
10907 v = TEST_COND(result >= 0); 10908 break; 10909 case Py_LT: 10910 v = TEST_COND(result == -1); 10911 break; 10912 case Py_GT: 10913 v = TEST_COND(result == 1); 10914 break; 10915 default: 10916 PyErr_BadArgument(); 10917 return NULL; 10918 } 10919 } 10920 Py_INCREF(v); 10921 return v; 10922} 10923 10924int 10925PyUnicode_Contains(PyObject *container, PyObject *element) 10926{ 10927 PyObject *str, *sub; 10928 int kind1, kind2; 10929 void *buf1, *buf2; 10930 Py_ssize_t len1, len2; 10931 int result; 10932 10933 /* Coerce the two arguments */ 10934 sub = PyUnicode_FromObject(element); 10935 if (!sub) { 10936 PyErr_Format(PyExc_TypeError, 10937 "'in <string>' requires string as left operand, not %s", 10938 element->ob_type->tp_name); 10939 return -1; 10940 } 10941 10942 str = PyUnicode_FromObject(container); 10943 if (!str) { 10944 Py_DECREF(sub); 10945 return -1; 10946 } 10947 10948 kind1 = PyUnicode_KIND(str); 10949 kind2 = PyUnicode_KIND(sub); 10950 buf1 = PyUnicode_DATA(str); 10951 buf2 = PyUnicode_DATA(sub); 10952 if (kind2 != kind1) { 10953 if (kind2 > kind1) { 10954 Py_DECREF(sub); 10955 Py_DECREF(str); 10956 return 0; 10957 } 10958 buf2 = _PyUnicode_AsKind(sub, kind1); 10959 } 10960 if (!buf2) { 10961 Py_DECREF(sub); 10962 Py_DECREF(str); 10963 return -1; 10964 } 10965 len1 = PyUnicode_GET_LENGTH(str); 10966 len2 = PyUnicode_GET_LENGTH(sub); 10967 10968 switch (kind1) { 10969 case PyUnicode_1BYTE_KIND: 10970 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10971 break; 10972 case PyUnicode_2BYTE_KIND: 10973 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10974 break; 10975 case PyUnicode_4BYTE_KIND: 10976 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10977 break; 10978 default: 10979 result = -1; 10980 assert(0); 10981 } 10982 10983 Py_DECREF(str); 10984 Py_DECREF(sub); 10985 10986 if (kind2 != kind1) 10987 PyMem_Free(buf2); 10988 10989 return result; 10990} 10991 10992/* Concat to string or Unicode object giving a new Unicode 
object. */ 10993 10994PyObject * 10995PyUnicode_Concat(PyObject *left, PyObject *right) 10996{ 10997 PyObject *u = NULL, *v = NULL, *w; 10998 Py_UCS4 maxchar, maxchar2; 10999 Py_ssize_t u_len, v_len, new_len; 11000 11001 /* Coerce the two arguments */ 11002 u = PyUnicode_FromObject(left); 11003 if (u == NULL) 11004 goto onError; 11005 v = PyUnicode_FromObject(right); 11006 if (v == NULL) 11007 goto onError; 11008 11009 /* Shortcuts */ 11010 if (v == unicode_empty) { 11011 Py_DECREF(v); 11012 return u; 11013 } 11014 if (u == unicode_empty) { 11015 Py_DECREF(u); 11016 return v; 11017 } 11018 11019 u_len = PyUnicode_GET_LENGTH(u); 11020 v_len = PyUnicode_GET_LENGTH(v); 11021 if (u_len > PY_SSIZE_T_MAX - v_len) { 11022 PyErr_SetString(PyExc_OverflowError, 11023 "strings are too large to concat"); 11024 goto onError; 11025 } 11026 new_len = u_len + v_len; 11027 11028 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 11029 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 11030 maxchar = Py_MAX(maxchar, maxchar2); 11031 11032 /* Concat the two Unicode strings */ 11033 w = PyUnicode_New(new_len, maxchar); 11034 if (w == NULL) 11035 goto onError; 11036 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 11037 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 11038 Py_DECREF(u); 11039 Py_DECREF(v); 11040 assert(_PyUnicode_CheckConsistency(w, 1)); 11041 return w; 11042 11043 onError: 11044 Py_XDECREF(u); 11045 Py_XDECREF(v); 11046 return NULL; 11047} 11048 11049void 11050PyUnicode_Append(PyObject **p_left, PyObject *right) 11051{ 11052 PyObject *left, *res; 11053 Py_UCS4 maxchar, maxchar2; 11054 Py_ssize_t left_len, right_len, new_len; 11055 11056 if (p_left == NULL) { 11057 if (!PyErr_Occurred()) 11058 PyErr_BadInternalCall(); 11059 return; 11060 } 11061 left = *p_left; 11062 if (right == NULL || left == NULL 11063 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 11064 if (!PyErr_Occurred()) 11065 PyErr_BadInternalCall(); 11066 goto error; 11067 } 11068 11069 if (PyUnicode_READY(left) 
== -1) 11070 goto error; 11071 if (PyUnicode_READY(right) == -1) 11072 goto error; 11073 11074 /* Shortcuts */ 11075 if (left == unicode_empty) { 11076 Py_DECREF(left); 11077 Py_INCREF(right); 11078 *p_left = right; 11079 return; 11080 } 11081 if (right == unicode_empty) 11082 return; 11083 11084 left_len = PyUnicode_GET_LENGTH(left); 11085 right_len = PyUnicode_GET_LENGTH(right); 11086 if (left_len > PY_SSIZE_T_MAX - right_len) { 11087 PyErr_SetString(PyExc_OverflowError, 11088 "strings are too large to concat"); 11089 goto error; 11090 } 11091 new_len = left_len + right_len; 11092 11093 if (unicode_modifiable(left) 11094 && PyUnicode_CheckExact(right) 11095 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 11096 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 11097 to change the structure size, but characters are stored just after 11098 the structure, and so it requires to move all characters which is 11099 not so different than duplicating the string. */ 11100 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 11101 { 11102 /* append inplace */ 11103 if (unicode_resize(p_left, new_len) != 0) 11104 goto error; 11105 11106 /* copy 'right' into the newly allocated area of 'left' */ 11107 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 11108 } 11109 else { 11110 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11111 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11112 maxchar = Py_MAX(maxchar, maxchar2); 11113 11114 /* Concat the two Unicode strings */ 11115 res = PyUnicode_New(new_len, maxchar); 11116 if (res == NULL) 11117 goto error; 11118 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 11119 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 11120 Py_DECREF(left); 11121 *p_left = res; 11122 } 11123 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 11124 return; 11125 11126error: 11127 Py_CLEAR(*p_left); 11128} 11129 11130void 11131PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 
11132{ 11133 PyUnicode_Append(pleft, right); 11134 Py_XDECREF(right); 11135} 11136 11137PyDoc_STRVAR(count__doc__, 11138 "S.count(sub[, start[, end]]) -> int\n\ 11139\n\ 11140Return the number of non-overlapping occurrences of substring sub in\n\ 11141string S[start:end]. Optional arguments start and end are\n\ 11142interpreted as in slice notation."); 11143 11144static PyObject * 11145unicode_count(PyObject *self, PyObject *args) 11146{ 11147 PyObject *substring = NULL; 11148 Py_ssize_t start = 0; 11149 Py_ssize_t end = PY_SSIZE_T_MAX; 11150 PyObject *result; 11151 int kind1, kind2, kind; 11152 void *buf1, *buf2; 11153 Py_ssize_t len1, len2, iresult; 11154 11155 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 11156 &start, &end)) 11157 return NULL; 11158 11159 kind1 = PyUnicode_KIND(self); 11160 kind2 = PyUnicode_KIND(substring); 11161 if (kind2 > kind1) { 11162 Py_DECREF(substring); 11163 return PyLong_FromLong(0); 11164 } 11165 kind = kind1; 11166 buf1 = PyUnicode_DATA(self); 11167 buf2 = PyUnicode_DATA(substring); 11168 if (kind2 != kind) 11169 buf2 = _PyUnicode_AsKind(substring, kind); 11170 if (!buf2) { 11171 Py_DECREF(substring); 11172 return NULL; 11173 } 11174 len1 = PyUnicode_GET_LENGTH(self); 11175 len2 = PyUnicode_GET_LENGTH(substring); 11176 11177 ADJUST_INDICES(start, end, len1); 11178 switch (kind) { 11179 case PyUnicode_1BYTE_KIND: 11180 iresult = ucs1lib_count( 11181 ((Py_UCS1*)buf1) + start, end - start, 11182 buf2, len2, PY_SSIZE_T_MAX 11183 ); 11184 break; 11185 case PyUnicode_2BYTE_KIND: 11186 iresult = ucs2lib_count( 11187 ((Py_UCS2*)buf1) + start, end - start, 11188 buf2, len2, PY_SSIZE_T_MAX 11189 ); 11190 break; 11191 case PyUnicode_4BYTE_KIND: 11192 iresult = ucs4lib_count( 11193 ((Py_UCS4*)buf1) + start, end - start, 11194 buf2, len2, PY_SSIZE_T_MAX 11195 ); 11196 break; 11197 default: 11198 assert(0); iresult = 0; 11199 } 11200 11201 result = PyLong_FromSsize_t(iresult); 11202 11203 if (kind2 != kind) 11204 
PyMem_Free(buf2); 11205 11206 Py_DECREF(substring); 11207 11208 return result; 11209} 11210 11211PyDoc_STRVAR(encode__doc__, 11212 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 11213\n\ 11214Encode S using the codec registered for encoding. Default encoding\n\ 11215is 'utf-8'. errors may be given to set a different error\n\ 11216handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 11217a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 11218'xmlcharrefreplace' as well as any other name registered with\n\ 11219codecs.register_error that can handle UnicodeEncodeErrors."); 11220 11221static PyObject * 11222unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 11223{ 11224 static char *kwlist[] = {"encoding", "errors", 0}; 11225 char *encoding = NULL; 11226 char *errors = NULL; 11227 11228 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 11229 kwlist, &encoding, &errors)) 11230 return NULL; 11231 return PyUnicode_AsEncodedString(self, encoding, errors); 11232} 11233 11234PyDoc_STRVAR(expandtabs__doc__, 11235 "S.expandtabs(tabsize=8) -> str\n\ 11236\n\ 11237Return a copy of S where all tab characters are expanded using spaces.\n\ 11238If tabsize is not given, a tab size of 8 characters is assumed."); 11239 11240static PyObject* 11241unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds) 11242{ 11243 Py_ssize_t i, j, line_pos, src_len, incr; 11244 Py_UCS4 ch; 11245 PyObject *u; 11246 void *src_data, *dest_data; 11247 static char *kwlist[] = {"tabsize", 0}; 11248 int tabsize = 8; 11249 int kind; 11250 int found; 11251 11252 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs", 11253 kwlist, &tabsize)) 11254 return NULL; 11255 11256 if (PyUnicode_READY(self) == -1) 11257 return NULL; 11258 11259 /* First pass: determine size of output string */ 11260 src_len = PyUnicode_GET_LENGTH(self); 11261 i = j = line_pos = 0; 11262 kind = PyUnicode_KIND(self); 11263 src_data = 
PyUnicode_DATA(self); 11264 found = 0; 11265 for (; i < src_len; i++) { 11266 ch = PyUnicode_READ(kind, src_data, i); 11267 if (ch == '\t') { 11268 found = 1; 11269 if (tabsize > 0) { 11270 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11271 if (j > PY_SSIZE_T_MAX - incr) 11272 goto overflow; 11273 line_pos += incr; 11274 j += incr; 11275 } 11276 } 11277 else { 11278 if (j > PY_SSIZE_T_MAX - 1) 11279 goto overflow; 11280 line_pos++; 11281 j++; 11282 if (ch == '\n' || ch == '\r') 11283 line_pos = 0; 11284 } 11285 } 11286 if (!found) 11287 return unicode_result_unchanged(self); 11288 11289 /* Second pass: create output string and fill it */ 11290 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11291 if (!u) 11292 return NULL; 11293 dest_data = PyUnicode_DATA(u); 11294 11295 i = j = line_pos = 0; 11296 11297 for (; i < src_len; i++) { 11298 ch = PyUnicode_READ(kind, src_data, i); 11299 if (ch == '\t') { 11300 if (tabsize > 0) { 11301 incr = tabsize - (line_pos % tabsize); 11302 line_pos += incr; 11303 FILL(kind, dest_data, ' ', j, incr); 11304 j += incr; 11305 } 11306 } 11307 else { 11308 line_pos++; 11309 PyUnicode_WRITE(kind, dest_data, j, ch); 11310 j++; 11311 if (ch == '\n' || ch == '\r') 11312 line_pos = 0; 11313 } 11314 } 11315 assert (j == PyUnicode_GET_LENGTH(u)); 11316 return unicode_result(u); 11317 11318 overflow: 11319 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11320 return NULL; 11321} 11322 11323PyDoc_STRVAR(find__doc__, 11324 "S.find(sub[, start[, end]]) -> int\n\ 11325\n\ 11326Return the lowest index in S where substring sub is found,\n\ 11327such that sub is contained within S[start:end]. 
Optional\n\ 11328arguments start and end are interpreted as in slice notation.\n\ 11329\n\ 11330Return -1 on failure."); 11331 11332static PyObject * 11333unicode_find(PyObject *self, PyObject *args) 11334{ 11335 PyObject *substring = NULL; 11336 Py_ssize_t start = 0; 11337 Py_ssize_t end = 0; 11338 Py_ssize_t result; 11339 11340 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 11341 &start, &end)) 11342 return NULL; 11343 11344 if (PyUnicode_READY(self) == -1) { 11345 Py_DECREF(substring); 11346 return NULL; 11347 } 11348 if (PyUnicode_READY(substring) == -1) { 11349 Py_DECREF(substring); 11350 return NULL; 11351 } 11352 11353 result = any_find_slice(1, self, substring, start, end); 11354 11355 Py_DECREF(substring); 11356 11357 if (result == -2) 11358 return NULL; 11359 11360 return PyLong_FromSsize_t(result); 11361} 11362 11363static PyObject * 11364unicode_getitem(PyObject *self, Py_ssize_t index) 11365{ 11366 void *data; 11367 enum PyUnicode_Kind kind; 11368 Py_UCS4 ch; 11369 11370 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 11371 PyErr_BadArgument(); 11372 return NULL; 11373 } 11374 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11375 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11376 return NULL; 11377 } 11378 kind = PyUnicode_KIND(self); 11379 data = PyUnicode_DATA(self); 11380 ch = PyUnicode_READ(kind, data, index); 11381 return unicode_char(ch); 11382} 11383 11384/* Believe it or not, this produces the same value for ASCII strings 11385 as bytes_hash(). */ 11386static Py_hash_t 11387unicode_hash(PyObject *self) 11388{ 11389 Py_ssize_t len; 11390 Py_uhash_t x; /* Unsigned for defined overflow behavior. 
*/ 11391 11392#ifdef Py_DEBUG 11393 assert(_Py_HashSecret_Initialized); 11394#endif 11395 if (_PyUnicode_HASH(self) != -1) 11396 return _PyUnicode_HASH(self); 11397 if (PyUnicode_READY(self) == -1) 11398 return -1; 11399 len = PyUnicode_GET_LENGTH(self); 11400 /* 11401 We make the hash of the empty string be 0, rather than using 11402 (prefix ^ suffix), since this slightly obfuscates the hash secret 11403 */ 11404 if (len == 0) { 11405 _PyUnicode_HASH(self) = 0; 11406 return 0; 11407 } 11408 x = _Py_HashBytes(PyUnicode_DATA(self), 11409 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); 11410 _PyUnicode_HASH(self) = x; 11411 return x; 11412} 11413 11414PyDoc_STRVAR(index__doc__, 11415 "S.index(sub[, start[, end]]) -> int\n\ 11416\n\ 11417Like S.find() but raise ValueError when the substring is not found."); 11418 11419static PyObject * 11420unicode_index(PyObject *self, PyObject *args) 11421{ 11422 Py_ssize_t result; 11423 PyObject *substring = NULL; 11424 Py_ssize_t start = 0; 11425 Py_ssize_t end = 0; 11426 11427 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11428 &start, &end)) 11429 return NULL; 11430 11431 if (PyUnicode_READY(self) == -1) { 11432 Py_DECREF(substring); 11433 return NULL; 11434 } 11435 if (PyUnicode_READY(substring) == -1) { 11436 Py_DECREF(substring); 11437 return NULL; 11438 } 11439 11440 result = any_find_slice(1, self, substring, start, end); 11441 11442 Py_DECREF(substring); 11443 11444 if (result == -2) 11445 return NULL; 11446 11447 if (result < 0) { 11448 PyErr_SetString(PyExc_ValueError, "substring not found"); 11449 return NULL; 11450 } 11451 11452 return PyLong_FromSsize_t(result); 11453} 11454 11455PyDoc_STRVAR(islower__doc__, 11456 "S.islower() -> bool\n\ 11457\n\ 11458Return True if all cased characters in S are lowercase and there is\n\ 11459at least one cased character in S, False otherwise."); 11460 11461static PyObject* 11462unicode_islower(PyObject *self) 11463{ 11464 Py_ssize_t i, length; 11465 int kind; 
11466 void *data; 11467 int cased; 11468 11469 if (PyUnicode_READY(self) == -1) 11470 return NULL; 11471 length = PyUnicode_GET_LENGTH(self); 11472 kind = PyUnicode_KIND(self); 11473 data = PyUnicode_DATA(self); 11474 11475 /* Shortcut for single character strings */ 11476 if (length == 1) 11477 return PyBool_FromLong( 11478 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11479 11480 /* Special case for empty strings */ 11481 if (length == 0) 11482 return PyBool_FromLong(0); 11483 11484 cased = 0; 11485 for (i = 0; i < length; i++) { 11486 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11487 11488 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11489 return PyBool_FromLong(0); 11490 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11491 cased = 1; 11492 } 11493 return PyBool_FromLong(cased); 11494} 11495 11496PyDoc_STRVAR(isupper__doc__, 11497 "S.isupper() -> bool\n\ 11498\n\ 11499Return True if all cased characters in S are uppercase and there is\n\ 11500at least one cased character in S, False otherwise."); 11501 11502static PyObject* 11503unicode_isupper(PyObject *self) 11504{ 11505 Py_ssize_t i, length; 11506 int kind; 11507 void *data; 11508 int cased; 11509 11510 if (PyUnicode_READY(self) == -1) 11511 return NULL; 11512 length = PyUnicode_GET_LENGTH(self); 11513 kind = PyUnicode_KIND(self); 11514 data = PyUnicode_DATA(self); 11515 11516 /* Shortcut for single character strings */ 11517 if (length == 1) 11518 return PyBool_FromLong( 11519 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11520 11521 /* Special case for empty strings */ 11522 if (length == 0) 11523 return PyBool_FromLong(0); 11524 11525 cased = 0; 11526 for (i = 0; i < length; i++) { 11527 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11528 11529 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11530 return PyBool_FromLong(0); 11531 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11532 cased = 1; 11533 } 11534 return PyBool_FromLong(cased); 11535} 11536 
11537PyDoc_STRVAR(istitle__doc__, 11538 "S.istitle() -> bool\n\ 11539\n\ 11540Return True if S is a titlecased string and there is at least one\n\ 11541character in S, i.e. upper- and titlecase characters may only\n\ 11542follow uncased characters and lowercase characters only cased ones.\n\ 11543Return False otherwise."); 11544 11545static PyObject* 11546unicode_istitle(PyObject *self) 11547{ 11548 Py_ssize_t i, length; 11549 int kind; 11550 void *data; 11551 int cased, previous_is_cased; 11552 11553 if (PyUnicode_READY(self) == -1) 11554 return NULL; 11555 length = PyUnicode_GET_LENGTH(self); 11556 kind = PyUnicode_KIND(self); 11557 data = PyUnicode_DATA(self); 11558 11559 /* Shortcut for single character strings */ 11560 if (length == 1) { 11561 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11562 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11563 (Py_UNICODE_ISUPPER(ch) != 0)); 11564 } 11565 11566 /* Special case for empty strings */ 11567 if (length == 0) 11568 return PyBool_FromLong(0); 11569 11570 cased = 0; 11571 previous_is_cased = 0; 11572 for (i = 0; i < length; i++) { 11573 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11574 11575 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11576 if (previous_is_cased) 11577 return PyBool_FromLong(0); 11578 previous_is_cased = 1; 11579 cased = 1; 11580 } 11581 else if (Py_UNICODE_ISLOWER(ch)) { 11582 if (!previous_is_cased) 11583 return PyBool_FromLong(0); 11584 previous_is_cased = 1; 11585 cased = 1; 11586 } 11587 else 11588 previous_is_cased = 0; 11589 } 11590 return PyBool_FromLong(cased); 11591} 11592 11593PyDoc_STRVAR(isspace__doc__, 11594 "S.isspace() -> bool\n\ 11595\n\ 11596Return True if all characters in S are whitespace\n\ 11597and there is at least one character in S, False otherwise."); 11598 11599static PyObject* 11600unicode_isspace(PyObject *self) 11601{ 11602 Py_ssize_t i, length; 11603 int kind; 11604 void *data; 11605 11606 if (PyUnicode_READY(self) == -1) 11607 return NULL; 
11608 length = PyUnicode_GET_LENGTH(self); 11609 kind = PyUnicode_KIND(self); 11610 data = PyUnicode_DATA(self); 11611 11612 /* Shortcut for single character strings */ 11613 if (length == 1) 11614 return PyBool_FromLong( 11615 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11616 11617 /* Special case for empty strings */ 11618 if (length == 0) 11619 return PyBool_FromLong(0); 11620 11621 for (i = 0; i < length; i++) { 11622 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11623 if (!Py_UNICODE_ISSPACE(ch)) 11624 return PyBool_FromLong(0); 11625 } 11626 return PyBool_FromLong(1); 11627} 11628 11629PyDoc_STRVAR(isalpha__doc__, 11630 "S.isalpha() -> bool\n\ 11631\n\ 11632Return True if all characters in S are alphabetic\n\ 11633and there is at least one character in S, False otherwise."); 11634 11635static PyObject* 11636unicode_isalpha(PyObject *self) 11637{ 11638 Py_ssize_t i, length; 11639 int kind; 11640 void *data; 11641 11642 if (PyUnicode_READY(self) == -1) 11643 return NULL; 11644 length = PyUnicode_GET_LENGTH(self); 11645 kind = PyUnicode_KIND(self); 11646 data = PyUnicode_DATA(self); 11647 11648 /* Shortcut for single character strings */ 11649 if (length == 1) 11650 return PyBool_FromLong( 11651 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11652 11653 /* Special case for empty strings */ 11654 if (length == 0) 11655 return PyBool_FromLong(0); 11656 11657 for (i = 0; i < length; i++) { 11658 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11659 return PyBool_FromLong(0); 11660 } 11661 return PyBool_FromLong(1); 11662} 11663 11664PyDoc_STRVAR(isalnum__doc__, 11665 "S.isalnum() -> bool\n\ 11666\n\ 11667Return True if all characters in S are alphanumeric\n\ 11668and there is at least one character in S, False otherwise."); 11669 11670static PyObject* 11671unicode_isalnum(PyObject *self) 11672{ 11673 int kind; 11674 void *data; 11675 Py_ssize_t len, i; 11676 11677 if (PyUnicode_READY(self) == -1) 11678 return NULL; 11679 11680 kind = 
PyUnicode_KIND(self); 11681 data = PyUnicode_DATA(self); 11682 len = PyUnicode_GET_LENGTH(self); 11683 11684 /* Shortcut for single character strings */ 11685 if (len == 1) { 11686 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11687 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11688 } 11689 11690 /* Special case for empty strings */ 11691 if (len == 0) 11692 return PyBool_FromLong(0); 11693 11694 for (i = 0; i < len; i++) { 11695 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11696 if (!Py_UNICODE_ISALNUM(ch)) 11697 return PyBool_FromLong(0); 11698 } 11699 return PyBool_FromLong(1); 11700} 11701 11702PyDoc_STRVAR(isdecimal__doc__, 11703 "S.isdecimal() -> bool\n\ 11704\n\ 11705Return True if there are only decimal characters in S,\n\ 11706False otherwise."); 11707 11708static PyObject* 11709unicode_isdecimal(PyObject *self) 11710{ 11711 Py_ssize_t i, length; 11712 int kind; 11713 void *data; 11714 11715 if (PyUnicode_READY(self) == -1) 11716 return NULL; 11717 length = PyUnicode_GET_LENGTH(self); 11718 kind = PyUnicode_KIND(self); 11719 data = PyUnicode_DATA(self); 11720 11721 /* Shortcut for single character strings */ 11722 if (length == 1) 11723 return PyBool_FromLong( 11724 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11725 11726 /* Special case for empty strings */ 11727 if (length == 0) 11728 return PyBool_FromLong(0); 11729 11730 for (i = 0; i < length; i++) { 11731 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11732 return PyBool_FromLong(0); 11733 } 11734 return PyBool_FromLong(1); 11735} 11736 11737PyDoc_STRVAR(isdigit__doc__, 11738 "S.isdigit() -> bool\n\ 11739\n\ 11740Return True if all characters in S are digits\n\ 11741and there is at least one character in S, False otherwise."); 11742 11743static PyObject* 11744unicode_isdigit(PyObject *self) 11745{ 11746 Py_ssize_t i, length; 11747 int kind; 11748 void *data; 11749 11750 if (PyUnicode_READY(self) == -1) 11751 return NULL; 11752 length = PyUnicode_GET_LENGTH(self); 11753 
kind = PyUnicode_KIND(self); 11754 data = PyUnicode_DATA(self); 11755 11756 /* Shortcut for single character strings */ 11757 if (length == 1) { 11758 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11759 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11760 } 11761 11762 /* Special case for empty strings */ 11763 if (length == 0) 11764 return PyBool_FromLong(0); 11765 11766 for (i = 0; i < length; i++) { 11767 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11768 return PyBool_FromLong(0); 11769 } 11770 return PyBool_FromLong(1); 11771} 11772 11773PyDoc_STRVAR(isnumeric__doc__, 11774 "S.isnumeric() -> bool\n\ 11775\n\ 11776Return True if there are only numeric characters in S,\n\ 11777False otherwise."); 11778 11779static PyObject* 11780unicode_isnumeric(PyObject *self) 11781{ 11782 Py_ssize_t i, length; 11783 int kind; 11784 void *data; 11785 11786 if (PyUnicode_READY(self) == -1) 11787 return NULL; 11788 length = PyUnicode_GET_LENGTH(self); 11789 kind = PyUnicode_KIND(self); 11790 data = PyUnicode_DATA(self); 11791 11792 /* Shortcut for single character strings */ 11793 if (length == 1) 11794 return PyBool_FromLong( 11795 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11796 11797 /* Special case for empty strings */ 11798 if (length == 0) 11799 return PyBool_FromLong(0); 11800 11801 for (i = 0; i < length; i++) { 11802 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11803 return PyBool_FromLong(0); 11804 } 11805 return PyBool_FromLong(1); 11806} 11807 11808int 11809PyUnicode_IsIdentifier(PyObject *self) 11810{ 11811 int kind; 11812 void *data; 11813 Py_ssize_t i; 11814 Py_UCS4 first; 11815 11816 if (PyUnicode_READY(self) == -1) { 11817 Py_FatalError("identifier not ready"); 11818 return 0; 11819 } 11820 11821 /* Special case for empty strings */ 11822 if (PyUnicode_GET_LENGTH(self) == 0) 11823 return 0; 11824 kind = PyUnicode_KIND(self); 11825 data = PyUnicode_DATA(self); 11826 11827 /* PEP 3131 says that the first character must be in 
11828 XID_Start and subsequent characters in XID_Continue, 11829 and for the ASCII range, the 2.x rules apply (i.e 11830 start with letters and underscore, continue with 11831 letters, digits, underscore). However, given the current 11832 definition of XID_Start and XID_Continue, it is sufficient 11833 to check just for these, except that _ must be allowed 11834 as starting an identifier. */ 11835 first = PyUnicode_READ(kind, data, 0); 11836 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11837 return 0; 11838 11839 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11840 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11841 return 0; 11842 return 1; 11843} 11844 11845PyDoc_STRVAR(isidentifier__doc__, 11846 "S.isidentifier() -> bool\n\ 11847\n\ 11848Return True if S is a valid identifier according\n\ 11849to the language definition.\n\ 11850\n\ 11851Use keyword.iskeyword() to test for reserved identifiers\n\ 11852such as \"def\" and \"class\".\n"); 11853 11854static PyObject* 11855unicode_isidentifier(PyObject *self) 11856{ 11857 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11858} 11859 11860PyDoc_STRVAR(isprintable__doc__, 11861 "S.isprintable() -> bool\n\ 11862\n\ 11863Return True if all characters in S are considered\n\ 11864printable in repr() or S is empty, False otherwise."); 11865 11866static PyObject* 11867unicode_isprintable(PyObject *self) 11868{ 11869 Py_ssize_t i, length; 11870 int kind; 11871 void *data; 11872 11873 if (PyUnicode_READY(self) == -1) 11874 return NULL; 11875 length = PyUnicode_GET_LENGTH(self); 11876 kind = PyUnicode_KIND(self); 11877 data = PyUnicode_DATA(self); 11878 11879 /* Shortcut for single character strings */ 11880 if (length == 1) 11881 return PyBool_FromLong( 11882 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11883 11884 for (i = 0; i < length; i++) { 11885 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11886 Py_RETURN_FALSE; 11887 } 11888 } 11889 
Py_RETURN_TRUE; 11890} 11891 11892PyDoc_STRVAR(join__doc__, 11893 "S.join(iterable) -> str\n\ 11894\n\ 11895Return a string which is the concatenation of the strings in the\n\ 11896iterable. The separator between elements is S."); 11897 11898static PyObject* 11899unicode_join(PyObject *self, PyObject *data) 11900{ 11901 return PyUnicode_Join(self, data); 11902} 11903 11904static Py_ssize_t 11905unicode_length(PyObject *self) 11906{ 11907 if (PyUnicode_READY(self) == -1) 11908 return -1; 11909 return PyUnicode_GET_LENGTH(self); 11910} 11911 11912PyDoc_STRVAR(ljust__doc__, 11913 "S.ljust(width[, fillchar]) -> str\n\ 11914\n\ 11915Return S left-justified in a Unicode string of length width. Padding is\n\ 11916done using the specified fill character (default is a space)."); 11917 11918static PyObject * 11919unicode_ljust(PyObject *self, PyObject *args) 11920{ 11921 Py_ssize_t width; 11922 Py_UCS4 fillchar = ' '; 11923 11924 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11925 return NULL; 11926 11927 if (PyUnicode_READY(self) == -1) 11928 return NULL; 11929 11930 if (PyUnicode_GET_LENGTH(self) >= width) 11931 return unicode_result_unchanged(self); 11932 11933 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11934} 11935 11936PyDoc_STRVAR(lower__doc__, 11937 "S.lower() -> str\n\ 11938\n\ 11939Return a copy of the string S converted to lowercase."); 11940 11941static PyObject* 11942unicode_lower(PyObject *self) 11943{ 11944 if (PyUnicode_READY(self) == -1) 11945 return NULL; 11946 if (PyUnicode_IS_ASCII(self)) 11947 return ascii_upper_or_lower(self, 1); 11948 return case_operation(self, do_lower); 11949} 11950 11951#define LEFTSTRIP 0 11952#define RIGHTSTRIP 1 11953#define BOTHSTRIP 2 11954 11955/* Arrays indexed by above */ 11956static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11957 11958#define STRIPNAME(i) (stripformat[i]+3) 11959 11960/* externally visible for str.strip(unicode) */ 11961PyObject * 
11962_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11963{ 11964 void *data; 11965 int kind; 11966 Py_ssize_t i, j, len; 11967 BLOOM_MASK sepmask; 11968 Py_ssize_t seplen; 11969 11970 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11971 return NULL; 11972 11973 kind = PyUnicode_KIND(self); 11974 data = PyUnicode_DATA(self); 11975 len = PyUnicode_GET_LENGTH(self); 11976 seplen = PyUnicode_GET_LENGTH(sepobj); 11977 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11978 PyUnicode_DATA(sepobj), 11979 seplen); 11980 11981 i = 0; 11982 if (striptype != RIGHTSTRIP) { 11983 while (i < len) { 11984 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11985 if (!BLOOM(sepmask, ch)) 11986 break; 11987 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 11988 break; 11989 i++; 11990 } 11991 } 11992 11993 j = len; 11994 if (striptype != LEFTSTRIP) { 11995 j--; 11996 while (j >= i) { 11997 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 11998 if (!BLOOM(sepmask, ch)) 11999 break; 12000 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12001 break; 12002 j--; 12003 } 12004 12005 j++; 12006 } 12007 12008 return PyUnicode_Substring(self, i, j); 12009} 12010 12011PyObject* 12012PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 12013{ 12014 unsigned char *data; 12015 int kind; 12016 Py_ssize_t length; 12017 12018 if (PyUnicode_READY(self) == -1) 12019 return NULL; 12020 12021 length = PyUnicode_GET_LENGTH(self); 12022 end = Py_MIN(end, length); 12023 12024 if (start == 0 && end == length) 12025 return unicode_result_unchanged(self); 12026 12027 if (start < 0 || end < 0) { 12028 PyErr_SetString(PyExc_IndexError, "string index out of range"); 12029 return NULL; 12030 } 12031 if (start >= length || end < start) 12032 _Py_RETURN_UNICODE_EMPTY(); 12033 12034 length = end - start; 12035 if (PyUnicode_IS_ASCII(self)) { 12036 data = PyUnicode_1BYTE_DATA(self); 12037 return _PyUnicode_FromASCII((char*)(data + start), length); 12038 } 12039 
else { 12040 kind = PyUnicode_KIND(self); 12041 data = PyUnicode_1BYTE_DATA(self); 12042 return PyUnicode_FromKindAndData(kind, 12043 data + kind * start, 12044 length); 12045 } 12046} 12047 12048static PyObject * 12049do_strip(PyObject *self, int striptype) 12050{ 12051 Py_ssize_t len, i, j; 12052 12053 if (PyUnicode_READY(self) == -1) 12054 return NULL; 12055 12056 len = PyUnicode_GET_LENGTH(self); 12057 12058 if (PyUnicode_IS_ASCII(self)) { 12059 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 12060 12061 i = 0; 12062 if (striptype != RIGHTSTRIP) { 12063 while (i < len) { 12064 Py_UCS1 ch = data[i]; 12065 if (!_Py_ascii_whitespace[ch]) 12066 break; 12067 i++; 12068 } 12069 } 12070 12071 j = len; 12072 if (striptype != LEFTSTRIP) { 12073 j--; 12074 while (j >= i) { 12075 Py_UCS1 ch = data[j]; 12076 if (!_Py_ascii_whitespace[ch]) 12077 break; 12078 j--; 12079 } 12080 j++; 12081 } 12082 } 12083 else { 12084 int kind = PyUnicode_KIND(self); 12085 void *data = PyUnicode_DATA(self); 12086 12087 i = 0; 12088 if (striptype != RIGHTSTRIP) { 12089 while (i < len) { 12090 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12091 if (!Py_UNICODE_ISSPACE(ch)) 12092 break; 12093 i++; 12094 } 12095 } 12096 12097 j = len; 12098 if (striptype != LEFTSTRIP) { 12099 j--; 12100 while (j >= i) { 12101 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12102 if (!Py_UNICODE_ISSPACE(ch)) 12103 break; 12104 j--; 12105 } 12106 j++; 12107 } 12108 } 12109 12110 return PyUnicode_Substring(self, i, j); 12111} 12112 12113 12114static PyObject * 12115do_argstrip(PyObject *self, int striptype, PyObject *args) 12116{ 12117 PyObject *sep = NULL; 12118 12119 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep)) 12120 return NULL; 12121 12122 if (sep != NULL && sep != Py_None) { 12123 if (PyUnicode_Check(sep)) 12124 return _PyUnicode_XStrip(self, striptype, sep); 12125 else { 12126 PyErr_Format(PyExc_TypeError, 12127 "%s arg must be None or str", 12128 STRIPNAME(striptype)); 12129 return NULL; 12130 } 12131 } 
12132 12133 return do_strip(self, striptype); 12134} 12135 12136 12137PyDoc_STRVAR(strip__doc__, 12138 "S.strip([chars]) -> str\n\ 12139\n\ 12140Return a copy of the string S with leading and trailing\n\ 12141whitespace removed.\n\ 12142If chars is given and not None, remove characters in chars instead."); 12143 12144static PyObject * 12145unicode_strip(PyObject *self, PyObject *args) 12146{ 12147 if (PyTuple_GET_SIZE(args) == 0) 12148 return do_strip(self, BOTHSTRIP); /* Common case */ 12149 else 12150 return do_argstrip(self, BOTHSTRIP, args); 12151} 12152 12153 12154PyDoc_STRVAR(lstrip__doc__, 12155 "S.lstrip([chars]) -> str\n\ 12156\n\ 12157Return a copy of the string S with leading whitespace removed.\n\ 12158If chars is given and not None, remove characters in chars instead."); 12159 12160static PyObject * 12161unicode_lstrip(PyObject *self, PyObject *args) 12162{ 12163 if (PyTuple_GET_SIZE(args) == 0) 12164 return do_strip(self, LEFTSTRIP); /* Common case */ 12165 else 12166 return do_argstrip(self, LEFTSTRIP, args); 12167} 12168 12169 12170PyDoc_STRVAR(rstrip__doc__, 12171 "S.rstrip([chars]) -> str\n\ 12172\n\ 12173Return a copy of the string S with trailing whitespace removed.\n\ 12174If chars is given and not None, remove characters in chars instead."); 12175 12176static PyObject * 12177unicode_rstrip(PyObject *self, PyObject *args) 12178{ 12179 if (PyTuple_GET_SIZE(args) == 0) 12180 return do_strip(self, RIGHTSTRIP); /* Common case */ 12181 else 12182 return do_argstrip(self, RIGHTSTRIP, args); 12183} 12184 12185 12186static PyObject* 12187unicode_repeat(PyObject *str, Py_ssize_t len) 12188{ 12189 PyObject *u; 12190 Py_ssize_t nchars, n; 12191 12192 if (len < 1) 12193 _Py_RETURN_UNICODE_EMPTY(); 12194 12195 /* no repeat, return original string */ 12196 if (len == 1) 12197 return unicode_result_unchanged(str); 12198 12199 if (PyUnicode_READY(str) == -1) 12200 return NULL; 12201 12202 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 12203 
PyErr_SetString(PyExc_OverflowError, 12204 "repeated string is too long"); 12205 return NULL; 12206 } 12207 nchars = len * PyUnicode_GET_LENGTH(str); 12208 12209 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 12210 if (!u) 12211 return NULL; 12212 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 12213 12214 if (PyUnicode_GET_LENGTH(str) == 1) { 12215 const int kind = PyUnicode_KIND(str); 12216 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 12217 if (kind == PyUnicode_1BYTE_KIND) { 12218 void *to = PyUnicode_DATA(u); 12219 memset(to, (unsigned char)fill_char, len); 12220 } 12221 else if (kind == PyUnicode_2BYTE_KIND) { 12222 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 12223 for (n = 0; n < len; ++n) 12224 ucs2[n] = fill_char; 12225 } else { 12226 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 12227 assert(kind == PyUnicode_4BYTE_KIND); 12228 for (n = 0; n < len; ++n) 12229 ucs4[n] = fill_char; 12230 } 12231 } 12232 else { 12233 /* number of characters copied this far */ 12234 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 12235 const Py_ssize_t char_size = PyUnicode_KIND(str); 12236 char *to = (char *) PyUnicode_DATA(u); 12237 Py_MEMCPY(to, PyUnicode_DATA(str), 12238 PyUnicode_GET_LENGTH(str) * char_size); 12239 while (done < nchars) { 12240 n = (done <= nchars-done) ? 
done : nchars-done; 12241 Py_MEMCPY(to + (done * char_size), to, n * char_size); 12242 done += n; 12243 } 12244 } 12245 12246 assert(_PyUnicode_CheckConsistency(u, 1)); 12247 return u; 12248} 12249 12250PyObject * 12251PyUnicode_Replace(PyObject *obj, 12252 PyObject *subobj, 12253 PyObject *replobj, 12254 Py_ssize_t maxcount) 12255{ 12256 PyObject *self; 12257 PyObject *str1; 12258 PyObject *str2; 12259 PyObject *result; 12260 12261 self = PyUnicode_FromObject(obj); 12262 if (self == NULL) 12263 return NULL; 12264 str1 = PyUnicode_FromObject(subobj); 12265 if (str1 == NULL) { 12266 Py_DECREF(self); 12267 return NULL; 12268 } 12269 str2 = PyUnicode_FromObject(replobj); 12270 if (str2 == NULL) { 12271 Py_DECREF(self); 12272 Py_DECREF(str1); 12273 return NULL; 12274 } 12275 if (PyUnicode_READY(self) == -1 || 12276 PyUnicode_READY(str1) == -1 || 12277 PyUnicode_READY(str2) == -1) 12278 result = NULL; 12279 else 12280 result = replace(self, str1, str2, maxcount); 12281 Py_DECREF(self); 12282 Py_DECREF(str1); 12283 Py_DECREF(str2); 12284 return result; 12285} 12286 12287PyDoc_STRVAR(replace__doc__, 12288 "S.replace(old, new[, count]) -> str\n\ 12289\n\ 12290Return a copy of S with all occurrences of substring\n\ 12291old replaced by new. 
If the optional argument count is\n\ 12292given, only the first count occurrences are replaced."); 12293 12294static PyObject* 12295unicode_replace(PyObject *self, PyObject *args) 12296{ 12297 PyObject *str1; 12298 PyObject *str2; 12299 Py_ssize_t maxcount = -1; 12300 PyObject *result; 12301 12302 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 12303 return NULL; 12304 if (PyUnicode_READY(self) == -1) 12305 return NULL; 12306 str1 = PyUnicode_FromObject(str1); 12307 if (str1 == NULL) 12308 return NULL; 12309 str2 = PyUnicode_FromObject(str2); 12310 if (str2 == NULL) { 12311 Py_DECREF(str1); 12312 return NULL; 12313 } 12314 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) 12315 result = NULL; 12316 else 12317 result = replace(self, str1, str2, maxcount); 12318 12319 Py_DECREF(str1); 12320 Py_DECREF(str2); 12321 return result; 12322} 12323 12324static PyObject * 12325unicode_repr(PyObject *unicode) 12326{ 12327 PyObject *repr; 12328 Py_ssize_t isize; 12329 Py_ssize_t osize, squote, dquote, i, o; 12330 Py_UCS4 max, quote; 12331 int ikind, okind, unchanged; 12332 void *idata, *odata; 12333 12334 if (PyUnicode_READY(unicode) == -1) 12335 return NULL; 12336 12337 isize = PyUnicode_GET_LENGTH(unicode); 12338 idata = PyUnicode_DATA(unicode); 12339 12340 /* Compute length of output, quote characters, and 12341 maximum character */ 12342 osize = 0; 12343 max = 127; 12344 squote = dquote = 0; 12345 ikind = PyUnicode_KIND(unicode); 12346 for (i = 0; i < isize; i++) { 12347 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12348 Py_ssize_t incr = 1; 12349 switch (ch) { 12350 case '\'': squote++; break; 12351 case '"': dquote++; break; 12352 case '\\': case '\t': case '\r': case '\n': 12353 incr = 2; 12354 break; 12355 default: 12356 /* Fast-path ASCII */ 12357 if (ch < ' ' || ch == 0x7f) 12358 incr = 4; /* \xHH */ 12359 else if (ch < 0x7f) 12360 ; 12361 else if (Py_UNICODE_ISPRINTABLE(ch)) 12362 max = ch > max ? 
ch : max; 12363 else if (ch < 0x100) 12364 incr = 4; /* \xHH */ 12365 else if (ch < 0x10000) 12366 incr = 6; /* \uHHHH */ 12367 else 12368 incr = 10; /* \uHHHHHHHH */ 12369 } 12370 if (osize > PY_SSIZE_T_MAX - incr) { 12371 PyErr_SetString(PyExc_OverflowError, 12372 "string is too long to generate repr"); 12373 return NULL; 12374 } 12375 osize += incr; 12376 } 12377 12378 quote = '\''; 12379 unchanged = (osize == isize); 12380 if (squote) { 12381 unchanged = 0; 12382 if (dquote) 12383 /* Both squote and dquote present. Use squote, 12384 and escape them */ 12385 osize += squote; 12386 else 12387 quote = '"'; 12388 } 12389 osize += 2; /* quotes */ 12390 12391 repr = PyUnicode_New(osize, max); 12392 if (repr == NULL) 12393 return NULL; 12394 okind = PyUnicode_KIND(repr); 12395 odata = PyUnicode_DATA(repr); 12396 12397 PyUnicode_WRITE(okind, odata, 0, quote); 12398 PyUnicode_WRITE(okind, odata, osize-1, quote); 12399 if (unchanged) { 12400 _PyUnicode_FastCopyCharacters(repr, 1, 12401 unicode, 0, 12402 isize); 12403 } 12404 else { 12405 for (i = 0, o = 1; i < isize; i++) { 12406 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12407 12408 /* Escape quotes and backslashes */ 12409 if ((ch == quote) || (ch == '\\')) { 12410 PyUnicode_WRITE(okind, odata, o++, '\\'); 12411 PyUnicode_WRITE(okind, odata, o++, ch); 12412 continue; 12413 } 12414 12415 /* Map special whitespace to '\t', \n', '\r' */ 12416 if (ch == '\t') { 12417 PyUnicode_WRITE(okind, odata, o++, '\\'); 12418 PyUnicode_WRITE(okind, odata, o++, 't'); 12419 } 12420 else if (ch == '\n') { 12421 PyUnicode_WRITE(okind, odata, o++, '\\'); 12422 PyUnicode_WRITE(okind, odata, o++, 'n'); 12423 } 12424 else if (ch == '\r') { 12425 PyUnicode_WRITE(okind, odata, o++, '\\'); 12426 PyUnicode_WRITE(okind, odata, o++, 'r'); 12427 } 12428 12429 /* Map non-printable US ASCII to '\xhh' */ 12430 else if (ch < ' ' || ch == 0x7F) { 12431 PyUnicode_WRITE(okind, odata, o++, '\\'); 12432 PyUnicode_WRITE(okind, odata, o++, 'x'); 12433 
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12434 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12435 } 12436 12437 /* Copy ASCII characters as-is */ 12438 else if (ch < 0x7F) { 12439 PyUnicode_WRITE(okind, odata, o++, ch); 12440 } 12441 12442 /* Non-ASCII characters */ 12443 else { 12444 /* Map Unicode whitespace and control characters 12445 (categories Z* and C* except ASCII space) 12446 */ 12447 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12448 PyUnicode_WRITE(okind, odata, o++, '\\'); 12449 /* Map 8-bit characters to '\xhh' */ 12450 if (ch <= 0xff) { 12451 PyUnicode_WRITE(okind, odata, o++, 'x'); 12452 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12453 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12454 } 12455 /* Map 16-bit characters to '\uxxxx' */ 12456 else if (ch <= 0xffff) { 12457 PyUnicode_WRITE(okind, odata, o++, 'u'); 12458 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12459 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12460 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12462 } 12463 /* Map 21-bit characters to '\U00xxxxxx' */ 12464 else { 12465 PyUnicode_WRITE(okind, odata, o++, 'U'); 12466 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12467 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12468 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12469 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12470 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12471 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12472 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12473 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12474 } 12475 } 12476 /* Copy characters as-is */ 12477 else { 12478 PyUnicode_WRITE(okind, 
odata, o++, ch); 12479 } 12480 } 12481 } 12482 } 12483 /* Closing quote already added at the beginning */ 12484 assert(_PyUnicode_CheckConsistency(repr, 1)); 12485 return repr; 12486} 12487 12488PyDoc_STRVAR(rfind__doc__, 12489 "S.rfind(sub[, start[, end]]) -> int\n\ 12490\n\ 12491Return the highest index in S where substring sub is found,\n\ 12492such that sub is contained within S[start:end]. Optional\n\ 12493arguments start and end are interpreted as in slice notation.\n\ 12494\n\ 12495Return -1 on failure."); 12496 12497static PyObject * 12498unicode_rfind(PyObject *self, PyObject *args) 12499{ 12500 PyObject *substring = NULL; 12501 Py_ssize_t start = 0; 12502 Py_ssize_t end = 0; 12503 Py_ssize_t result; 12504 12505 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12506 &start, &end)) 12507 return NULL; 12508 12509 if (PyUnicode_READY(self) == -1) { 12510 Py_DECREF(substring); 12511 return NULL; 12512 } 12513 if (PyUnicode_READY(substring) == -1) { 12514 Py_DECREF(substring); 12515 return NULL; 12516 } 12517 12518 result = any_find_slice(-1, self, substring, start, end); 12519 12520 Py_DECREF(substring); 12521 12522 if (result == -2) 12523 return NULL; 12524 12525 return PyLong_FromSsize_t(result); 12526} 12527 12528PyDoc_STRVAR(rindex__doc__, 12529 "S.rindex(sub[, start[, end]]) -> int\n\ 12530\n\ 12531Like S.rfind() but raise ValueError when the substring is not found."); 12532 12533static PyObject * 12534unicode_rindex(PyObject *self, PyObject *args) 12535{ 12536 PyObject *substring = NULL; 12537 Py_ssize_t start = 0; 12538 Py_ssize_t end = 0; 12539 Py_ssize_t result; 12540 12541 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12542 &start, &end)) 12543 return NULL; 12544 12545 if (PyUnicode_READY(self) == -1) { 12546 Py_DECREF(substring); 12547 return NULL; 12548 } 12549 if (PyUnicode_READY(substring) == -1) { 12550 Py_DECREF(substring); 12551 return NULL; 12552 } 12553 12554 result = any_find_slice(-1, self, 
substring, start, end); 12555 12556 Py_DECREF(substring); 12557 12558 if (result == -2) 12559 return NULL; 12560 12561 if (result < 0) { 12562 PyErr_SetString(PyExc_ValueError, "substring not found"); 12563 return NULL; 12564 } 12565 12566 return PyLong_FromSsize_t(result); 12567} 12568 12569PyDoc_STRVAR(rjust__doc__, 12570 "S.rjust(width[, fillchar]) -> str\n\ 12571\n\ 12572Return S right-justified in a string of length width. Padding is\n\ 12573done using the specified fill character (default is a space)."); 12574 12575static PyObject * 12576unicode_rjust(PyObject *self, PyObject *args) 12577{ 12578 Py_ssize_t width; 12579 Py_UCS4 fillchar = ' '; 12580 12581 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12582 return NULL; 12583 12584 if (PyUnicode_READY(self) == -1) 12585 return NULL; 12586 12587 if (PyUnicode_GET_LENGTH(self) >= width) 12588 return unicode_result_unchanged(self); 12589 12590 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12591} 12592 12593PyObject * 12594PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12595{ 12596 PyObject *result; 12597 12598 s = PyUnicode_FromObject(s); 12599 if (s == NULL) 12600 return NULL; 12601 if (sep != NULL) { 12602 sep = PyUnicode_FromObject(sep); 12603 if (sep == NULL) { 12604 Py_DECREF(s); 12605 return NULL; 12606 } 12607 } 12608 12609 result = split(s, sep, maxsplit); 12610 12611 Py_DECREF(s); 12612 Py_XDECREF(sep); 12613 return result; 12614} 12615 12616PyDoc_STRVAR(split__doc__, 12617 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12618\n\ 12619Return a list of the words in S, using sep as the\n\ 12620delimiter string. If maxsplit is given, at most maxsplit\n\ 12621splits are done. 
If sep is not specified or is None, any\n\ 12622whitespace string is a separator and empty strings are\n\ 12623removed from the result."); 12624 12625static PyObject* 12626unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12627{ 12628 static char *kwlist[] = {"sep", "maxsplit", 0}; 12629 PyObject *substring = Py_None; 12630 Py_ssize_t maxcount = -1; 12631 12632 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12633 kwlist, &substring, &maxcount)) 12634 return NULL; 12635 12636 if (substring == Py_None) 12637 return split(self, NULL, maxcount); 12638 else if (PyUnicode_Check(substring)) 12639 return split(self, substring, maxcount); 12640 else 12641 return PyUnicode_Split(self, substring, maxcount); 12642} 12643 12644PyObject * 12645PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12646{ 12647 PyObject* str_obj; 12648 PyObject* sep_obj; 12649 PyObject* out; 12650 int kind1, kind2, kind; 12651 void *buf1 = NULL, *buf2 = NULL; 12652 Py_ssize_t len1, len2; 12653 12654 str_obj = PyUnicode_FromObject(str_in); 12655 if (!str_obj) 12656 return NULL; 12657 sep_obj = PyUnicode_FromObject(sep_in); 12658 if (!sep_obj) { 12659 Py_DECREF(str_obj); 12660 return NULL; 12661 } 12662 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12663 Py_DECREF(sep_obj); 12664 Py_DECREF(str_obj); 12665 return NULL; 12666 } 12667 12668 kind1 = PyUnicode_KIND(str_obj); 12669 kind2 = PyUnicode_KIND(sep_obj); 12670 kind = Py_MAX(kind1, kind2); 12671 buf1 = PyUnicode_DATA(str_obj); 12672 if (kind1 != kind) 12673 buf1 = _PyUnicode_AsKind(str_obj, kind); 12674 if (!buf1) 12675 goto onError; 12676 buf2 = PyUnicode_DATA(sep_obj); 12677 if (kind2 != kind) 12678 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12679 if (!buf2) 12680 goto onError; 12681 len1 = PyUnicode_GET_LENGTH(str_obj); 12682 len2 = PyUnicode_GET_LENGTH(sep_obj); 12683 12684 switch (kind) { 12685 case PyUnicode_1BYTE_KIND: 12686 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 
12687 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12688 else 12689 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12690 break; 12691 case PyUnicode_2BYTE_KIND: 12692 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12693 break; 12694 case PyUnicode_4BYTE_KIND: 12695 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12696 break; 12697 default: 12698 assert(0); 12699 out = 0; 12700 } 12701 12702 Py_DECREF(sep_obj); 12703 Py_DECREF(str_obj); 12704 if (kind1 != kind) 12705 PyMem_Free(buf1); 12706 if (kind2 != kind) 12707 PyMem_Free(buf2); 12708 12709 return out; 12710 onError: 12711 Py_DECREF(sep_obj); 12712 Py_DECREF(str_obj); 12713 if (kind1 != kind && buf1) 12714 PyMem_Free(buf1); 12715 if (kind2 != kind && buf2) 12716 PyMem_Free(buf2); 12717 return NULL; 12718} 12719 12720 12721PyObject * 12722PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12723{ 12724 PyObject* str_obj; 12725 PyObject* sep_obj; 12726 PyObject* out; 12727 int kind1, kind2, kind; 12728 void *buf1 = NULL, *buf2 = NULL; 12729 Py_ssize_t len1, len2; 12730 12731 str_obj = PyUnicode_FromObject(str_in); 12732 if (!str_obj) 12733 return NULL; 12734 sep_obj = PyUnicode_FromObject(sep_in); 12735 if (!sep_obj) { 12736 Py_DECREF(str_obj); 12737 return NULL; 12738 } 12739 12740 kind1 = PyUnicode_KIND(str_obj); 12741 kind2 = PyUnicode_KIND(sep_obj); 12742 kind = Py_MAX(kind1, kind2); 12743 buf1 = PyUnicode_DATA(str_obj); 12744 if (kind1 != kind) 12745 buf1 = _PyUnicode_AsKind(str_obj, kind); 12746 if (!buf1) 12747 goto onError; 12748 buf2 = PyUnicode_DATA(sep_obj); 12749 if (kind2 != kind) 12750 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12751 if (!buf2) 12752 goto onError; 12753 len1 = PyUnicode_GET_LENGTH(str_obj); 12754 len2 = PyUnicode_GET_LENGTH(sep_obj); 12755 12756 switch (kind) { 12757 case PyUnicode_1BYTE_KIND: 12758 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12759 out = 
asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12760 else 12761 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12762 break; 12763 case PyUnicode_2BYTE_KIND: 12764 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12765 break; 12766 case PyUnicode_4BYTE_KIND: 12767 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12768 break; 12769 default: 12770 assert(0); 12771 out = 0; 12772 } 12773 12774 Py_DECREF(sep_obj); 12775 Py_DECREF(str_obj); 12776 if (kind1 != kind) 12777 PyMem_Free(buf1); 12778 if (kind2 != kind) 12779 PyMem_Free(buf2); 12780 12781 return out; 12782 onError: 12783 Py_DECREF(sep_obj); 12784 Py_DECREF(str_obj); 12785 if (kind1 != kind && buf1) 12786 PyMem_Free(buf1); 12787 if (kind2 != kind && buf2) 12788 PyMem_Free(buf2); 12789 return NULL; 12790} 12791 12792PyDoc_STRVAR(partition__doc__, 12793 "S.partition(sep) -> (head, sep, tail)\n\ 12794\n\ 12795Search for the separator sep in S, and return the part before it,\n\ 12796the separator itself, and the part after it. If the separator is not\n\ 12797found, return S and two empty strings."); 12798 12799static PyObject* 12800unicode_partition(PyObject *self, PyObject *separator) 12801{ 12802 return PyUnicode_Partition(self, separator); 12803} 12804 12805PyDoc_STRVAR(rpartition__doc__, 12806 "S.rpartition(sep) -> (head, sep, tail)\n\ 12807\n\ 12808Search for the separator sep in S, starting at the end of S, and return\n\ 12809the part before it, the separator itself, and the part after it. 
If the\n\ 12810separator is not found, return two empty strings and S."); 12811 12812static PyObject* 12813unicode_rpartition(PyObject *self, PyObject *separator) 12814{ 12815 return PyUnicode_RPartition(self, separator); 12816} 12817 12818PyObject * 12819PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12820{ 12821 PyObject *result; 12822 12823 s = PyUnicode_FromObject(s); 12824 if (s == NULL) 12825 return NULL; 12826 if (sep != NULL) { 12827 sep = PyUnicode_FromObject(sep); 12828 if (sep == NULL) { 12829 Py_DECREF(s); 12830 return NULL; 12831 } 12832 } 12833 12834 result = rsplit(s, sep, maxsplit); 12835 12836 Py_DECREF(s); 12837 Py_XDECREF(sep); 12838 return result; 12839} 12840 12841PyDoc_STRVAR(rsplit__doc__, 12842 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12843\n\ 12844Return a list of the words in S, using sep as the\n\ 12845delimiter string, starting at the end of the string and\n\ 12846working to the front. If maxsplit is given, at most maxsplit\n\ 12847splits are done. 
If sep is not specified, any whitespace string\n\ 12848is a separator."); 12849 12850static PyObject* 12851unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12852{ 12853 static char *kwlist[] = {"sep", "maxsplit", 0}; 12854 PyObject *substring = Py_None; 12855 Py_ssize_t maxcount = -1; 12856 12857 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12858 kwlist, &substring, &maxcount)) 12859 return NULL; 12860 12861 if (substring == Py_None) 12862 return rsplit(self, NULL, maxcount); 12863 else if (PyUnicode_Check(substring)) 12864 return rsplit(self, substring, maxcount); 12865 else 12866 return PyUnicode_RSplit(self, substring, maxcount); 12867} 12868 12869PyDoc_STRVAR(splitlines__doc__, 12870 "S.splitlines([keepends]) -> list of strings\n\ 12871\n\ 12872Return a list of the lines in S, breaking at line boundaries.\n\ 12873Line breaks are not included in the resulting list unless keepends\n\ 12874is given and true."); 12875 12876static PyObject* 12877unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12878{ 12879 static char *kwlist[] = {"keepends", 0}; 12880 int keepends = 0; 12881 12882 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12883 kwlist, &keepends)) 12884 return NULL; 12885 12886 return PyUnicode_Splitlines(self, keepends); 12887} 12888 12889static 12890PyObject *unicode_str(PyObject *self) 12891{ 12892 return unicode_result_unchanged(self); 12893} 12894 12895PyDoc_STRVAR(swapcase__doc__, 12896 "S.swapcase() -> str\n\ 12897\n\ 12898Return a copy of S with uppercase characters converted to lowercase\n\ 12899and vice versa."); 12900 12901static PyObject* 12902unicode_swapcase(PyObject *self) 12903{ 12904 if (PyUnicode_READY(self) == -1) 12905 return NULL; 12906 return case_operation(self, do_swapcase); 12907} 12908 12909/*[clinic input] 12910 12911@staticmethod 12912str.maketrans as unicode_maketrans 12913 12914 x: object 12915 12916 y: unicode=NULL 12917 12918 z: unicode=NULL 12919 12920 / 12921 
12922Return a translation table usable for str.translate(). 12923 12924If there is only one argument, it must be a dictionary mapping Unicode 12925ordinals (integers) or characters to Unicode ordinals, strings or None. 12926Character keys will be then converted to ordinals. 12927If there are two arguments, they must be strings of equal length, and 12928in the resulting dictionary, each character in x will be mapped to the 12929character at the same position in y. If there is a third argument, it 12930must be a string, whose characters will be mapped to None in the result. 12931[clinic start generated code]*/ 12932 12933PyDoc_STRVAR(unicode_maketrans__doc__, 12934"maketrans(x, y=None, z=None, /)\n" 12935"--\n" 12936"\n" 12937"Return a translation table usable for str.translate().\n" 12938"\n" 12939"If there is only one argument, it must be a dictionary mapping Unicode\n" 12940"ordinals (integers) or characters to Unicode ordinals, strings or None.\n" 12941"Character keys will be then converted to ordinals.\n" 12942"If there are two arguments, they must be strings of equal length, and\n" 12943"in the resulting dictionary, each character in x will be mapped to the\n" 12944"character at the same position in y. 
If there is a third argument, it\n" 12945"must be a string, whose characters will be mapped to None in the result."); 12946 12947#define UNICODE_MAKETRANS_METHODDEF \ 12948 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__}, 12949 12950static PyObject * 12951unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z); 12952 12953static PyObject * 12954unicode_maketrans(void *null, PyObject *args) 12955{ 12956 PyObject *return_value = NULL; 12957 PyObject *x; 12958 PyObject *y = NULL; 12959 PyObject *z = NULL; 12960 12961 if (!PyArg_ParseTuple(args, 12962 "O|UU:maketrans", 12963 &x, &y, &z)) 12964 goto exit; 12965 return_value = unicode_maketrans_impl(x, y, z); 12966 12967exit: 12968 return return_value; 12969} 12970 12971static PyObject * 12972unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z) 12973/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/ 12974{ 12975 PyObject *new = NULL, *key, *value; 12976 Py_ssize_t i = 0; 12977 int res; 12978 12979 new = PyDict_New(); 12980 if (!new) 12981 return NULL; 12982 if (y != NULL) { 12983 int x_kind, y_kind, z_kind; 12984 void *x_data, *y_data, *z_data; 12985 12986 /* x must be a string too, of equal length */ 12987 if (!PyUnicode_Check(x)) { 12988 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12989 "be a string if there is a second argument"); 12990 goto err; 12991 } 12992 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12993 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12994 "arguments must have equal length"); 12995 goto err; 12996 } 12997 /* create entries for translating chars in x to those in y */ 12998 x_kind = PyUnicode_KIND(x); 12999 y_kind = PyUnicode_KIND(y); 13000 x_data = PyUnicode_DATA(x); 13001 y_data = PyUnicode_DATA(y); 13002 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 13003 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 13004 if (!key) 13005 goto err; 13006 
value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 13007 if (!value) { 13008 Py_DECREF(key); 13009 goto err; 13010 } 13011 res = PyDict_SetItem(new, key, value); 13012 Py_DECREF(key); 13013 Py_DECREF(value); 13014 if (res < 0) 13015 goto err; 13016 } 13017 /* create entries for deleting chars in z */ 13018 if (z != NULL) { 13019 z_kind = PyUnicode_KIND(z); 13020 z_data = PyUnicode_DATA(z); 13021 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 13022 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 13023 if (!key) 13024 goto err; 13025 res = PyDict_SetItem(new, key, Py_None); 13026 Py_DECREF(key); 13027 if (res < 0) 13028 goto err; 13029 } 13030 } 13031 } else { 13032 int kind; 13033 void *data; 13034 13035 /* x must be a dict */ 13036 if (!PyDict_CheckExact(x)) { 13037 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 13038 "to maketrans it must be a dict"); 13039 goto err; 13040 } 13041 /* copy entries into the new dict, converting string keys to int keys */ 13042 while (PyDict_Next(x, &i, &key, &value)) { 13043 if (PyUnicode_Check(key)) { 13044 /* convert string keys to integer keys */ 13045 PyObject *newkey; 13046 if (PyUnicode_GET_LENGTH(key) != 1) { 13047 PyErr_SetString(PyExc_ValueError, "string keys in translate " 13048 "table must be of length 1"); 13049 goto err; 13050 } 13051 kind = PyUnicode_KIND(key); 13052 data = PyUnicode_DATA(key); 13053 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 13054 if (!newkey) 13055 goto err; 13056 res = PyDict_SetItem(new, newkey, value); 13057 Py_DECREF(newkey); 13058 if (res < 0) 13059 goto err; 13060 } else if (PyLong_Check(key)) { 13061 /* just keep integer keys */ 13062 if (PyDict_SetItem(new, key, value) < 0) 13063 goto err; 13064 } else { 13065 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 13066 "be strings or integers"); 13067 goto err; 13068 } 13069 } 13070 } 13071 return new; 13072 err: 13073 Py_DECREF(new); 13074 return NULL; 13075} 13076 
13077PyDoc_STRVAR(translate__doc__, 13078 "S.translate(table) -> str\n\ 13079\n\ 13080Return a copy of the string S, where all characters have been mapped\n\ 13081through the given translation table, which must be a mapping of\n\ 13082Unicode ordinals to Unicode ordinals, strings, or None.\n\ 13083Unmapped characters are left untouched. Characters mapped to None\n\ 13084are deleted."); 13085 13086static PyObject* 13087unicode_translate(PyObject *self, PyObject *table) 13088{ 13089 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 13090} 13091 13092PyDoc_STRVAR(upper__doc__, 13093 "S.upper() -> str\n\ 13094\n\ 13095Return a copy of S converted to uppercase."); 13096 13097static PyObject* 13098unicode_upper(PyObject *self) 13099{ 13100 if (PyUnicode_READY(self) == -1) 13101 return NULL; 13102 if (PyUnicode_IS_ASCII(self)) 13103 return ascii_upper_or_lower(self, 0); 13104 return case_operation(self, do_upper); 13105} 13106 13107PyDoc_STRVAR(zfill__doc__, 13108 "S.zfill(width) -> str\n\ 13109\n\ 13110Pad a numeric string S with zeros on the left, to fill a field\n\ 13111of the specified width. 
The string S is never truncated."); 13112 13113static PyObject * 13114unicode_zfill(PyObject *self, PyObject *args) 13115{ 13116 Py_ssize_t fill; 13117 PyObject *u; 13118 Py_ssize_t width; 13119 int kind; 13120 void *data; 13121 Py_UCS4 chr; 13122 13123 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 13124 return NULL; 13125 13126 if (PyUnicode_READY(self) == -1) 13127 return NULL; 13128 13129 if (PyUnicode_GET_LENGTH(self) >= width) 13130 return unicode_result_unchanged(self); 13131 13132 fill = width - PyUnicode_GET_LENGTH(self); 13133 13134 u = pad(self, fill, 0, '0'); 13135 13136 if (u == NULL) 13137 return NULL; 13138 13139 kind = PyUnicode_KIND(u); 13140 data = PyUnicode_DATA(u); 13141 chr = PyUnicode_READ(kind, data, fill); 13142 13143 if (chr == '+' || chr == '-') { 13144 /* move sign to beginning of string */ 13145 PyUnicode_WRITE(kind, data, 0, chr); 13146 PyUnicode_WRITE(kind, data, fill, '0'); 13147 } 13148 13149 assert(_PyUnicode_CheckConsistency(u, 1)); 13150 return u; 13151} 13152 13153#if 0 13154static PyObject * 13155unicode__decimal2ascii(PyObject *self) 13156{ 13157 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 13158} 13159#endif 13160 13161PyDoc_STRVAR(startswith__doc__, 13162 "S.startswith(prefix[, start[, end]]) -> bool\n\ 13163\n\ 13164Return True if S starts with the specified prefix, False otherwise.\n\ 13165With optional start, test S beginning at that position.\n\ 13166With optional end, stop comparing S at that position.\n\ 13167prefix can also be a tuple of strings to try."); 13168 13169static PyObject * 13170unicode_startswith(PyObject *self, 13171 PyObject *args) 13172{ 13173 PyObject *subobj; 13174 PyObject *substring; 13175 Py_ssize_t start = 0; 13176 Py_ssize_t end = PY_SSIZE_T_MAX; 13177 int result; 13178 13179 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 13180 return NULL; 13181 if (PyTuple_Check(subobj)) { 13182 Py_ssize_t i; 13183 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13184 
substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 13185 if (substring == NULL) 13186 return NULL; 13187 result = tailmatch(self, substring, start, end, -1); 13188 Py_DECREF(substring); 13189 if (result == -1) 13190 return NULL; 13191 if (result) { 13192 Py_RETURN_TRUE; 13193 } 13194 } 13195 /* nothing matched */ 13196 Py_RETURN_FALSE; 13197 } 13198 substring = PyUnicode_FromObject(subobj); 13199 if (substring == NULL) { 13200 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13201 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 13202 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13203 return NULL; 13204 } 13205 result = tailmatch(self, substring, start, end, -1); 13206 Py_DECREF(substring); 13207 if (result == -1) 13208 return NULL; 13209 return PyBool_FromLong(result); 13210} 13211 13212 13213PyDoc_STRVAR(endswith__doc__, 13214 "S.endswith(suffix[, start[, end]]) -> bool\n\ 13215\n\ 13216Return True if S ends with the specified suffix, False otherwise.\n\ 13217With optional start, test S beginning at that position.\n\ 13218With optional end, stop comparing S at that position.\n\ 13219suffix can also be a tuple of strings to try."); 13220 13221static PyObject * 13222unicode_endswith(PyObject *self, 13223 PyObject *args) 13224{ 13225 PyObject *subobj; 13226 PyObject *substring; 13227 Py_ssize_t start = 0; 13228 Py_ssize_t end = PY_SSIZE_T_MAX; 13229 int result; 13230 13231 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 13232 return NULL; 13233 if (PyTuple_Check(subobj)) { 13234 Py_ssize_t i; 13235 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13236 substring = PyUnicode_FromObject( 13237 PyTuple_GET_ITEM(subobj, i)); 13238 if (substring == NULL) 13239 return NULL; 13240 result = tailmatch(self, substring, start, end, +1); 13241 Py_DECREF(substring); 13242 if (result == -1) 13243 return NULL; 13244 if (result) { 13245 Py_RETURN_TRUE; 13246 } 13247 } 13248 Py_RETURN_FALSE; 13249 } 13250 substring = 
PyUnicode_FromObject(subobj); 13251 if (substring == NULL) { 13252 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13253 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 13254 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13255 return NULL; 13256 } 13257 result = tailmatch(self, substring, start, end, +1); 13258 Py_DECREF(substring); 13259 if (result == -1) 13260 return NULL; 13261 return PyBool_FromLong(result); 13262} 13263 13264Py_LOCAL_INLINE(void) 13265_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 13266{ 13267 if (!writer->readonly) 13268 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 13269 else { 13270 /* Copy-on-write mode: set buffer size to 0 so 13271 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on 13272 * next write. */ 13273 writer->size = 0; 13274 } 13275 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 13276 writer->data = PyUnicode_DATA(writer->buffer); 13277 writer->kind = PyUnicode_KIND(writer->buffer); 13278} 13279 13280void 13281_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) 13282{ 13283 memset(writer, 0, sizeof(*writer)); 13284#ifdef Py_DEBUG 13285 writer->kind = 5; /* invalid kind */ 13286#endif 13287 writer->min_char = 127; 13288} 13289 13290int 13291_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 13292 Py_ssize_t length, Py_UCS4 maxchar) 13293{ 13294#ifdef MS_WINDOWS 13295 /* On Windows, overallocate by 50% is the best factor */ 13296# define OVERALLOCATE_FACTOR 2 13297#else 13298 /* On Linux, overallocate by 25% is the best factor */ 13299# define OVERALLOCATE_FACTOR 4 13300#endif 13301 Py_ssize_t newlen; 13302 PyObject *newbuffer; 13303 13304 assert(length > 0); 13305 13306 if (length > PY_SSIZE_T_MAX - writer->pos) { 13307 PyErr_NoMemory(); 13308 return -1; 13309 } 13310 newlen = writer->pos + length; 13311 13312 maxchar = Py_MAX(maxchar, writer->min_char); 13313 13314 if (writer->buffer == NULL) { 13315 assert(!writer->readonly); 13316 if (writer->overallocate 13317 
&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13318 /* overallocate to limit the number of realloc() */ 13319 newlen += newlen / OVERALLOCATE_FACTOR; 13320 } 13321 if (newlen < writer->min_length) 13322 newlen = writer->min_length; 13323 13324 writer->buffer = PyUnicode_New(newlen, maxchar); 13325 if (writer->buffer == NULL) 13326 return -1; 13327 } 13328 else if (newlen > writer->size) { 13329 if (writer->overallocate 13330 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13331 /* overallocate to limit the number of realloc() */ 13332 newlen += newlen / OVERALLOCATE_FACTOR; 13333 } 13334 if (newlen < writer->min_length) 13335 newlen = writer->min_length; 13336 13337 if (maxchar > writer->maxchar || writer->readonly) { 13338 /* resize + widen */ 13339 newbuffer = PyUnicode_New(newlen, maxchar); 13340 if (newbuffer == NULL) 13341 return -1; 13342 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13343 writer->buffer, 0, writer->pos); 13344 Py_DECREF(writer->buffer); 13345 writer->readonly = 0; 13346 } 13347 else { 13348 newbuffer = resize_compact(writer->buffer, newlen); 13349 if (newbuffer == NULL) 13350 return -1; 13351 } 13352 writer->buffer = newbuffer; 13353 } 13354 else if (maxchar > writer->maxchar) { 13355 assert(!writer->readonly); 13356 newbuffer = PyUnicode_New(writer->size, maxchar); 13357 if (newbuffer == NULL) 13358 return -1; 13359 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13360 writer->buffer, 0, writer->pos); 13361 Py_DECREF(writer->buffer); 13362 writer->buffer = newbuffer; 13363 } 13364 _PyUnicodeWriter_Update(writer); 13365 return 0; 13366 13367#undef OVERALLOCATE_FACTOR 13368} 13369 13370Py_LOCAL_INLINE(int) 13371_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) 13372{ 13373 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) 13374 return -1; 13375 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13376 writer->pos++; 13377 return 0; 13378} 13379 13380int 
13381_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) 13382{ 13383 return _PyUnicodeWriter_WriteCharInline(writer, ch); 13384} 13385 13386int 13387_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 13388{ 13389 Py_UCS4 maxchar; 13390 Py_ssize_t len; 13391 13392 if (PyUnicode_READY(str) == -1) 13393 return -1; 13394 len = PyUnicode_GET_LENGTH(str); 13395 if (len == 0) 13396 return 0; 13397 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 13398 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 13399 if (writer->buffer == NULL && !writer->overallocate) { 13400 writer->readonly = 1; 13401 Py_INCREF(str); 13402 writer->buffer = str; 13403 _PyUnicodeWriter_Update(writer); 13404 writer->pos += len; 13405 return 0; 13406 } 13407 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 13408 return -1; 13409 } 13410 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13411 str, 0, len); 13412 writer->pos += len; 13413 return 0; 13414} 13415 13416int 13417_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, 13418 Py_ssize_t start, Py_ssize_t end) 13419{ 13420 Py_UCS4 maxchar; 13421 Py_ssize_t len; 13422 13423 if (PyUnicode_READY(str) == -1) 13424 return -1; 13425 13426 assert(0 <= start); 13427 assert(end <= PyUnicode_GET_LENGTH(str)); 13428 assert(start <= end); 13429 13430 if (end == 0) 13431 return 0; 13432 13433 if (start == 0 && end == PyUnicode_GET_LENGTH(str)) 13434 return _PyUnicodeWriter_WriteStr(writer, str); 13435 13436 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 13437 maxchar = _PyUnicode_FindMaxChar(str, start, end); 13438 else 13439 maxchar = writer->maxchar; 13440 len = end - start; 13441 13442 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) 13443 return -1; 13444 13445 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13446 str, start, len); 13447 writer->pos += len; 13448 return 0; 13449} 13450 13451int 
13452_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 13453 const char *ascii, Py_ssize_t len) 13454{ 13455 if (len == -1) 13456 len = strlen(ascii); 13457 13458 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128); 13459 13460 if (writer->buffer == NULL && !writer->overallocate) { 13461 PyObject *str; 13462 13463 str = _PyUnicode_FromASCII(ascii, len); 13464 if (str == NULL) 13465 return -1; 13466 13467 writer->readonly = 1; 13468 writer->buffer = str; 13469 _PyUnicodeWriter_Update(writer); 13470 writer->pos += len; 13471 return 0; 13472 } 13473 13474 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 13475 return -1; 13476 13477 switch (writer->kind) 13478 { 13479 case PyUnicode_1BYTE_KIND: 13480 { 13481 const Py_UCS1 *str = (const Py_UCS1 *)ascii; 13482 Py_UCS1 *data = writer->data; 13483 13484 Py_MEMCPY(data + writer->pos, str, len); 13485 break; 13486 } 13487 case PyUnicode_2BYTE_KIND: 13488 { 13489 _PyUnicode_CONVERT_BYTES( 13490 Py_UCS1, Py_UCS2, 13491 ascii, ascii + len, 13492 (Py_UCS2 *)writer->data + writer->pos); 13493 break; 13494 } 13495 case PyUnicode_4BYTE_KIND: 13496 { 13497 _PyUnicode_CONVERT_BYTES( 13498 Py_UCS1, Py_UCS4, 13499 ascii, ascii + len, 13500 (Py_UCS4 *)writer->data + writer->pos); 13501 break; 13502 } 13503 default: 13504 assert(0); 13505 } 13506 13507 writer->pos += len; 13508 return 0; 13509} 13510 13511int 13512_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 13513 const char *str, Py_ssize_t len) 13514{ 13515 Py_UCS4 maxchar; 13516 13517 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 13518 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 13519 return -1; 13520 unicode_write_cstr(writer->buffer, writer->pos, str, len); 13521 writer->pos += len; 13522 return 0; 13523} 13524 13525PyObject * 13526_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 13527{ 13528 PyObject *str; 13529 if (writer->pos == 0) { 13530 Py_CLEAR(writer->buffer); 13531 
_Py_RETURN_UNICODE_EMPTY(); 13532 } 13533 if (writer->readonly) { 13534 str = writer->buffer; 13535 writer->buffer = NULL; 13536 assert(PyUnicode_GET_LENGTH(str) == writer->pos); 13537 return str; 13538 } 13539 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 13540 PyObject *newbuffer; 13541 newbuffer = resize_compact(writer->buffer, writer->pos); 13542 if (newbuffer == NULL) { 13543 Py_CLEAR(writer->buffer); 13544 return NULL; 13545 } 13546 writer->buffer = newbuffer; 13547 } 13548 str = writer->buffer; 13549 writer->buffer = NULL; 13550 assert(_PyUnicode_CheckConsistency(str, 1)); 13551 return unicode_result_ready(str); 13552} 13553 13554void 13555_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 13556{ 13557 Py_CLEAR(writer->buffer); 13558} 13559 13560#include "stringlib/unicode_format.h" 13561 13562PyDoc_STRVAR(format__doc__, 13563 "S.format(*args, **kwargs) -> str\n\ 13564\n\ 13565Return a formatted version of S, using substitutions from args and kwargs.\n\ 13566The substitutions are identified by braces ('{' and '}')."); 13567 13568PyDoc_STRVAR(format_map__doc__, 13569 "S.format_map(mapping) -> str\n\ 13570\n\ 13571Return a formatted version of S, using substitutions from mapping.\n\ 13572The substitutions are identified by braces ('{' and '}')."); 13573 13574static PyObject * 13575unicode__format__(PyObject* self, PyObject* args) 13576{ 13577 PyObject *format_spec; 13578 _PyUnicodeWriter writer; 13579 int ret; 13580 13581 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 13582 return NULL; 13583 13584 if (PyUnicode_READY(self) == -1) 13585 return NULL; 13586 _PyUnicodeWriter_Init(&writer); 13587 ret = _PyUnicode_FormatAdvancedWriter(&writer, 13588 self, format_spec, 0, 13589 PyUnicode_GET_LENGTH(format_spec)); 13590 if (ret == -1) { 13591 _PyUnicodeWriter_Dealloc(&writer); 13592 return NULL; 13593 } 13594 return _PyUnicodeWriter_Finish(&writer); 13595} 13596 13597PyDoc_STRVAR(p_format__doc__, 13598 "S.__format__(format_spec) -> str\n\ 
13599\n\ 13600Return a formatted version of S as described by format_spec."); 13601 13602static PyObject * 13603unicode__sizeof__(PyObject *v) 13604{ 13605 Py_ssize_t size; 13606 13607 /* If it's a compact object, account for base structure + 13608 character data. */ 13609 if (PyUnicode_IS_COMPACT_ASCII(v)) 13610 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 13611 else if (PyUnicode_IS_COMPACT(v)) 13612 size = sizeof(PyCompactUnicodeObject) + 13613 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 13614 else { 13615 /* If it is a two-block object, account for base object, and 13616 for character block if present. */ 13617 size = sizeof(PyUnicodeObject); 13618 if (_PyUnicode_DATA_ANY(v)) 13619 size += (PyUnicode_GET_LENGTH(v) + 1) * 13620 PyUnicode_KIND(v); 13621 } 13622 /* If the wstr pointer is present, account for it unless it is shared 13623 with the data pointer. Check if the data is not shared. */ 13624 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 13625 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 13626 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 13627 size += PyUnicode_UTF8_LENGTH(v) + 1; 13628 13629 return PyLong_FromSsize_t(size); 13630} 13631 13632PyDoc_STRVAR(sizeof__doc__, 13633 "S.__sizeof__() -> size of S in memory, in bytes"); 13634 13635static PyObject * 13636unicode_getnewargs(PyObject *v) 13637{ 13638 PyObject *copy = _PyUnicode_Copy(v); 13639 if (!copy) 13640 return NULL; 13641 return Py_BuildValue("(N)", copy); 13642} 13643 13644static PyMethodDef unicode_methods[] = { 13645 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 13646 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 13647 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 13648 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 13649 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 13650 {"capitalize", (PyCFunction) unicode_capitalize, 
METH_NOARGS, capitalize__doc__}, 13651 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 13652 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 13653 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 13654 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13655 {"expandtabs", (PyCFunction) unicode_expandtabs, 13656 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__}, 13657 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13658 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 13659 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 13660 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 13661 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 13662 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 13663 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 13664 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13665 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 13666 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 13667 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 13668 {"splitlines", (PyCFunction) unicode_splitlines, 13669 METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 13670 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 13671 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 13672 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 13673 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 13674 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13675 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13676 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 13677 {"isupper", (PyCFunction) 
unicode_isupper, METH_NOARGS, isupper__doc__}, 13678 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 13679 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 13680 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 13681 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 13682 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 13683 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 13684 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13685 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 13686 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13687 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13688 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 13689 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13690 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13691 UNICODE_MAKETRANS_METHODDEF 13692 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13693#if 0 13694 /* These methods are just used for debugging the implementation. 
*/ 13695 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13696#endif 13697 13698 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13699 {NULL, NULL} 13700}; 13701 13702static PyObject * 13703unicode_mod(PyObject *v, PyObject *w) 13704{ 13705 if (!PyUnicode_Check(v)) 13706 Py_RETURN_NOTIMPLEMENTED; 13707 return PyUnicode_Format(v, w); 13708} 13709 13710static PyNumberMethods unicode_as_number = { 13711 0, /*nb_add*/ 13712 0, /*nb_subtract*/ 13713 0, /*nb_multiply*/ 13714 unicode_mod, /*nb_remainder*/ 13715}; 13716 13717static PySequenceMethods unicode_as_sequence = { 13718 (lenfunc) unicode_length, /* sq_length */ 13719 PyUnicode_Concat, /* sq_concat */ 13720 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13721 (ssizeargfunc) unicode_getitem, /* sq_item */ 13722 0, /* sq_slice */ 13723 0, /* sq_ass_item */ 13724 0, /* sq_ass_slice */ 13725 PyUnicode_Contains, /* sq_contains */ 13726}; 13727 13728static PyObject* 13729unicode_subscript(PyObject* self, PyObject* item) 13730{ 13731 if (PyUnicode_READY(self) == -1) 13732 return NULL; 13733 13734 if (PyIndex_Check(item)) { 13735 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13736 if (i == -1 && PyErr_Occurred()) 13737 return NULL; 13738 if (i < 0) 13739 i += PyUnicode_GET_LENGTH(self); 13740 return unicode_getitem(self, i); 13741 } else if (PySlice_Check(item)) { 13742 Py_ssize_t start, stop, step, slicelength, cur, i; 13743 PyObject *result; 13744 void *src_data, *dest_data; 13745 int src_kind, dest_kind; 13746 Py_UCS4 ch, max_char, kind_limit; 13747 13748 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13749 &start, &stop, &step, &slicelength) < 0) { 13750 return NULL; 13751 } 13752 13753 if (slicelength <= 0) { 13754 _Py_RETURN_UNICODE_EMPTY(); 13755 } else if (start == 0 && step == 1 && 13756 slicelength == PyUnicode_GET_LENGTH(self)) { 13757 return unicode_result_unchanged(self); 13758 } else if (step == 1) { 13759 return PyUnicode_Substring(self, 13760 
start, start + slicelength); 13761 } 13762 /* General case */ 13763 src_kind = PyUnicode_KIND(self); 13764 src_data = PyUnicode_DATA(self); 13765 if (!PyUnicode_IS_ASCII(self)) { 13766 kind_limit = kind_maxchar_limit(src_kind); 13767 max_char = 0; 13768 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13769 ch = PyUnicode_READ(src_kind, src_data, cur); 13770 if (ch > max_char) { 13771 max_char = ch; 13772 if (max_char >= kind_limit) 13773 break; 13774 } 13775 } 13776 } 13777 else 13778 max_char = 127; 13779 result = PyUnicode_New(slicelength, max_char); 13780 if (result == NULL) 13781 return NULL; 13782 dest_kind = PyUnicode_KIND(result); 13783 dest_data = PyUnicode_DATA(result); 13784 13785 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13786 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13787 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13788 } 13789 assert(_PyUnicode_CheckConsistency(result, 1)); 13790 return result; 13791 } else { 13792 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13793 return NULL; 13794 } 13795} 13796 13797static PyMappingMethods unicode_as_mapping = { 13798 (lenfunc)unicode_length, /* mp_length */ 13799 (binaryfunc)unicode_subscript, /* mp_subscript */ 13800 (objobjargproc)0, /* mp_ass_subscript */ 13801}; 13802 13803 13804/* Helpers for PyUnicode_Format() */ 13805 13806struct unicode_formatter_t { 13807 PyObject *args; 13808 int args_owned; 13809 Py_ssize_t arglen, argidx; 13810 PyObject *dict; 13811 13812 enum PyUnicode_Kind fmtkind; 13813 Py_ssize_t fmtcnt, fmtpos; 13814 void *fmtdata; 13815 PyObject *fmtstr; 13816 13817 _PyUnicodeWriter writer; 13818}; 13819 13820struct unicode_format_arg_t { 13821 Py_UCS4 ch; 13822 int flags; 13823 Py_ssize_t width; 13824 int prec; 13825 int sign; 13826}; 13827 13828static PyObject * 13829unicode_format_getnextarg(struct unicode_formatter_t *ctx) 13830{ 13831 Py_ssize_t argidx = ctx->argidx; 13832 13833 if (argidx < ctx->arglen) { 13834 
ctx->argidx++; 13835 if (ctx->arglen < 0) 13836 return ctx->args; 13837 else 13838 return PyTuple_GetItem(ctx->args, argidx); 13839 } 13840 PyErr_SetString(PyExc_TypeError, 13841 "not enough arguments for format string"); 13842 return NULL; 13843} 13844 13845/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13846 13847/* Format a float into the writer if the writer is not NULL, or into *p_output 13848 otherwise. 13849 13850 Return 0 on success, raise an exception and return -1 on error. */ 13851static int 13852formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 13853 PyObject **p_output, 13854 _PyUnicodeWriter *writer) 13855{ 13856 char *p; 13857 double x; 13858 Py_ssize_t len; 13859 int prec; 13860 int dtoa_flags; 13861 13862 x = PyFloat_AsDouble(v); 13863 if (x == -1.0 && PyErr_Occurred()) 13864 return -1; 13865 13866 prec = arg->prec; 13867 if (prec < 0) 13868 prec = 6; 13869 13870 if (arg->flags & F_ALT) 13871 dtoa_flags = Py_DTSF_ALT; 13872 else 13873 dtoa_flags = 0; 13874 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 13875 if (p == NULL) 13876 return -1; 13877 len = strlen(p); 13878 if (writer) { 13879 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { 13880 PyMem_Free(p); 13881 return -1; 13882 } 13883 } 13884 else 13885 *p_output = _PyUnicode_FromASCII(p, len); 13886 PyMem_Free(p); 13887 return 0; 13888} 13889 13890/* formatlong() emulates the format codes d, u, o, x and X, and 13891 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13892 * Python's regular ints. 13893 * Return value: a new PyUnicodeObject*, or NULL if error. 13894 * The output string is of the form 13895 * "-"? ("0x" | "0X")? digit+ 13896 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13897 * set in flags. The case of hex digits will be correct, 13898 * There will be at least prec digits, zero-filled on the left if 13899 * necessary to get that many. 
13900 * val object to be converted 13901 * flags bitmask of format flags; only F_ALT is looked at 13902 * prec minimum number of digits; 0-fill on left if needed 13903 * type a character in [duoxX]; u acts the same as d 13904 * 13905 * CAUTION: o, x and X conversions on regular ints can never 13906 * produce a '-' sign, but can for Python's unbounded ints. 13907 */ 13908static PyObject* 13909formatlong(PyObject *val, struct unicode_format_arg_t *arg) 13910{ 13911 PyObject *result = NULL; 13912 char *buf; 13913 Py_ssize_t i; 13914 int sign; /* 1 if '-', else 0 */ 13915 int len; /* number of characters */ 13916 Py_ssize_t llen; 13917 int numdigits; /* len == numnondigits + numdigits */ 13918 int numnondigits = 0; 13919 int prec = arg->prec; 13920 int type = arg->ch; 13921 13922 /* Avoid exceeding SSIZE_T_MAX */ 13923 if (prec > INT_MAX-3) { 13924 PyErr_SetString(PyExc_OverflowError, 13925 "precision too large"); 13926 return NULL; 13927 } 13928 13929 assert(PyLong_Check(val)); 13930 13931 switch (type) { 13932 default: 13933 assert(!"'type' not in [diuoxX]"); 13934 case 'd': 13935 case 'i': 13936 case 'u': 13937 /* int and int subclasses should print numerically when a numeric */ 13938 /* format code is used (see issue18780) */ 13939 result = PyNumber_ToBase(val, 10); 13940 break; 13941 case 'o': 13942 numnondigits = 2; 13943 result = PyNumber_ToBase(val, 8); 13944 break; 13945 case 'x': 13946 case 'X': 13947 numnondigits = 2; 13948 result = PyNumber_ToBase(val, 16); 13949 break; 13950 } 13951 if (!result) 13952 return NULL; 13953 13954 assert(unicode_modifiable(result)); 13955 assert(PyUnicode_IS_READY(result)); 13956 assert(PyUnicode_IS_ASCII(result)); 13957 13958 /* To modify the string in-place, there can only be one reference. 
*/ 13959 if (Py_REFCNT(result) != 1) { 13960 Py_DECREF(result); 13961 PyErr_BadInternalCall(); 13962 return NULL; 13963 } 13964 buf = PyUnicode_DATA(result); 13965 llen = PyUnicode_GET_LENGTH(result); 13966 if (llen > INT_MAX) { 13967 Py_DECREF(result); 13968 PyErr_SetString(PyExc_ValueError, 13969 "string too large in _PyBytes_FormatLong"); 13970 return NULL; 13971 } 13972 len = (int)llen; 13973 sign = buf[0] == '-'; 13974 numnondigits += sign; 13975 numdigits = len - numnondigits; 13976 assert(numdigits > 0); 13977 13978 /* Get rid of base marker unless F_ALT */ 13979 if (((arg->flags & F_ALT) == 0 && 13980 (type == 'o' || type == 'x' || type == 'X'))) { 13981 assert(buf[sign] == '0'); 13982 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 13983 buf[sign+1] == 'o'); 13984 numnondigits -= 2; 13985 buf += 2; 13986 len -= 2; 13987 if (sign) 13988 buf[0] = '-'; 13989 assert(len == numnondigits + numdigits); 13990 assert(numdigits > 0); 13991 } 13992 13993 /* Fill with leading zeroes to meet minimum width. */ 13994 if (prec > numdigits) { 13995 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 13996 numnondigits + prec); 13997 char *b1; 13998 if (!r1) { 13999 Py_DECREF(result); 14000 return NULL; 14001 } 14002 b1 = PyBytes_AS_STRING(r1); 14003 for (i = 0; i < numnondigits; ++i) 14004 *b1++ = *buf++; 14005 for (i = 0; i < prec - numdigits; i++) 14006 *b1++ = '0'; 14007 for (i = 0; i < numdigits; i++) 14008 *b1++ = *buf++; 14009 *b1 = '\0'; 14010 Py_DECREF(result); 14011 result = r1; 14012 buf = PyBytes_AS_STRING(result); 14013 len = numnondigits + prec; 14014 } 14015 14016 /* Fix up case for hex conversions. */ 14017 if (type == 'X') { 14018 /* Need to convert all lower case letters to upper case. 14019 and need to convert 0x to 0X (and -0x to -0X). 
*/ 14020 for (i = 0; i < len; i++) 14021 if (buf[i] >= 'a' && buf[i] <= 'x') 14022 buf[i] -= 'a'-'A'; 14023 } 14024 if (!PyUnicode_Check(result) 14025 || buf != PyUnicode_DATA(result)) { 14026 PyObject *unicode; 14027 unicode = _PyUnicode_FromASCII(buf, len); 14028 Py_DECREF(result); 14029 result = unicode; 14030 } 14031 else if (len != PyUnicode_GET_LENGTH(result)) { 14032 if (PyUnicode_Resize(&result, len) < 0) 14033 Py_CLEAR(result); 14034 } 14035 return result; 14036} 14037 14038/* Format an integer or a float as an integer. 14039 * Return 1 if the number has been formatted into the writer, 14040 * 0 if the number has been formatted into *p_output 14041 * -1 and raise an exception on error */ 14042static int 14043mainformatlong(PyObject *v, 14044 struct unicode_format_arg_t *arg, 14045 PyObject **p_output, 14046 _PyUnicodeWriter *writer) 14047{ 14048 PyObject *iobj, *res; 14049 char type = (char)arg->ch; 14050 14051 if (!PyNumber_Check(v)) 14052 goto wrongtype; 14053 14054 /* make sure number is a type of integer */ 14055 /* if not, issue deprecation warning for now */ 14056 if (!PyLong_Check(v)) { 14057 if (type == 'o' || type == 'x' || type == 'X') { 14058 iobj = PyNumber_Index(v); 14059 if (iobj == NULL) { 14060 PyErr_Clear(); 14061 if (PyErr_WarnEx(PyExc_DeprecationWarning, 14062 "automatic int conversions have been deprecated", 14063 1)) { 14064 return -1; 14065 } 14066 iobj = PyNumber_Long(v); 14067 if (iobj == NULL ) { 14068 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14069 goto wrongtype; 14070 return -1; 14071 } 14072 } 14073 } 14074 else { 14075 iobj = PyNumber_Long(v); 14076 if (iobj == NULL ) { 14077 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14078 goto wrongtype; 14079 return -1; 14080 } 14081 } 14082 assert(PyLong_Check(iobj)); 14083 } 14084 else { 14085 iobj = v; 14086 Py_INCREF(iobj); 14087 } 14088 14089 if (PyLong_CheckExact(v) 14090 && arg->width == -1 && arg->prec == -1 14091 && !(arg->flags & (F_SIGN | F_BLANK)) 14092 && type != 'X') 
14093 { 14094 /* Fast path */ 14095 int alternate = arg->flags & F_ALT; 14096 int base; 14097 14098 switch(type) 14099 { 14100 default: 14101 assert(0 && "'type' not in [diuoxX]"); 14102 case 'd': 14103 case 'i': 14104 case 'u': 14105 base = 10; 14106 break; 14107 case 'o': 14108 base = 8; 14109 break; 14110 case 'x': 14111 case 'X': 14112 base = 16; 14113 break; 14114 } 14115 14116 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 14117 Py_DECREF(iobj); 14118 return -1; 14119 } 14120 Py_DECREF(iobj); 14121 return 1; 14122 } 14123 14124 res = formatlong(iobj, arg); 14125 Py_DECREF(iobj); 14126 if (res == NULL) 14127 return -1; 14128 *p_output = res; 14129 return 0; 14130 14131wrongtype: 14132 PyErr_Format(PyExc_TypeError, 14133 "%%%c format: a number is required, " 14134 "not %.200s", 14135 type, Py_TYPE(v)->tp_name); 14136 return -1; 14137} 14138 14139static Py_UCS4 14140formatchar(PyObject *v) 14141{ 14142 /* presume that the buffer is at least 3 characters long */ 14143 if (PyUnicode_Check(v)) { 14144 if (PyUnicode_GET_LENGTH(v) == 1) { 14145 return PyUnicode_READ_CHAR(v, 0); 14146 } 14147 goto onError; 14148 } 14149 else { 14150 PyObject *iobj; 14151 long x; 14152 /* make sure number is a type of integer */ 14153 /* if not, issue deprecation warning for now */ 14154 if (!PyLong_Check(v)) { 14155 iobj = PyNumber_Index(v); 14156 if (iobj == NULL) { 14157 PyErr_Clear(); 14158 if (PyErr_WarnEx(PyExc_DeprecationWarning, 14159 "automatic int conversions have been deprecated", 14160 1)) { 14161 return -1; 14162 } 14163 iobj = PyNumber_Long(v); 14164 if (iobj == NULL ) { 14165 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14166 goto onError; 14167 return -1; 14168 } 14169 } 14170 v = iobj; 14171 Py_DECREF(iobj); 14172 } 14173 /* Integer input truncated to a character */ 14174 x = PyLong_AsLong(v); 14175 if (x == -1 && PyErr_Occurred()) 14176 goto onError; 14177 14178 if (x < 0 || x > MAX_UNICODE) { 14179 PyErr_SetString(PyExc_OverflowError, 14180 "%c arg not 
in range(0x110000)"); 14181 return (Py_UCS4) -1; 14182 } 14183 14184 return (Py_UCS4) x; 14185 } 14186 14187 onError: 14188 PyErr_SetString(PyExc_TypeError, 14189 "%c requires int or char"); 14190 return (Py_UCS4) -1; 14191} 14192 14193/* Parse options of an argument: flags, width, precision. 14194 Handle also "%(name)" syntax. 14195 14196 Return 0 if the argument has been formatted into arg->str. 14197 Return 1 if the argument has been written into ctx->writer, 14198 Raise an exception and return -1 on error. */ 14199static int 14200unicode_format_arg_parse(struct unicode_formatter_t *ctx, 14201 struct unicode_format_arg_t *arg) 14202{ 14203#define FORMAT_READ(ctx) \ 14204 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 14205 14206 PyObject *v; 14207 14208 if (arg->ch == '(') { 14209 /* Get argument value from a dictionary. Example: "%(name)s". */ 14210 Py_ssize_t keystart; 14211 Py_ssize_t keylen; 14212 PyObject *key; 14213 int pcount = 1; 14214 14215 if (ctx->dict == NULL) { 14216 PyErr_SetString(PyExc_TypeError, 14217 "format requires a mapping"); 14218 return -1; 14219 } 14220 ++ctx->fmtpos; 14221 --ctx->fmtcnt; 14222 keystart = ctx->fmtpos; 14223 /* Skip over balanced parentheses */ 14224 while (pcount > 0 && --ctx->fmtcnt >= 0) { 14225 arg->ch = FORMAT_READ(ctx); 14226 if (arg->ch == ')') 14227 --pcount; 14228 else if (arg->ch == '(') 14229 ++pcount; 14230 ctx->fmtpos++; 14231 } 14232 keylen = ctx->fmtpos - keystart - 1; 14233 if (ctx->fmtcnt < 0 || pcount > 0) { 14234 PyErr_SetString(PyExc_ValueError, 14235 "incomplete format key"); 14236 return -1; 14237 } 14238 key = PyUnicode_Substring(ctx->fmtstr, 14239 keystart, keystart + keylen); 14240 if (key == NULL) 14241 return -1; 14242 if (ctx->args_owned) { 14243 Py_DECREF(ctx->args); 14244 ctx->args_owned = 0; 14245 } 14246 ctx->args = PyObject_GetItem(ctx->dict, key); 14247 Py_DECREF(key); 14248 if (ctx->args == NULL) 14249 return -1; 14250 ctx->args_owned = 1; 14251 ctx->arglen = -1; 14252 
ctx->argidx = -2; 14253 } 14254 14255 /* Parse flags. Example: "%+i" => flags=F_SIGN. */ 14256 while (--ctx->fmtcnt >= 0) { 14257 arg->ch = FORMAT_READ(ctx); 14258 ctx->fmtpos++; 14259 switch (arg->ch) { 14260 case '-': arg->flags |= F_LJUST; continue; 14261 case '+': arg->flags |= F_SIGN; continue; 14262 case ' ': arg->flags |= F_BLANK; continue; 14263 case '#': arg->flags |= F_ALT; continue; 14264 case '0': arg->flags |= F_ZERO; continue; 14265 } 14266 break; 14267 } 14268 14269 /* Parse width. Example: "%10s" => width=10 */ 14270 if (arg->ch == '*') { 14271 v = unicode_format_getnextarg(ctx); 14272 if (v == NULL) 14273 return -1; 14274 if (!PyLong_Check(v)) { 14275 PyErr_SetString(PyExc_TypeError, 14276 "* wants int"); 14277 return -1; 14278 } 14279 arg->width = PyLong_AsSsize_t(v); 14280 if (arg->width == -1 && PyErr_Occurred()) 14281 return -1; 14282 if (arg->width < 0) { 14283 arg->flags |= F_LJUST; 14284 arg->width = -arg->width; 14285 } 14286 if (--ctx->fmtcnt >= 0) { 14287 arg->ch = FORMAT_READ(ctx); 14288 ctx->fmtpos++; 14289 } 14290 } 14291 else if (arg->ch >= '0' && arg->ch <= '9') { 14292 arg->width = arg->ch - '0'; 14293 while (--ctx->fmtcnt >= 0) { 14294 arg->ch = FORMAT_READ(ctx); 14295 ctx->fmtpos++; 14296 if (arg->ch < '0' || arg->ch > '9') 14297 break; 14298 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 14299 mixing signed and unsigned comparison. Since arg->ch is between 14300 '0' and '9', casting to int is safe. */ 14301 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 14302 PyErr_SetString(PyExc_ValueError, 14303 "width too big"); 14304 return -1; 14305 } 14306 arg->width = arg->width*10 + (arg->ch - '0'); 14307 } 14308 } 14309 14310 /* Parse precision. 
Example: "%.3f" => prec=3 */ 14311 if (arg->ch == '.') { 14312 arg->prec = 0; 14313 if (--ctx->fmtcnt >= 0) { 14314 arg->ch = FORMAT_READ(ctx); 14315 ctx->fmtpos++; 14316 } 14317 if (arg->ch == '*') { 14318 v = unicode_format_getnextarg(ctx); 14319 if (v == NULL) 14320 return -1; 14321 if (!PyLong_Check(v)) { 14322 PyErr_SetString(PyExc_TypeError, 14323 "* wants int"); 14324 return -1; 14325 } 14326 arg->prec = _PyLong_AsInt(v); 14327 if (arg->prec == -1 && PyErr_Occurred()) 14328 return -1; 14329 if (arg->prec < 0) 14330 arg->prec = 0; 14331 if (--ctx->fmtcnt >= 0) { 14332 arg->ch = FORMAT_READ(ctx); 14333 ctx->fmtpos++; 14334 } 14335 } 14336 else if (arg->ch >= '0' && arg->ch <= '9') { 14337 arg->prec = arg->ch - '0'; 14338 while (--ctx->fmtcnt >= 0) { 14339 arg->ch = FORMAT_READ(ctx); 14340 ctx->fmtpos++; 14341 if (arg->ch < '0' || arg->ch > '9') 14342 break; 14343 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 14344 PyErr_SetString(PyExc_ValueError, 14345 "precision too big"); 14346 return -1; 14347 } 14348 arg->prec = arg->prec*10 + (arg->ch - '0'); 14349 } 14350 } 14351 } 14352 14353 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 14354 if (ctx->fmtcnt >= 0) { 14355 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 14356 if (--ctx->fmtcnt >= 0) { 14357 arg->ch = FORMAT_READ(ctx); 14358 ctx->fmtpos++; 14359 } 14360 } 14361 } 14362 if (ctx->fmtcnt < 0) { 14363 PyErr_SetString(PyExc_ValueError, 14364 "incomplete format"); 14365 return -1; 14366 } 14367 return 0; 14368 14369#undef FORMAT_READ 14370} 14371 14372/* Format one argument. Supported conversion specifiers: 14373 14374 - "s", "r", "a": any type 14375 - "i", "d", "u": int or float 14376 - "o", "x", "X": int 14377 - "e", "E", "f", "F", "g", "G": float 14378 - "c": int or str (1 character) 14379 14380 When possible, the output is written directly into the Unicode writer 14381 (ctx->writer). A string is created when padding is required. 

   Return 0 if the argument has been formatted into *p_str,
          1 if the argument has been written into ctx->writer,
         -1 on error. */
static int
unicode_format_arg_format(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject **p_str)
{
    PyObject *v;
    _PyUnicodeWriter *writer = &ctx->writer;

    /* Nothing is left in the format string after this argument: stop
       overallocating the output buffer. */
    if (ctx->fmtcnt == 0)
        ctx->writer.overallocate = 0;

    /* "%%" writes a literal percent sign and consumes no argument. */
    if (arg->ch == '%') {
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return -1;
        return 1;
    }

    v = unicode_format_getnextarg(ctx);
    if (v == NULL)
        return -1;


    switch (arg->ch) {
    case 's':
    case 'r':
    case 'a':
        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
            /* Fast path */
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
                return -1;
            return 1;
        }

        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
            /* An exact str formatted with "%s" is used as is. */
            *p_str = v;
            Py_INCREF(*p_str);
        }
        else {
            if (arg->ch == 's')
                *p_str = PyObject_Str(v);
            else if (arg->ch == 'r')
                *p_str = PyObject_Repr(v);
            else
                *p_str = PyObject_ASCII(v);
        }
        break;

    case 'i':
    case 'd':
    case 'u':
    case 'o':
    case 'x':
    case 'X':
    {
        /* A non-zero result (1 or -1) is passed straight to the caller;
           0 means the digits are in *p_str and still need to be output. */
        int ret = mainformatlong(v, arg, p_str, writer);
        if (ret != 0)
            return ret;
        arg->sign = 1;
        break;
    }

    case 'e':
    case 'E':
    case 'f':
    case 'F':
    case 'g':
    case 'G':
        if (arg->width == -1 && arg->prec == -1
            && !(arg->flags & (F_SIGN | F_BLANK)))
        {
            /* Fast path */
            if (formatfloat(v, arg, NULL, writer) == -1)
                return -1;
            return 1;
        }

        arg->sign = 1;
        if (formatfloat(v, arg, p_str, NULL) == -1)
            return -1;
        break;

    case 'c':
    {
        /* formatchar() signals an error with (Py_UCS4)-1. */
        Py_UCS4 ch = formatchar(v);
        if (ch == (Py_UCS4) -1)
            return -1;
        if (arg->width == -1 && arg->prec == -1) {
            /* Fast path */
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
                return -1;
            return 1;
        }
        *p_str = PyUnicode_FromOrdinal(ch);
        break;
    }

    default:
        /* ctx->fmtpos is already one past the offending character. */
        PyErr_Format(PyExc_ValueError,
                     "unsupported format character '%c' (0x%x) "
                     "at index %zd",
                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
                     (int)arg->ch,
                     ctx->fmtpos - 1);
        return -1;
    }
    /* Every case that breaks out of the switch stored its result in
       *p_str; NULL means the conversion failed. */
    if (*p_str == NULL)
        return -1;
    assert (PyUnicode_Check(*p_str));
    return 0;
}

/* Write the already formatted string str into ctx->writer, applying the
   width, precision, sign and "#" prefix settings found in arg.

   arg->width and arg->sign are modified while the output is produced.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when arg->sign is set, which the
       formatting step does for the numeric conversions. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately: skip it in the source. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* One extra cell is needed for the sign when the digits alone
       already fill the whole width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}

/* Helper of PyUnicode_Format(): format one arg.
14653 Return 0 on success, raise an exception and return -1 on error. */ 14654static int 14655unicode_format_arg(struct unicode_formatter_t *ctx) 14656{ 14657 struct unicode_format_arg_t arg; 14658 PyObject *str; 14659 int ret; 14660 14661 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); 14662 arg.flags = 0; 14663 arg.width = -1; 14664 arg.prec = -1; 14665 arg.sign = 0; 14666 str = NULL; 14667 14668 ret = unicode_format_arg_parse(ctx, &arg); 14669 if (ret == -1) 14670 return -1; 14671 14672 ret = unicode_format_arg_format(ctx, &arg, &str); 14673 if (ret == -1) 14674 return -1; 14675 14676 if (ret != 1) { 14677 ret = unicode_format_arg_output(ctx, &arg, str); 14678 Py_DECREF(str); 14679 if (ret == -1) 14680 return -1; 14681 } 14682 14683 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') { 14684 PyErr_SetString(PyExc_TypeError, 14685 "not all arguments converted during string formatting"); 14686 return -1; 14687 } 14688 return 0; 14689} 14690 14691PyObject * 14692PyUnicode_Format(PyObject *format, PyObject *args) 14693{ 14694 struct unicode_formatter_t ctx; 14695 14696 if (format == NULL || args == NULL) { 14697 PyErr_BadInternalCall(); 14698 return NULL; 14699 } 14700 14701 ctx.fmtstr = PyUnicode_FromObject(format); 14702 if (ctx.fmtstr == NULL) 14703 return NULL; 14704 if (PyUnicode_READY(ctx.fmtstr) == -1) { 14705 Py_DECREF(ctx.fmtstr); 14706 return NULL; 14707 } 14708 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 14709 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 14710 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 14711 ctx.fmtpos = 0; 14712 14713 _PyUnicodeWriter_Init(&ctx.writer); 14714 ctx.writer.min_length = ctx.fmtcnt + 100; 14715 ctx.writer.overallocate = 1; 14716 14717 if (PyTuple_Check(args)) { 14718 ctx.arglen = PyTuple_Size(args); 14719 ctx.argidx = 0; 14720 } 14721 else { 14722 ctx.arglen = -1; 14723 ctx.argidx = -2; 14724 } 14725 ctx.args_owned = 0; 14726 if (PyMapping_Check(args) && !PyTuple_Check(args) && 
!PyUnicode_Check(args)) 14727 ctx.dict = args; 14728 else 14729 ctx.dict = NULL; 14730 ctx.args = args; 14731 14732 while (--ctx.fmtcnt >= 0) { 14733 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14734 Py_ssize_t nonfmtpos; 14735 14736 nonfmtpos = ctx.fmtpos++; 14737 while (ctx.fmtcnt >= 0 && 14738 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14739 ctx.fmtpos++; 14740 ctx.fmtcnt--; 14741 } 14742 if (ctx.fmtcnt < 0) { 14743 ctx.fmtpos--; 14744 ctx.writer.overallocate = 0; 14745 } 14746 14747 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, 14748 nonfmtpos, ctx.fmtpos) < 0) 14749 goto onError; 14750 } 14751 else { 14752 ctx.fmtpos++; 14753 if (unicode_format_arg(&ctx) == -1) 14754 goto onError; 14755 } 14756 } 14757 14758 if (ctx.argidx < ctx.arglen && !ctx.dict) { 14759 PyErr_SetString(PyExc_TypeError, 14760 "not all arguments converted during string formatting"); 14761 goto onError; 14762 } 14763 14764 if (ctx.args_owned) { 14765 Py_DECREF(ctx.args); 14766 } 14767 Py_DECREF(ctx.fmtstr); 14768 return _PyUnicodeWriter_Finish(&ctx.writer); 14769 14770 onError: 14771 Py_DECREF(ctx.fmtstr); 14772 _PyUnicodeWriter_Dealloc(&ctx.writer); 14773 if (ctx.args_owned) { 14774 Py_DECREF(ctx.args); 14775 } 14776 return NULL; 14777} 14778 14779static PyObject * 14780unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 14781 14782static PyObject * 14783unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14784{ 14785 PyObject *x = NULL; 14786 static char *kwlist[] = {"object", "encoding", "errors", 0}; 14787 char *encoding = NULL; 14788 char *errors = NULL; 14789 14790 if (type != &PyUnicode_Type) 14791 return unicode_subtype_new(type, args, kwds); 14792 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 14793 kwlist, &x, &encoding, &errors)) 14794 return NULL; 14795 if (x == NULL) 14796 _Py_RETURN_UNICODE_EMPTY(); 14797 if (encoding == NULL && errors == NULL) 14798 return PyObject_Str(x); 14799 
else 14800 return PyUnicode_FromEncodedObject(x, encoding, errors); 14801} 14802 14803static PyObject * 14804unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14805{ 14806 PyObject *unicode, *self; 14807 Py_ssize_t length, char_size; 14808 int share_wstr, share_utf8; 14809 unsigned int kind; 14810 void *data; 14811 14812 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 14813 14814 unicode = unicode_new(&PyUnicode_Type, args, kwds); 14815 if (unicode == NULL) 14816 return NULL; 14817 assert(_PyUnicode_CHECK(unicode)); 14818 if (PyUnicode_READY(unicode) == -1) { 14819 Py_DECREF(unicode); 14820 return NULL; 14821 } 14822 14823 self = type->tp_alloc(type, 0); 14824 if (self == NULL) { 14825 Py_DECREF(unicode); 14826 return NULL; 14827 } 14828 kind = PyUnicode_KIND(unicode); 14829 length = PyUnicode_GET_LENGTH(unicode); 14830 14831 _PyUnicode_LENGTH(self) = length; 14832#ifdef Py_DEBUG 14833 _PyUnicode_HASH(self) = -1; 14834#else 14835 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14836#endif 14837 _PyUnicode_STATE(self).interned = 0; 14838 _PyUnicode_STATE(self).kind = kind; 14839 _PyUnicode_STATE(self).compact = 0; 14840 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14841 _PyUnicode_STATE(self).ready = 1; 14842 _PyUnicode_WSTR(self) = NULL; 14843 _PyUnicode_UTF8_LENGTH(self) = 0; 14844 _PyUnicode_UTF8(self) = NULL; 14845 _PyUnicode_WSTR_LENGTH(self) = 0; 14846 _PyUnicode_DATA_ANY(self) = NULL; 14847 14848 share_utf8 = 0; 14849 share_wstr = 0; 14850 if (kind == PyUnicode_1BYTE_KIND) { 14851 char_size = 1; 14852 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14853 share_utf8 = 1; 14854 } 14855 else if (kind == PyUnicode_2BYTE_KIND) { 14856 char_size = 2; 14857 if (sizeof(wchar_t) == 2) 14858 share_wstr = 1; 14859 } 14860 else { 14861 assert(kind == PyUnicode_4BYTE_KIND); 14862 char_size = 4; 14863 if (sizeof(wchar_t) == 4) 14864 share_wstr = 1; 14865 } 14866 14867 /* Ensure we won't overflow the length. 
*/ 14868 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14869 PyErr_NoMemory(); 14870 goto onError; 14871 } 14872 data = PyObject_MALLOC((length + 1) * char_size); 14873 if (data == NULL) { 14874 PyErr_NoMemory(); 14875 goto onError; 14876 } 14877 14878 _PyUnicode_DATA_ANY(self) = data; 14879 if (share_utf8) { 14880 _PyUnicode_UTF8_LENGTH(self) = length; 14881 _PyUnicode_UTF8(self) = data; 14882 } 14883 if (share_wstr) { 14884 _PyUnicode_WSTR_LENGTH(self) = length; 14885 _PyUnicode_WSTR(self) = (wchar_t *)data; 14886 } 14887 14888 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14889 kind * (length + 1)); 14890 assert(_PyUnicode_CheckConsistency(self, 1)); 14891#ifdef Py_DEBUG 14892 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14893#endif 14894 Py_DECREF(unicode); 14895 return self; 14896 14897onError: 14898 Py_DECREF(unicode); 14899 Py_DECREF(self); 14900 return NULL; 14901} 14902 14903PyDoc_STRVAR(unicode_doc, 14904"str(object='') -> str\n\ 14905str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 14906\n\ 14907Create a new string object from the given object. 
If encoding or\n\ 14908errors is specified, then the object must expose a data buffer\n\ 14909that will be decoded using the given encoding and error handler.\n\ 14910Otherwise, returns the result of object.__str__() (if defined)\n\ 14911or repr(object).\n\ 14912encoding defaults to sys.getdefaultencoding().\n\ 14913errors defaults to 'strict'."); 14914 14915static PyObject *unicode_iter(PyObject *seq); 14916 14917PyTypeObject PyUnicode_Type = { 14918 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14919 "str", /* tp_name */ 14920 sizeof(PyUnicodeObject), /* tp_size */ 14921 0, /* tp_itemsize */ 14922 /* Slots */ 14923 (destructor)unicode_dealloc, /* tp_dealloc */ 14924 0, /* tp_print */ 14925 0, /* tp_getattr */ 14926 0, /* tp_setattr */ 14927 0, /* tp_reserved */ 14928 unicode_repr, /* tp_repr */ 14929 &unicode_as_number, /* tp_as_number */ 14930 &unicode_as_sequence, /* tp_as_sequence */ 14931 &unicode_as_mapping, /* tp_as_mapping */ 14932 (hashfunc) unicode_hash, /* tp_hash*/ 14933 0, /* tp_call*/ 14934 (reprfunc) unicode_str, /* tp_str */ 14935 PyObject_GenericGetAttr, /* tp_getattro */ 14936 0, /* tp_setattro */ 14937 0, /* tp_as_buffer */ 14938 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14939 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14940 unicode_doc, /* tp_doc */ 14941 0, /* tp_traverse */ 14942 0, /* tp_clear */ 14943 PyUnicode_RichCompare, /* tp_richcompare */ 14944 0, /* tp_weaklistoffset */ 14945 unicode_iter, /* tp_iter */ 14946 0, /* tp_iternext */ 14947 unicode_methods, /* tp_methods */ 14948 0, /* tp_members */ 14949 0, /* tp_getset */ 14950 &PyBaseObject_Type, /* tp_base */ 14951 0, /* tp_dict */ 14952 0, /* tp_descr_get */ 14953 0, /* tp_descr_set */ 14954 0, /* tp_dictoffset */ 14955 0, /* tp_init */ 14956 0, /* tp_alloc */ 14957 unicode_new, /* tp_new */ 14958 PyObject_Del, /* tp_free */ 14959}; 14960 14961/* Initialize the Unicode implementation */ 14962 14963int _PyUnicode_Init(void) 14964{ 14965 /* XXX - move this array to unicodectype.c ? 
*/ 14966 Py_UCS2 linebreak[] = { 14967 0x000A, /* LINE FEED */ 14968 0x000D, /* CARRIAGE RETURN */ 14969 0x001C, /* FILE SEPARATOR */ 14970 0x001D, /* GROUP SEPARATOR */ 14971 0x001E, /* RECORD SEPARATOR */ 14972 0x0085, /* NEXT LINE */ 14973 0x2028, /* LINE SEPARATOR */ 14974 0x2029, /* PARAGRAPH SEPARATOR */ 14975 }; 14976 14977 /* Init the implementation */ 14978 _Py_INCREF_UNICODE_EMPTY(); 14979 if (!unicode_empty) 14980 Py_FatalError("Can't create empty string"); 14981 Py_DECREF(unicode_empty); 14982 14983 if (PyType_Ready(&PyUnicode_Type) < 0) 14984 Py_FatalError("Can't initialize 'unicode'"); 14985 14986 /* initialize the linebreak bloom filter */ 14987 bloom_linebreak = make_bloom_mask( 14988 PyUnicode_2BYTE_KIND, linebreak, 14989 Py_ARRAY_LENGTH(linebreak)); 14990 14991 if (PyType_Ready(&EncodingMapType) < 0) 14992 Py_FatalError("Can't initialize encoding map type"); 14993 14994 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 14995 Py_FatalError("Can't initialize field name iterator type"); 14996 14997 if (PyType_Ready(&PyFormatterIter_Type) < 0) 14998 Py_FatalError("Can't initialize formatter iter type"); 14999 15000#ifdef HAVE_MBCS 15001 winver.dwOSVersionInfoSize = sizeof(winver); 15002 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 15003 PyErr_SetFromWindowsErr(0); 15004 return -1; 15005 } 15006#endif 15007 return 0; 15008} 15009 15010/* Finalize the Unicode implementation */ 15011 15012int 15013PyUnicode_ClearFreeList(void) 15014{ 15015 return 0; 15016} 15017 15018void 15019_PyUnicode_Fini(void) 15020{ 15021 int i; 15022 15023 Py_CLEAR(unicode_empty); 15024 15025 for (i = 0; i < 256; i++) 15026 Py_CLEAR(unicode_latin1[i]); 15027 _PyUnicode_ClearStaticStrings(); 15028 (void)PyUnicode_ClearFreeList(); 15029} 15030 15031void 15032PyUnicode_InternInPlace(PyObject **p) 15033{ 15034 PyObject *s = *p; 15035 PyObject *t; 15036#ifdef Py_DEBUG 15037 assert(s != NULL); 15038 assert(_PyUnicode_CHECK(s)); 15039#else 15040 if (s == NULL || !PyUnicode_Check(s)) 15041 
return; 15042#endif 15043 /* If it's a subclass, we don't really know what putting 15044 it in the interned dict might do. */ 15045 if (!PyUnicode_CheckExact(s)) 15046 return; 15047 if (PyUnicode_CHECK_INTERNED(s)) 15048 return; 15049 if (interned == NULL) { 15050 interned = PyDict_New(); 15051 if (interned == NULL) { 15052 PyErr_Clear(); /* Don't leave an exception */ 15053 return; 15054 } 15055 } 15056 /* It might be that the GetItem call fails even 15057 though the key is present in the dictionary, 15058 namely when this happens during a stack overflow. */ 15059 Py_ALLOW_RECURSION 15060 t = PyDict_GetItem(interned, s); 15061 Py_END_ALLOW_RECURSION 15062 15063 if (t) { 15064 Py_INCREF(t); 15065 Py_DECREF(*p); 15066 *p = t; 15067 return; 15068 } 15069 15070 PyThreadState_GET()->recursion_critical = 1; 15071 if (PyDict_SetItem(interned, s, s) < 0) { 15072 PyErr_Clear(); 15073 PyThreadState_GET()->recursion_critical = 0; 15074 return; 15075 } 15076 PyThreadState_GET()->recursion_critical = 0; 15077 /* The two references in interned are not counted by refcnt. 
15078 The deallocator will take care of this */ 15079 Py_REFCNT(s) -= 2; 15080 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 15081} 15082 15083void 15084PyUnicode_InternImmortal(PyObject **p) 15085{ 15086 PyUnicode_InternInPlace(p); 15087 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 15088 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 15089 Py_INCREF(*p); 15090 } 15091} 15092 15093PyObject * 15094PyUnicode_InternFromString(const char *cp) 15095{ 15096 PyObject *s = PyUnicode_FromString(cp); 15097 if (s == NULL) 15098 return NULL; 15099 PyUnicode_InternInPlace(&s); 15100 return s; 15101} 15102 15103void 15104_Py_ReleaseInternedUnicodeStrings(void) 15105{ 15106 PyObject *keys; 15107 PyObject *s; 15108 Py_ssize_t i, n; 15109 Py_ssize_t immortal_size = 0, mortal_size = 0; 15110 15111 if (interned == NULL || !PyDict_Check(interned)) 15112 return; 15113 keys = PyDict_Keys(interned); 15114 if (keys == NULL || !PyList_Check(keys)) { 15115 PyErr_Clear(); 15116 return; 15117 } 15118 15119 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 15120 detector, interned unicode strings are not forcibly deallocated; 15121 rather, we give them their stolen references back, and then clear 15122 and DECREF the interned dict. 
*/ 15123 15124 n = PyList_GET_SIZE(keys); 15125 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 15126 n); 15127 for (i = 0; i < n; i++) { 15128 s = PyList_GET_ITEM(keys, i); 15129 if (PyUnicode_READY(s) == -1) { 15130 assert(0 && "could not ready string"); 15131 fprintf(stderr, "could not ready string\n"); 15132 } 15133 switch (PyUnicode_CHECK_INTERNED(s)) { 15134 case SSTATE_NOT_INTERNED: 15135 /* XXX Shouldn't happen */ 15136 break; 15137 case SSTATE_INTERNED_IMMORTAL: 15138 Py_REFCNT(s) += 1; 15139 immortal_size += PyUnicode_GET_LENGTH(s); 15140 break; 15141 case SSTATE_INTERNED_MORTAL: 15142 Py_REFCNT(s) += 2; 15143 mortal_size += PyUnicode_GET_LENGTH(s); 15144 break; 15145 default: 15146 Py_FatalError("Inconsistent interned string state."); 15147 } 15148 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 15149 } 15150 fprintf(stderr, "total size of all interned strings: " 15151 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 15152 "mortal/immortal\n", mortal_size, immortal_size); 15153 Py_DECREF(keys); 15154 PyDict_Clear(interned); 15155 Py_CLEAR(interned); 15156} 15157 15158 15159/********************* Unicode Iterator **************************/ 15160 15161typedef struct { 15162 PyObject_HEAD 15163 Py_ssize_t it_index; 15164 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 15165} unicodeiterobject; 15166 15167static void 15168unicodeiter_dealloc(unicodeiterobject *it) 15169{ 15170 _PyObject_GC_UNTRACK(it); 15171 Py_XDECREF(it->it_seq); 15172 PyObject_GC_Del(it); 15173} 15174 15175static int 15176unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 15177{ 15178 Py_VISIT(it->it_seq); 15179 return 0; 15180} 15181 15182static PyObject * 15183unicodeiter_next(unicodeiterobject *it) 15184{ 15185 PyObject *seq, *item; 15186 15187 assert(it != NULL); 15188 seq = it->it_seq; 15189 if (seq == NULL) 15190 return NULL; 15191 assert(_PyUnicode_CHECK(seq)); 15192 15193 if (it->it_index < PyUnicode_GET_LENGTH(seq)) 
{ 15194 int kind = PyUnicode_KIND(seq); 15195 void *data = PyUnicode_DATA(seq); 15196 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 15197 item = PyUnicode_FromOrdinal(chr); 15198 if (item != NULL) 15199 ++it->it_index; 15200 return item; 15201 } 15202 15203 Py_DECREF(seq); 15204 it->it_seq = NULL; 15205 return NULL; 15206} 15207 15208static PyObject * 15209unicodeiter_len(unicodeiterobject *it) 15210{ 15211 Py_ssize_t len = 0; 15212 if (it->it_seq) 15213 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 15214 return PyLong_FromSsize_t(len); 15215} 15216 15217PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 15218 15219static PyObject * 15220unicodeiter_reduce(unicodeiterobject *it) 15221{ 15222 if (it->it_seq != NULL) { 15223 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 15224 it->it_seq, it->it_index); 15225 } else { 15226 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 15227 if (u == NULL) 15228 return NULL; 15229 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 15230 } 15231} 15232 15233PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 15234 15235static PyObject * 15236unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 15237{ 15238 Py_ssize_t index = PyLong_AsSsize_t(state); 15239 if (index == -1 && PyErr_Occurred()) 15240 return NULL; 15241 if (it->it_seq != NULL) { 15242 if (index < 0) 15243 index = 0; 15244 else if (index > PyUnicode_GET_LENGTH(it->it_seq)) 15245 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */ 15246 it->it_index = index; 15247 } 15248 Py_RETURN_NONE; 15249} 15250 15251PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 15252 15253static PyMethodDef unicodeiter_methods[] = { 15254 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 15255 length_hint_doc}, 15256 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 15257 reduce_doc}, 15258 {"__setstate__", 
(PyCFunction)unicodeiter_setstate, METH_O, 15259 setstate_doc}, 15260 {NULL, NULL} /* sentinel */ 15261}; 15262 15263PyTypeObject PyUnicodeIter_Type = { 15264 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15265 "str_iterator", /* tp_name */ 15266 sizeof(unicodeiterobject), /* tp_basicsize */ 15267 0, /* tp_itemsize */ 15268 /* methods */ 15269 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 15270 0, /* tp_print */ 15271 0, /* tp_getattr */ 15272 0, /* tp_setattr */ 15273 0, /* tp_reserved */ 15274 0, /* tp_repr */ 15275 0, /* tp_as_number */ 15276 0, /* tp_as_sequence */ 15277 0, /* tp_as_mapping */ 15278 0, /* tp_hash */ 15279 0, /* tp_call */ 15280 0, /* tp_str */ 15281 PyObject_GenericGetAttr, /* tp_getattro */ 15282 0, /* tp_setattro */ 15283 0, /* tp_as_buffer */ 15284 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 15285 0, /* tp_doc */ 15286 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 15287 0, /* tp_clear */ 15288 0, /* tp_richcompare */ 15289 0, /* tp_weaklistoffset */ 15290 PyObject_SelfIter, /* tp_iter */ 15291 (iternextfunc)unicodeiter_next, /* tp_iternext */ 15292 unicodeiter_methods, /* tp_methods */ 15293 0, 15294}; 15295 15296static PyObject * 15297unicode_iter(PyObject *seq) 15298{ 15299 unicodeiterobject *it; 15300 15301 if (!PyUnicode_Check(seq)) { 15302 PyErr_BadInternalCall(); 15303 return NULL; 15304 } 15305 if (PyUnicode_READY(seq) == -1) 15306 return NULL; 15307 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 15308 if (it == NULL) 15309 return NULL; 15310 it->it_index = 0; 15311 Py_INCREF(seq); 15312 it->it_seq = seq; 15313 _PyObject_GC_TRACK(it); 15314 return (PyObject *)it; 15315} 15316 15317 15318size_t 15319Py_UNICODE_strlen(const Py_UNICODE *u) 15320{ 15321 int res = 0; 15322 while(*u++) 15323 res++; 15324 return res; 15325} 15326 15327Py_UNICODE* 15328Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 15329{ 15330 Py_UNICODE *u = s1; 15331 while ((*u++ = *s2++)); 15332 return s1; 15333} 15334 
15335Py_UNICODE* 15336Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15337{ 15338 Py_UNICODE *u = s1; 15339 while ((*u++ = *s2++)) 15340 if (n-- == 0) 15341 break; 15342 return s1; 15343} 15344 15345Py_UNICODE* 15346Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 15347{ 15348 Py_UNICODE *u1 = s1; 15349 u1 += Py_UNICODE_strlen(u1); 15350 Py_UNICODE_strcpy(u1, s2); 15351 return s1; 15352} 15353 15354int 15355Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 15356{ 15357 while (*s1 && *s2 && *s1 == *s2) 15358 s1++, s2++; 15359 if (*s1 && *s2) 15360 return (*s1 < *s2) ? -1 : +1; 15361 if (*s1) 15362 return 1; 15363 if (*s2) 15364 return -1; 15365 return 0; 15366} 15367 15368int 15369Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15370{ 15371 Py_UNICODE u1, u2; 15372 for (; n != 0; n--) { 15373 u1 = *s1; 15374 u2 = *s2; 15375 if (u1 != u2) 15376 return (u1 < u2) ? -1 : +1; 15377 if (u1 == '\0') 15378 return 0; 15379 s1++; 15380 s2++; 15381 } 15382 return 0; 15383} 15384 15385Py_UNICODE* 15386Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 15387{ 15388 const Py_UNICODE *p; 15389 for (p = s; *p; p++) 15390 if (*p == c) 15391 return (Py_UNICODE*)p; 15392 return NULL; 15393} 15394 15395Py_UNICODE* 15396Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 15397{ 15398 const Py_UNICODE *p; 15399 p = s + Py_UNICODE_strlen(s); 15400 while (p != s) { 15401 p--; 15402 if (*p == c) 15403 return (Py_UNICODE*)p; 15404 } 15405 return NULL; 15406} 15407 15408Py_UNICODE* 15409PyUnicode_AsUnicodeCopy(PyObject *unicode) 15410{ 15411 Py_UNICODE *u, *copy; 15412 Py_ssize_t len, size; 15413 15414 if (!PyUnicode_Check(unicode)) { 15415 PyErr_BadArgument(); 15416 return NULL; 15417 } 15418 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 15419 if (u == NULL) 15420 return NULL; 15421 /* Ensure we won't overflow the size. 
*/ 15422 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 15423 PyErr_NoMemory(); 15424 return NULL; 15425 } 15426 size = len + 1; /* copy the null character */ 15427 size *= sizeof(Py_UNICODE); 15428 copy = PyMem_Malloc(size); 15429 if (copy == NULL) { 15430 PyErr_NoMemory(); 15431 return NULL; 15432 } 15433 memcpy(copy, u, size); 15434 return copy; 15435} 15436 15437/* A _string module, to export formatter_parser and formatter_field_name_split 15438 to the string.Formatter class implemented in Python. */ 15439 15440static PyMethodDef _string_methods[] = { 15441 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 15442 METH_O, PyDoc_STR("split the argument as a field name")}, 15443 {"formatter_parser", (PyCFunction) formatter_parser, 15444 METH_O, PyDoc_STR("parse the argument as a format string")}, 15445 {NULL, NULL} 15446}; 15447 15448static struct PyModuleDef _string_module = { 15449 PyModuleDef_HEAD_INIT, 15450 "_string", 15451 PyDoc_STR("string helper module"), 15452 0, 15453 _string_methods, 15454 NULL, 15455 NULL, 15456 NULL, 15457 NULL 15458}; 15459 15460PyMODINIT_FUNC 15461PyInit__string(void) 15462{ 15463 return PyModule_Create(&_string_module); 15464} 15465 15466 15467#ifdef __cplusplus 15468} 15469#endif 15470