unicodeobject.c revision fd97a6fb2d501f0ecb104513b5c0c1707dd6f87e
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/*[clinic input] 51class str "PyUnicodeObject *" "&PyUnicode_Type" 52[clinic start generated code]*/ 53/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/ 54 55/* --- Globals ------------------------------------------------------------ 56 57NOTE: In the interpreter's initialization phase, some globals are currently 58 initialized dynamically as needed. In the process Unicode objects may 59 be created before the Unicode type is ready. 60 61*/ 62 63 64#ifdef __cplusplus 65extern "C" { 66#endif 67 68/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 69#define MAX_UNICODE 0x10ffff 70 71#ifdef Py_DEBUG 72# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 73#else 74# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 75#endif 76 77#define _PyUnicode_UTF8(op) \ 78 (((PyCompactUnicodeObject*)(op))->utf8) 79#define PyUnicode_UTF8(op) \ 80 (assert(_PyUnicode_CHECK(op)), \ 81 assert(PyUnicode_IS_READY(op)), \ 82 PyUnicode_IS_COMPACT_ASCII(op) ? \ 83 ((char*)((PyASCIIObject*)(op) + 1)) : \ 84 _PyUnicode_UTF8(op)) 85#define _PyUnicode_UTF8_LENGTH(op) \ 86 (((PyCompactUnicodeObject*)(op))->utf8_length) 87#define PyUnicode_UTF8_LENGTH(op) \ 88 (assert(_PyUnicode_CHECK(op)), \ 89 assert(PyUnicode_IS_READY(op)), \ 90 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 91 ((PyASCIIObject*)(op))->length : \ 92 _PyUnicode_UTF8_LENGTH(op)) 93#define _PyUnicode_WSTR(op) \ 94 (((PyASCIIObject*)(op))->wstr) 95#define _PyUnicode_WSTR_LENGTH(op) \ 96 (((PyCompactUnicodeObject*)(op))->wstr_length) 97#define _PyUnicode_LENGTH(op) \ 98 (((PyASCIIObject *)(op))->length) 99#define _PyUnicode_STATE(op) \ 100 (((PyASCIIObject *)(op))->state) 101#define _PyUnicode_HASH(op) \ 102 (((PyASCIIObject *)(op))->hash) 103#define _PyUnicode_KIND(op) \ 104 (assert(_PyUnicode_CHECK(op)), \ 105 ((PyASCIIObject *)(op))->state.kind) 106#define _PyUnicode_GET_LENGTH(op) \ 107 (assert(_PyUnicode_CHECK(op)), \ 108 ((PyASCIIObject *)(op))->length) 109#define _PyUnicode_DATA_ANY(op) \ 110 (((PyUnicodeObject*)(op))->data.any) 111 112#undef PyUnicode_READY 113#define PyUnicode_READY(op) \ 114 (assert(_PyUnicode_CHECK(op)), \ 115 (PyUnicode_IS_READY(op) ? \ 116 0 : \ 117 _PyUnicode_Ready(op))) 118 119#define _PyUnicode_SHARE_UTF8(op) \ 120 (assert(_PyUnicode_CHECK(op)), \ 121 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 122 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 123#define _PyUnicode_SHARE_WSTR(op) \ 124 (assert(_PyUnicode_CHECK(op)), \ 125 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 126 127/* true if the Unicode object has an allocated UTF-8 memory block 128 (not shared with other data) */ 129#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 130 ((!PyUnicode_IS_COMPACT_ASCII(op) \ 131 && _PyUnicode_UTF8(op) \ 132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 133 134/* true if the Unicode object has an allocated wstr memory block 135 (not shared with other data) */ 136#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 137 ((_PyUnicode_WSTR(op) && \ 138 (!PyUnicode_IS_READY(op) || \ 139 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 140 141/* Generic helper macro to convert characters of different types. 142 from_type and to_type have to be valid type names, begin and end 143 are pointers to the source characters which should be of type 144 "from_type *". 
to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to. */
/* The copy is done in two passes: groups of four elements first (the count
   is rounded down to a multiple of 4), then the remaining elements one at
   a time.  The caller's begin/end/to expressions are each evaluated once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty = NULL;

/* Take a new reference to the shared empty string, creating it on first
   use.  If the creation fails, unicode_empty is left NULL and no reference
   is taken, so users must be prepared for unicode_empty == NULL afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (returns NULL if the empty string could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)

/* Forward declaration */
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings.
*/
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well. */
static PyObject *unicode_latin1[256] = {NULL};

/* Fast detection of the most frequent whitespace characters */
/* 128-entry table indexed by ASCII code: 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

/* forward */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);

/* Same for linebreaks */
/* 128-entry table indexed by ASCII code: 1 for line breaks, 0 otherwise.
   It is indexed by BLOOM_LINEBREAK() for characters below 128. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
   This function is kept for backward compatibility with the old API. */
Py_UNICODE
PyUnicode_GetMax(void)
{
#ifdef Py_UNICODE_WIDE
    return 0x10FFFF;
#else
    /* This is actually an illegal character, so it should
       not be passed to unichr. */
    return 0xFFFF;
#endif
}

#ifdef Py_DEBUG
/* Debug-only validation of the internal invariants of a Unicode object.

   op:            object to check; must pass PyUnicode_Check().
   check_content: if non-zero (and the string is not a wstr-only string),
                  also scan the characters to verify that the narrowest
                  possible kind is used and that the buffer is
                  null-terminated.

   Every violation trips an assert(); when nothing trips, 1 is returned so
   that the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII: only the PyASCIIObject header exists, so none of the
       utf8/wstr_length fields may be inspected */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready yet: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* wstr must alias the data buffer exactly when the kind has the
           same width as wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing cache buffer must have a zero cached length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the element one past the end must be the null terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif

/* Normalize a freshly built, not-yet-ready (wstr) result.

   In release builds: an empty string is replaced by the shared empty
   string, a single character below 256 by the result of
   get_latin1_char(), and anything else is made ready.  In each replacement
   case the reference to unicode is released.  Returns NULL if
   _PyUnicode_Ready() fails (the reference is released as well).

   In debug builds the string is only checked and returned as is. */
static PyObject*
unicode_result_wchar(PyObject *unicode)
{
#ifndef Py_DEBUG
    Py_ssize_t len;

    len = _PyUnicode_WSTR_LENGTH(unicode);
    if (len == 0) {
        Py_DECREF(unicode);
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (len == 1) {
        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
        if ((Py_UCS4)ch < 256) {
            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
            Py_DECREF(unicode);
            return latin1_char;
        }
    }

    if (_PyUnicode_Ready(unicode) < 0) {
        Py_DECREF(unicode);
        return NULL;
    }
#else
    assert(Py_REFCNT(unicode) == 1);

    /* don't make the result ready in debug mode to ensure that the caller
       makes the string ready before using it */
    assert(_PyUnicode_CheckConsistency(unicode, 1));
#endif
    return unicode;
}

/* Normalize a ready result string.

   An empty string is replaced by the shared empty string.  A single
   character below 256 is replaced by the cached object in unicode_latin1[]
   if there is one; otherwise unicode itself is stored in the cache (with an
   extra reference) and returned.  Any other string is returned unchanged. */
static PyObject*
unicode_result_ready(PyObject *unicode)
{
    Py_ssize_t length;

    length = PyUnicode_GET_LENGTH(unicode);
    if (length == 0) {
        if (unicode != unicode_empty) {
            Py_DECREF(unicode);
            _Py_RETURN_UNICODE_EMPTY();
        }
        return unicode_empty;
    }

    if (length == 1) {
        void *data = PyUnicode_DATA(unicode);
        int kind = PyUnicode_KIND(unicode);
        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
        if (ch < 256) {
            PyObject *latin1_char = unicode_latin1[ch];
            if (latin1_char != NULL) {
                /* swap the caller's reference for one to the cached object */
                if (unicode != latin1_char) {
                    Py_INCREF(latin1_char);
                    Py_DECREF(unicode);
                }
                return latin1_char;
            }
            else {
                assert(_PyUnicode_CheckConsistency(unicode, 1));
                /* the cache keeps its own reference */
                Py_INCREF(unicode);
                unicode_latin1[ch] = unicode;
                return unicode;
            }
        }
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}

/* Dispatch to unicode_result_ready() or unicode_result_wchar() depending
   on whether the string is ready. */
static PyObject*
unicode_result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_IS_READY(unicode))
        return unicode_result_ready(unicode);
    else
        return unicode_result_wchar(unicode);
}

/* Return a result with the same value as unicode: a new reference to the
   object itself for exact str instances (after making it ready; NULL if
   that fails), or the result of _PyUnicode_Copy() for subtype instances. */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (PyUnicode_CheckExact(unicode)) {
        if (PyUnicode_READY(unicode) == -1)
            return NULL;
        Py_INCREF(unicode);
        return unicode;
    }
    else
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
}

#ifdef HAVE_MBCS
static OSVERSIONINFOEX winver;
#endif

/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index.
*/ 531 532/* the linebreak mask is set up by Unicode_Init below */ 533 534#if LONG_BIT >= 128 535#define BLOOM_WIDTH 128 536#elif LONG_BIT >= 64 537#define BLOOM_WIDTH 64 538#elif LONG_BIT >= 32 539#define BLOOM_WIDTH 32 540#else 541#error "LONG_BIT is smaller than 32" 542#endif 543 544#define BLOOM_MASK unsigned long 545 546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 547 548#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 549 550#define BLOOM_LINEBREAK(ch) \ 551 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 553 554Py_LOCAL_INLINE(BLOOM_MASK) 555make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 556{ 557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 558 do { \ 559 TYPE *data = (TYPE *)PTR; \ 560 TYPE *end = data + LEN; \ 561 Py_UCS4 ch; \ 562 for (; data != end; data++) { \ 563 ch = *data; \ 564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 565 } \ 566 break; \ 567 } while (0) 568 569 /* calculate simple bloom-style bitmask for a given unicode string */ 570 571 BLOOM_MASK mask; 572 573 mask = 0; 574 switch (kind) { 575 case PyUnicode_1BYTE_KIND: 576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 577 break; 578 case PyUnicode_2BYTE_KIND: 579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 580 break; 581 case PyUnicode_4BYTE_KIND: 582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 583 break; 584 default: 585 assert(0); 586 } 587 return mask; 588 589#undef BLOOM_UPDATE 590} 591 592/* Compilation of templated routines */ 593 594#include "stringlib/asciilib.h" 595#include "stringlib/fastsearch.h" 596#include "stringlib/partition.h" 597#include "stringlib/split.h" 598#include "stringlib/count.h" 599#include "stringlib/find.h" 600#include "stringlib/find_max_char.h" 601#include "stringlib/localeutil.h" 602#include "stringlib/undef.h" 603 604#include "stringlib/ucs1lib.h" 605#include "stringlib/fastsearch.h" 606#include "stringlib/partition.h" 607#include "stringlib/split.h" 608#include "stringlib/count.h" 
609#include "stringlib/find.h" 610#include "stringlib/replace.h" 611#include "stringlib/find_max_char.h" 612#include "stringlib/localeutil.h" 613#include "stringlib/undef.h" 614 615#include "stringlib/ucs2lib.h" 616#include "stringlib/fastsearch.h" 617#include "stringlib/partition.h" 618#include "stringlib/split.h" 619#include "stringlib/count.h" 620#include "stringlib/find.h" 621#include "stringlib/replace.h" 622#include "stringlib/find_max_char.h" 623#include "stringlib/localeutil.h" 624#include "stringlib/undef.h" 625 626#include "stringlib/ucs4lib.h" 627#include "stringlib/fastsearch.h" 628#include "stringlib/partition.h" 629#include "stringlib/split.h" 630#include "stringlib/count.h" 631#include "stringlib/find.h" 632#include "stringlib/replace.h" 633#include "stringlib/find_max_char.h" 634#include "stringlib/localeutil.h" 635#include "stringlib/undef.h" 636 637#include "stringlib/unicodedefs.h" 638#include "stringlib/fastsearch.h" 639#include "stringlib/count.h" 640#include "stringlib/find.h" 641#include "stringlib/undef.h" 642 643/* --- Unicode Object ----------------------------------------------------- */ 644 645static PyObject * 646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 647 648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 649 Py_ssize_t size, Py_UCS4 ch, 650 int direction) 651{ 652 int mode = (direction == 1) ? 
FAST_SEARCH : FAST_RSEARCH; 653 654 switch (kind) { 655 case PyUnicode_1BYTE_KIND: 656 { 657 Py_UCS1 ch1 = (Py_UCS1) ch; 658 if (ch1 == ch) 659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 660 else 661 return -1; 662 } 663 case PyUnicode_2BYTE_KIND: 664 { 665 Py_UCS2 ch2 = (Py_UCS2) ch; 666 if (ch2 == ch) 667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 668 else 669 return -1; 670 } 671 case PyUnicode_4BYTE_KIND: 672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 673 default: 674 assert(0); 675 return -1; 676 } 677} 678 679#ifdef Py_DEBUG 680/* Fill the data of an Unicode string with invalid characters to detect bugs 681 earlier. 682 683 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 684 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 685 invalid character in Unicode 6.0. */ 686static void 687unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 688{ 689 int kind = PyUnicode_KIND(unicode); 690 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 691 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 692 if (length <= old_length) 693 return; 694 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 695} 696#endif 697 698static PyObject* 699resize_compact(PyObject *unicode, Py_ssize_t length) 700{ 701 Py_ssize_t char_size; 702 Py_ssize_t struct_size; 703 Py_ssize_t new_size; 704 int share_wstr; 705 PyObject *new_unicode; 706#ifdef Py_DEBUG 707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 708#endif 709 710 assert(unicode_modifiable(unicode)); 711 assert(PyUnicode_IS_READY(unicode)); 712 assert(PyUnicode_IS_COMPACT(unicode)); 713 714 char_size = PyUnicode_KIND(unicode); 715 if (PyUnicode_IS_ASCII(unicode)) 716 struct_size = sizeof(PyASCIIObject); 717 else 718 struct_size = sizeof(PyCompactUnicodeObject); 719 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 720 721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 722 
PyErr_NoMemory(); 723 return NULL; 724 } 725 new_size = (struct_size + (length + 1) * char_size); 726 727 _Py_DEC_REFTOTAL; 728 _Py_ForgetReference(unicode); 729 730 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size); 731 if (new_unicode == NULL) { 732 _Py_NewReference(unicode); 733 PyErr_NoMemory(); 734 return NULL; 735 } 736 unicode = new_unicode; 737 _Py_NewReference(unicode); 738 739 _PyUnicode_LENGTH(unicode) = length; 740 if (share_wstr) { 741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 742 if (!PyUnicode_IS_ASCII(unicode)) 743 _PyUnicode_WSTR_LENGTH(unicode) = length; 744 } 745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 746 PyObject_DEL(_PyUnicode_WSTR(unicode)); 747 _PyUnicode_WSTR(unicode) = NULL; 748 } 749#ifdef Py_DEBUG 750 unicode_fill_invalid(unicode, old_length); 751#endif 752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 753 length, 0); 754 assert(_PyUnicode_CheckConsistency(unicode, 0)); 755 return unicode; 756} 757 758static int 759resize_inplace(PyObject *unicode, Py_ssize_t length) 760{ 761 wchar_t *wstr; 762 Py_ssize_t new_size; 763 assert(!PyUnicode_IS_COMPACT(unicode)); 764 assert(Py_REFCNT(unicode) == 1); 765 766 if (PyUnicode_IS_READY(unicode)) { 767 Py_ssize_t char_size; 768 int share_wstr, share_utf8; 769 void *data; 770#ifdef Py_DEBUG 771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 772#endif 773 774 data = _PyUnicode_DATA_ANY(unicode); 775 char_size = PyUnicode_KIND(unicode); 776 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 778 779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 780 PyErr_NoMemory(); 781 return -1; 782 } 783 new_size = (length + 1) * char_size; 784 785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 786 { 787 PyObject_DEL(_PyUnicode_UTF8(unicode)); 788 _PyUnicode_UTF8(unicode) = NULL; 789 _PyUnicode_UTF8_LENGTH(unicode) = 0; 790 } 791 792 data = (PyObject *)PyObject_REALLOC(data, new_size); 793 if (data == NULL) { 
794 PyErr_NoMemory(); 795 return -1; 796 } 797 _PyUnicode_DATA_ANY(unicode) = data; 798 if (share_wstr) { 799 _PyUnicode_WSTR(unicode) = data; 800 _PyUnicode_WSTR_LENGTH(unicode) = length; 801 } 802 if (share_utf8) { 803 _PyUnicode_UTF8(unicode) = data; 804 _PyUnicode_UTF8_LENGTH(unicode) = length; 805 } 806 _PyUnicode_LENGTH(unicode) = length; 807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 808#ifdef Py_DEBUG 809 unicode_fill_invalid(unicode, old_length); 810#endif 811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 812 assert(_PyUnicode_CheckConsistency(unicode, 0)); 813 return 0; 814 } 815 } 816 assert(_PyUnicode_WSTR(unicode) != NULL); 817 818 /* check for integer overflow */ 819 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 820 PyErr_NoMemory(); 821 return -1; 822 } 823 new_size = sizeof(wchar_t) * (length + 1); 824 wstr = _PyUnicode_WSTR(unicode); 825 wstr = PyObject_REALLOC(wstr, new_size); 826 if (!wstr) { 827 PyErr_NoMemory(); 828 return -1; 829 } 830 _PyUnicode_WSTR(unicode) = wstr; 831 _PyUnicode_WSTR(unicode)[length] = 0; 832 _PyUnicode_WSTR_LENGTH(unicode) = length; 833 assert(_PyUnicode_CheckConsistency(unicode, 0)); 834 return 0; 835} 836 837static PyObject* 838resize_copy(PyObject *unicode, Py_ssize_t length) 839{ 840 Py_ssize_t copy_length; 841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 842 PyObject *copy; 843 844 if (PyUnicode_READY(unicode) == -1) 845 return NULL; 846 847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 848 if (copy == NULL) 849 return NULL; 850 851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 853 return copy; 854 } 855 else { 856 PyObject *w; 857 858 w = (PyObject*)_PyUnicode_New(length); 859 if (w == NULL) 860 return NULL; 861 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 862 copy_length = Py_MIN(copy_length, length); 863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 864 
copy_length * sizeof(wchar_t)); 865 return w; 866 } 867} 868 869/* We allocate one more byte to make sure the string is 870 Ux0000 terminated; some code (e.g. new_identifier) 871 relies on that. 872 873 XXX This allocator could further be enhanced by assuring that the 874 free list never reduces its size below 1. 875 876*/ 877 878static PyUnicodeObject * 879_PyUnicode_New(Py_ssize_t length) 880{ 881 PyUnicodeObject *unicode; 882 size_t new_size; 883 884 /* Optimization for empty strings */ 885 if (length == 0 && unicode_empty != NULL) { 886 Py_INCREF(unicode_empty); 887 return (PyUnicodeObject*)unicode_empty; 888 } 889 890 /* Ensure we won't overflow the size. */ 891 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 892 return (PyUnicodeObject *)PyErr_NoMemory(); 893 } 894 if (length < 0) { 895 PyErr_SetString(PyExc_SystemError, 896 "Negative size passed to _PyUnicode_New"); 897 return NULL; 898 } 899 900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 901 if (unicode == NULL) 902 return NULL; 903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 904 905 _PyUnicode_WSTR_LENGTH(unicode) = length; 906 _PyUnicode_HASH(unicode) = -1; 907 _PyUnicode_STATE(unicode).interned = 0; 908 _PyUnicode_STATE(unicode).kind = 0; 909 _PyUnicode_STATE(unicode).compact = 0; 910 _PyUnicode_STATE(unicode).ready = 0; 911 _PyUnicode_STATE(unicode).ascii = 0; 912 _PyUnicode_DATA_ANY(unicode) = NULL; 913 _PyUnicode_LENGTH(unicode) = 0; 914 _PyUnicode_UTF8(unicode) = NULL; 915 _PyUnicode_UTF8_LENGTH(unicode) = 0; 916 917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 918 if (!_PyUnicode_WSTR(unicode)) { 919 Py_DECREF(unicode); 920 PyErr_NoMemory(); 921 return NULL; 922 } 923 924 /* Initialize the first element to guard against cases where 925 * the caller fails before initializing str -- unicode_resize() 926 * reads str[0], and the Keep-Alive optimization can keep memory 927 * allocated for str alive across a call to unicode_dealloc(unicode). 
928 * We don't want unicode_resize to read uninitialized memory in 929 * that case. 930 */ 931 _PyUnicode_WSTR(unicode)[0] = 0; 932 _PyUnicode_WSTR(unicode)[length] = 0; 933 934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 935 return unicode; 936} 937 938static const char* 939unicode_kind_name(PyObject *unicode) 940{ 941 /* don't check consistency: unicode_kind_name() is called from 942 _PyUnicode_Dump() */ 943 if (!PyUnicode_IS_COMPACT(unicode)) 944 { 945 if (!PyUnicode_IS_READY(unicode)) 946 return "wstr"; 947 switch (PyUnicode_KIND(unicode)) 948 { 949 case PyUnicode_1BYTE_KIND: 950 if (PyUnicode_IS_ASCII(unicode)) 951 return "legacy ascii"; 952 else 953 return "legacy latin1"; 954 case PyUnicode_2BYTE_KIND: 955 return "legacy UCS2"; 956 case PyUnicode_4BYTE_KIND: 957 return "legacy UCS4"; 958 default: 959 return "<legacy invalid kind>"; 960 } 961 } 962 assert(PyUnicode_IS_READY(unicode)); 963 switch (PyUnicode_KIND(unicode)) { 964 case PyUnicode_1BYTE_KIND: 965 if (PyUnicode_IS_ASCII(unicode)) 966 return "ascii"; 967 else 968 return "latin1"; 969 case PyUnicode_2BYTE_KIND: 970 return "UCS2"; 971 case PyUnicode_4BYTE_KIND: 972 return "UCS4"; 973 default: 974 return "<invalid compact kind>"; 975 } 976} 977 978#ifdef Py_DEBUG 979/* Functions wrapping macros for use in debugger */ 980char *_PyUnicode_utf8(void *unicode){ 981 return PyUnicode_UTF8(unicode); 982} 983 984void *_PyUnicode_compact_data(void *unicode) { 985 return _PyUnicode_COMPACT_DATA(unicode); 986} 987void *_PyUnicode_data(void *unicode){ 988 printf("obj %p\n", unicode); 989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 994 return PyUnicode_DATA(unicode); 995} 996 997void 
_PyUnicode_Dump(PyObject *op)
{
    /* Debug helper: print the representation name, length and the
       wstr/utf8/data pointers of op to stdout. */
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* locate the character data without going through the checked macros */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
           unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", ascii->wstr);

    /* compact ASCII objects have no wstr_length/utf8 fields to print */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
               compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif

/* Create a compact, ready string object with room for size characters.

   size:    number of characters; must not be negative.
   maxchar: upper bound of the code points that will be stored; it selects
            the kind (1, 2 or 4 bytes per character) and whether the ASCII
            header is used.

   The character at index size is set to 0; the other characters are left
   uninitialized (filled with 0xff in debug builds).  For size 0 the shared
   empty string is returned once it exists.

   Returns a new reference, or NULL with SystemError set for a negative
   size or a maxchar above MAX_UNICODE, and with MemoryError set if the
   size would overflow or the allocation fails. */
PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
{
    PyObject *obj;
    PyCompactUnicodeObject *unicode;
    void *data;
    enum PyUnicode_Kind kind;
    int is_sharing, is_ascii;
    Py_ssize_t char_size;
    Py_ssize_t struct_size;

    /* Optimization for empty strings */
    if (size == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    is_ascii = 0;
    is_sharing = 0;
    struct_size = sizeof(PyCompactUnicodeObject);
    if (maxchar < 128) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
        is_ascii = 1;
        struct_size = sizeof(PyASCIIObject);
    }
    else if (maxchar < 256) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
    }
    else if (maxchar < 65536) {
        kind = PyUnicode_2BYTE_KIND;
        char_size = 2;
        /* wstr can alias the data when wchar_t has the same width */
        if (sizeof(wchar_t) == 2)
            is_sharing = 1;
    }
    else {
        if (maxchar > MAX_UNICODE) {
            PyErr_SetString(PyExc_SystemError,
                            "invalid maximum character passed to PyUnicode_New");
            return NULL;
        }
        kind = PyUnicode_4BYTE_KIND;
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            is_sharing = 1;
    }

    /* Ensure we won't overflow the size. */
    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to PyUnicode_New");
        return NULL;
    }
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
        return PyErr_NoMemory();

    /* Duplicated allocation code from _PyObject_New() instead of a call to
     * PyObject_New() so we are able to allocate space for the object and
     * its data buffer.
     */
    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
    if (obj == NULL)
        return PyErr_NoMemory();
    obj = PyObject_INIT(obj, &PyUnicode_Type);
    if (obj == NULL)
        return NULL;

    /* the characters are stored right after the header in use */
    unicode = (PyCompactUnicodeObject *)obj;
    if (is_ascii)
        data = ((PyASCIIObject*)obj) + 1;
    else
        data = unicode + 1;
    _PyUnicode_LENGTH(unicode) = size;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = kind;
    _PyUnicode_STATE(unicode).compact = 1;
    _PyUnicode_STATE(unicode).ready = 1;
    _PyUnicode_STATE(unicode).ascii = is_ascii;
    if (is_ascii) {
        /* only the PyASCIIObject fields exist in this layout */
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
    }
    else if (kind == PyUnicode_1BYTE_KIND) {
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
    }
    else {
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
        if (kind == PyUnicode_2BYTE_KIND)
            ((Py_UCS2*)data)[size] = 0;
        else /* kind == PyUnicode_4BYTE_KIND */
            ((Py_UCS4*)data)[size] = 0;
        if (is_sharing) {
            _PyUnicode_WSTR_LENGTH(unicode) = size;
            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
        }
        else {
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
            _PyUnicode_WSTR(unicode) = NULL;
        }
    }
#ifdef Py_DEBUG
    unicode_fill_invalid((PyObject*)unicode, 0);
#endif
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
    return obj;
}

#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *iter;
    Py_UCS4 *ucs4_out;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);

    for (iter = begin; iter < end; ) {
        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
        /* a high surrogate followed by a low surrogate forms one code
           point; an unpaired surrogate is copied as is */
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
            && (iter+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
        {
            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
            iter += 2;
        }
        else {
            *ucs4_out++ = *iter;
            iter++;
        }
    }
    /* the output must have been sized for exactly the decoded length */
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));

}
#endif

/* Return 0 if unicode may be modified in place according to
   unicode_modifiable(); otherwise set SystemError and return -1. */
static int
unicode_check_modifiable(PyObject *unicode)
{
    if (!unicode_modifiable(unicode)) {
        PyErr_SetString(PyExc_SystemError,
                        "Cannot modify a string currently used");
        return -1;
    }
    return 0;
}

static int
_copy_characters(PyObject *to, Py_ssize_t to_start,
                 PyObject *from, Py_ssize_t from_start,
                 Py_ssize_t how_many, int check_maxchar)
{
    unsigned int from_kind, to_kind;
    void *from_data, *to_data;

assert(0 <= how_many); 1206 assert(0 <= from_start); 1207 assert(0 <= to_start); 1208 assert(PyUnicode_Check(from)); 1209 assert(PyUnicode_IS_READY(from)); 1210 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1211 1212 assert(PyUnicode_Check(to)); 1213 assert(PyUnicode_IS_READY(to)); 1214 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1215 1216 if (how_many == 0) 1217 return 0; 1218 1219 from_kind = PyUnicode_KIND(from); 1220 from_data = PyUnicode_DATA(from); 1221 to_kind = PyUnicode_KIND(to); 1222 to_data = PyUnicode_DATA(to); 1223 1224#ifdef Py_DEBUG 1225 if (!check_maxchar 1226 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1227 { 1228 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1229 Py_UCS4 ch; 1230 Py_ssize_t i; 1231 for (i=0; i < how_many; i++) { 1232 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1233 assert(ch <= to_maxchar); 1234 } 1235 } 1236#endif 1237 1238 if (from_kind == to_kind) { 1239 if (check_maxchar 1240 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1241 { 1242 /* Writing Latin-1 characters into an ASCII string requires to 1243 check that all written characters are pure ASCII */ 1244 Py_UCS4 max_char; 1245 max_char = ucs1lib_find_max_char(from_data, 1246 (Py_UCS1*)from_data + how_many); 1247 if (max_char >= 128) 1248 return -1; 1249 } 1250 Py_MEMCPY((char*)to_data + to_kind * to_start, 1251 (char*)from_data + from_kind * from_start, 1252 to_kind * how_many); 1253 } 1254 else if (from_kind == PyUnicode_1BYTE_KIND 1255 && to_kind == PyUnicode_2BYTE_KIND) 1256 { 1257 _PyUnicode_CONVERT_BYTES( 1258 Py_UCS1, Py_UCS2, 1259 PyUnicode_1BYTE_DATA(from) + from_start, 1260 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1261 PyUnicode_2BYTE_DATA(to) + to_start 1262 ); 1263 } 1264 else if (from_kind == PyUnicode_1BYTE_KIND 1265 && to_kind == PyUnicode_4BYTE_KIND) 1266 { 1267 _PyUnicode_CONVERT_BYTES( 1268 Py_UCS1, Py_UCS4, 1269 PyUnicode_1BYTE_DATA(from) + from_start, 1270 
PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1271 PyUnicode_4BYTE_DATA(to) + to_start 1272 ); 1273 } 1274 else if (from_kind == PyUnicode_2BYTE_KIND 1275 && to_kind == PyUnicode_4BYTE_KIND) 1276 { 1277 _PyUnicode_CONVERT_BYTES( 1278 Py_UCS2, Py_UCS4, 1279 PyUnicode_2BYTE_DATA(from) + from_start, 1280 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1281 PyUnicode_4BYTE_DATA(to) + to_start 1282 ); 1283 } 1284 else { 1285 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1286 1287 if (!check_maxchar) { 1288 if (from_kind == PyUnicode_2BYTE_KIND 1289 && to_kind == PyUnicode_1BYTE_KIND) 1290 { 1291 _PyUnicode_CONVERT_BYTES( 1292 Py_UCS2, Py_UCS1, 1293 PyUnicode_2BYTE_DATA(from) + from_start, 1294 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1295 PyUnicode_1BYTE_DATA(to) + to_start 1296 ); 1297 } 1298 else if (from_kind == PyUnicode_4BYTE_KIND 1299 && to_kind == PyUnicode_1BYTE_KIND) 1300 { 1301 _PyUnicode_CONVERT_BYTES( 1302 Py_UCS4, Py_UCS1, 1303 PyUnicode_4BYTE_DATA(from) + from_start, 1304 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1305 PyUnicode_1BYTE_DATA(to) + to_start 1306 ); 1307 } 1308 else if (from_kind == PyUnicode_4BYTE_KIND 1309 && to_kind == PyUnicode_2BYTE_KIND) 1310 { 1311 _PyUnicode_CONVERT_BYTES( 1312 Py_UCS4, Py_UCS2, 1313 PyUnicode_4BYTE_DATA(from) + from_start, 1314 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1315 PyUnicode_2BYTE_DATA(to) + to_start 1316 ); 1317 } 1318 else { 1319 assert(0); 1320 return -1; 1321 } 1322 } 1323 else { 1324 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1325 Py_UCS4 ch; 1326 Py_ssize_t i; 1327 1328 for (i=0; i < how_many; i++) { 1329 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1330 if (ch > to_maxchar) 1331 return -1; 1332 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1333 } 1334 } 1335 } 1336 return 0; 1337} 1338 1339void 1340_PyUnicode_FastCopyCharacters( 1341 PyObject *to, Py_ssize_t to_start, 1342 PyObject *from, Py_ssize_t 
from_start, Py_ssize_t how_many) 1343{ 1344 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1345} 1346 1347Py_ssize_t 1348PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1349 PyObject *from, Py_ssize_t from_start, 1350 Py_ssize_t how_many) 1351{ 1352 int err; 1353 1354 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1355 PyErr_BadInternalCall(); 1356 return -1; 1357 } 1358 1359 if (PyUnicode_READY(from) == -1) 1360 return -1; 1361 if (PyUnicode_READY(to) == -1) 1362 return -1; 1363 1364 if (from_start < 0) { 1365 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1366 return -1; 1367 } 1368 if (to_start < 0) { 1369 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1370 return -1; 1371 } 1372 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1373 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1374 PyErr_Format(PyExc_SystemError, 1375 "Cannot write %zi characters at %zi " 1376 "in a string of %zi characters", 1377 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1378 return -1; 1379 } 1380 1381 if (how_many == 0) 1382 return 0; 1383 1384 if (unicode_check_modifiable(to)) 1385 return -1; 1386 1387 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1388 if (err) { 1389 PyErr_Format(PyExc_SystemError, 1390 "Cannot copy %s characters " 1391 "into a string of %s characters", 1392 unicode_kind_name(from), 1393 unicode_kind_name(to)); 1394 return -1; 1395 } 1396 return how_many; 1397} 1398 1399/* Find the maximum code point and count the number of surrogate pairs so a 1400 correct string length can be computed before converting a string to UCS4. 1401 This function counts single surrogates as a character and not as a pair. 1402 1403 Return 0 on success, or -1 on error. 
*/ 1404static int 1405find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1406 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1407{ 1408 const wchar_t *iter; 1409 Py_UCS4 ch; 1410 1411 assert(num_surrogates != NULL && maxchar != NULL); 1412 *num_surrogates = 0; 1413 *maxchar = 0; 1414 1415 for (iter = begin; iter < end; ) { 1416#if SIZEOF_WCHAR_T == 2 1417 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1418 && (iter+1) < end 1419 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1420 { 1421 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1422 ++(*num_surrogates); 1423 iter += 2; 1424 } 1425 else 1426#endif 1427 { 1428 ch = *iter; 1429 iter++; 1430 } 1431 if (ch > *maxchar) { 1432 *maxchar = ch; 1433 if (*maxchar > MAX_UNICODE) { 1434 PyErr_Format(PyExc_ValueError, 1435 "character U+%x is not in range [U+0000; U+10ffff]", 1436 ch); 1437 return -1; 1438 } 1439 } 1440 } 1441 return 0; 1442} 1443 1444int 1445_PyUnicode_Ready(PyObject *unicode) 1446{ 1447 wchar_t *end; 1448 Py_UCS4 maxchar = 0; 1449 Py_ssize_t num_surrogates; 1450#if SIZEOF_WCHAR_T == 2 1451 Py_ssize_t length_wo_surrogates; 1452#endif 1453 1454 /* _PyUnicode_Ready() is only intended for old-style API usage where 1455 strings were created using _PyObject_New() and where no canonical 1456 representation (the str field) has been set yet aka strings 1457 which are not yet ready. 
*/ 1458 assert(_PyUnicode_CHECK(unicode)); 1459 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1460 assert(_PyUnicode_WSTR(unicode) != NULL); 1461 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1462 assert(_PyUnicode_UTF8(unicode) == NULL); 1463 /* Actually, it should neither be interned nor be anything else: */ 1464 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1465 1466 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1467 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1468 &maxchar, &num_surrogates) == -1) 1469 return -1; 1470 1471 if (maxchar < 256) { 1472 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1473 if (!_PyUnicode_DATA_ANY(unicode)) { 1474 PyErr_NoMemory(); 1475 return -1; 1476 } 1477 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1478 _PyUnicode_WSTR(unicode), end, 1479 PyUnicode_1BYTE_DATA(unicode)); 1480 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1481 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1482 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1483 if (maxchar < 128) { 1484 _PyUnicode_STATE(unicode).ascii = 1; 1485 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1486 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1487 } 1488 else { 1489 _PyUnicode_STATE(unicode).ascii = 0; 1490 _PyUnicode_UTF8(unicode) = NULL; 1491 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1492 } 1493 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1494 _PyUnicode_WSTR(unicode) = NULL; 1495 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1496 } 1497 /* In this case we might have to convert down from 4-byte native 1498 wchar_t to 2-byte unicode. */ 1499 else if (maxchar < 65536) { 1500 assert(num_surrogates == 0 && 1501 "FindMaxCharAndNumSurrogatePairs() messed up"); 1502 1503#if SIZEOF_WCHAR_T == 2 1504 /* We can share representations and are done. 
*/ 1505 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1506 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1507 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1508 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1509 _PyUnicode_UTF8(unicode) = NULL; 1510 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1511#else 1512 /* sizeof(wchar_t) == 4 */ 1513 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1514 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1515 if (!_PyUnicode_DATA_ANY(unicode)) { 1516 PyErr_NoMemory(); 1517 return -1; 1518 } 1519 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1520 _PyUnicode_WSTR(unicode), end, 1521 PyUnicode_2BYTE_DATA(unicode)); 1522 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1523 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1524 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1525 _PyUnicode_UTF8(unicode) = NULL; 1526 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1527 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1528 _PyUnicode_WSTR(unicode) = NULL; 1529 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1530#endif 1531 } 1532 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1533 else { 1534#if SIZEOF_WCHAR_T == 2 1535 /* in case the native representation is 2-bytes, we need to allocate a 1536 new normalized 4-byte version. 
*/ 1537 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1538 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1539 if (!_PyUnicode_DATA_ANY(unicode)) { 1540 PyErr_NoMemory(); 1541 return -1; 1542 } 1543 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1544 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1545 _PyUnicode_UTF8(unicode) = NULL; 1546 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1547 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1548 _PyUnicode_STATE(unicode).ready = 1; 1549 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1550 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1551 _PyUnicode_WSTR(unicode) = NULL; 1552 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1553#else 1554 assert(num_surrogates == 0); 1555 1556 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1557 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1558 _PyUnicode_UTF8(unicode) = NULL; 1559 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1560 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1561#endif 1562 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1563 } 1564 _PyUnicode_STATE(unicode).ready = 1; 1565 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1566 return 0; 1567} 1568 1569static void 1570unicode_dealloc(PyObject *unicode) 1571{ 1572 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1573 case SSTATE_NOT_INTERNED: 1574 break; 1575 1576 case SSTATE_INTERNED_MORTAL: 1577 /* revive dead object temporarily for DelItem */ 1578 Py_REFCNT(unicode) = 3; 1579 if (PyDict_DelItem(interned, unicode) != 0) 1580 Py_FatalError( 1581 "deletion of interned string failed"); 1582 break; 1583 1584 case SSTATE_INTERNED_IMMORTAL: 1585 Py_FatalError("Immortal interned string died."); 1586 1587 default: 1588 Py_FatalError("Inconsistent interned string state."); 1589 } 1590 1591 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1592 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1593 if 
(_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1594 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1595 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1596 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1597 1598 Py_TYPE(unicode)->tp_free(unicode); 1599} 1600 1601#ifdef Py_DEBUG 1602static int 1603unicode_is_singleton(PyObject *unicode) 1604{ 1605 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1606 if (unicode == unicode_empty) 1607 return 1; 1608 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1609 { 1610 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1611 if (ch < 256 && unicode_latin1[ch] == unicode) 1612 return 1; 1613 } 1614 return 0; 1615} 1616#endif 1617 1618static int 1619unicode_modifiable(PyObject *unicode) 1620{ 1621 assert(_PyUnicode_CHECK(unicode)); 1622 if (Py_REFCNT(unicode) != 1) 1623 return 0; 1624 if (_PyUnicode_HASH(unicode) != -1) 1625 return 0; 1626 if (PyUnicode_CHECK_INTERNED(unicode)) 1627 return 0; 1628 if (!PyUnicode_CheckExact(unicode)) 1629 return 0; 1630#ifdef Py_DEBUG 1631 /* singleton refcount is greater than 1 */ 1632 assert(!unicode_is_singleton(unicode)); 1633#endif 1634 return 1; 1635} 1636 1637static int 1638unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1639{ 1640 PyObject *unicode; 1641 Py_ssize_t old_length; 1642 1643 assert(p_unicode != NULL); 1644 unicode = *p_unicode; 1645 1646 assert(unicode != NULL); 1647 assert(PyUnicode_Check(unicode)); 1648 assert(0 <= length); 1649 1650 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1651 old_length = PyUnicode_WSTR_LENGTH(unicode); 1652 else 1653 old_length = PyUnicode_GET_LENGTH(unicode); 1654 if (old_length == length) 1655 return 0; 1656 1657 if (length == 0) { 1658 _Py_INCREF_UNICODE_EMPTY(); 1659 if (!unicode_empty) 1660 return -1; 1661 Py_DECREF(*p_unicode); 1662 *p_unicode = unicode_empty; 1663 return 0; 1664 } 1665 1666 if (!unicode_modifiable(unicode)) { 1667 PyObject *copy = resize_copy(unicode, length); 1668 if (copy == NULL) 1669 
return -1; 1670 Py_DECREF(*p_unicode); 1671 *p_unicode = copy; 1672 return 0; 1673 } 1674 1675 if (PyUnicode_IS_COMPACT(unicode)) { 1676 PyObject *new_unicode = resize_compact(unicode, length); 1677 if (new_unicode == NULL) 1678 return -1; 1679 *p_unicode = new_unicode; 1680 return 0; 1681 } 1682 return resize_inplace(unicode, length); 1683} 1684 1685int 1686PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1687{ 1688 PyObject *unicode; 1689 if (p_unicode == NULL) { 1690 PyErr_BadInternalCall(); 1691 return -1; 1692 } 1693 unicode = *p_unicode; 1694 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1695 { 1696 PyErr_BadInternalCall(); 1697 return -1; 1698 } 1699 return unicode_resize(p_unicode, length); 1700} 1701 1702/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1703 1704 WARNING: The function doesn't copy the terminating null character and 1705 doesn't check the maximum character (may write a latin1 character in an 1706 ASCII string). */ 1707static void 1708unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1709 const char *str, Py_ssize_t len) 1710{ 1711 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1712 void *data = PyUnicode_DATA(unicode); 1713 const char *end = str + len; 1714 1715 switch (kind) { 1716 case PyUnicode_1BYTE_KIND: { 1717 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1718#ifdef Py_DEBUG 1719 if (PyUnicode_IS_ASCII(unicode)) { 1720 Py_UCS4 maxchar = ucs1lib_find_max_char( 1721 (const Py_UCS1*)str, 1722 (const Py_UCS1*)str + len); 1723 assert(maxchar < 128); 1724 } 1725#endif 1726 memcpy((char *) data + index, str, len); 1727 break; 1728 } 1729 case PyUnicode_2BYTE_KIND: { 1730 Py_UCS2 *start = (Py_UCS2 *)data + index; 1731 Py_UCS2 *ucs2 = start; 1732 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1733 1734 for (; str < end; ++ucs2, ++str) 1735 *ucs2 = (Py_UCS2)*str; 1736 1737 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1738 break; 1739 } 1740 default: { 1741 Py_UCS4 
*start = (Py_UCS4 *)data + index; 1742 Py_UCS4 *ucs4 = start; 1743 assert(kind == PyUnicode_4BYTE_KIND); 1744 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1745 1746 for (; str < end; ++ucs4, ++str) 1747 *ucs4 = (Py_UCS4)*str; 1748 1749 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1750 } 1751 } 1752} 1753 1754static PyObject* 1755get_latin1_char(unsigned char ch) 1756{ 1757 PyObject *unicode = unicode_latin1[ch]; 1758 if (!unicode) { 1759 unicode = PyUnicode_New(1, ch); 1760 if (!unicode) 1761 return NULL; 1762 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1763 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1764 unicode_latin1[ch] = unicode; 1765 } 1766 Py_INCREF(unicode); 1767 return unicode; 1768} 1769 1770static PyObject* 1771unicode_char(Py_UCS4 ch) 1772{ 1773 PyObject *unicode; 1774 1775 assert(ch <= MAX_UNICODE); 1776 1777 if (ch < 256) 1778 return get_latin1_char(ch); 1779 1780 unicode = PyUnicode_New(1, ch); 1781 if (unicode == NULL) 1782 return NULL; 1783 switch (PyUnicode_KIND(unicode)) { 1784 case PyUnicode_1BYTE_KIND: 1785 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch; 1786 break; 1787 case PyUnicode_2BYTE_KIND: 1788 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; 1789 break; 1790 default: 1791 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1792 PyUnicode_4BYTE_DATA(unicode)[0] = ch; 1793 } 1794 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1795 return unicode; 1796} 1797 1798PyObject * 1799PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1800{ 1801 PyObject *unicode; 1802 Py_UCS4 maxchar = 0; 1803 Py_ssize_t num_surrogates; 1804 1805 if (u == NULL) 1806 return (PyObject*)_PyUnicode_New(size); 1807 1808 /* If the Unicode data is known at construction time, we can apply 1809 some optimizations which share commonly used objects. 
*/ 1810 1811 /* Optimization for empty strings */ 1812 if (size == 0) 1813 _Py_RETURN_UNICODE_EMPTY(); 1814 1815 /* Single character Unicode objects in the Latin-1 range are 1816 shared when using this constructor */ 1817 if (size == 1 && (Py_UCS4)*u < 256) 1818 return get_latin1_char((unsigned char)*u); 1819 1820 /* If not empty and not single character, copy the Unicode data 1821 into the new object */ 1822 if (find_maxchar_surrogates(u, u + size, 1823 &maxchar, &num_surrogates) == -1) 1824 return NULL; 1825 1826 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1827 if (!unicode) 1828 return NULL; 1829 1830 switch (PyUnicode_KIND(unicode)) { 1831 case PyUnicode_1BYTE_KIND: 1832 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1833 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1834 break; 1835 case PyUnicode_2BYTE_KIND: 1836#if Py_UNICODE_SIZE == 2 1837 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1838#else 1839 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1840 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1841#endif 1842 break; 1843 case PyUnicode_4BYTE_KIND: 1844#if SIZEOF_WCHAR_T == 2 1845 /* This is the only case which has to process surrogates, thus 1846 a simple copy loop is not enough and we need a function. 
*/ 1847 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1848#else 1849 assert(num_surrogates == 0); 1850 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1851#endif 1852 break; 1853 default: 1854 assert(0 && "Impossible state"); 1855 } 1856 1857 return unicode_result(unicode); 1858} 1859 1860PyObject * 1861PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1862{ 1863 if (size < 0) { 1864 PyErr_SetString(PyExc_SystemError, 1865 "Negative size passed to PyUnicode_FromStringAndSize"); 1866 return NULL; 1867 } 1868 if (u != NULL) 1869 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1870 else 1871 return (PyObject *)_PyUnicode_New(size); 1872} 1873 1874PyObject * 1875PyUnicode_FromString(const char *u) 1876{ 1877 size_t size = strlen(u); 1878 if (size > PY_SSIZE_T_MAX) { 1879 PyErr_SetString(PyExc_OverflowError, "input too long"); 1880 return NULL; 1881 } 1882 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1883} 1884 1885PyObject * 1886_PyUnicode_FromId(_Py_Identifier *id) 1887{ 1888 if (!id->object) { 1889 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1890 strlen(id->string), 1891 NULL, NULL); 1892 if (!id->object) 1893 return NULL; 1894 PyUnicode_InternInPlace(&id->object); 1895 assert(!id->next); 1896 id->next = static_strings; 1897 static_strings = id; 1898 } 1899 return id->object; 1900} 1901 1902void 1903_PyUnicode_ClearStaticStrings() 1904{ 1905 _Py_Identifier *tmp, *s = static_strings; 1906 while (s) { 1907 Py_CLEAR(s->object); 1908 tmp = s->next; 1909 s->next = NULL; 1910 s = tmp; 1911 } 1912 static_strings = NULL; 1913} 1914 1915/* Internal function, doesn't check maximum character */ 1916 1917PyObject* 1918_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1919{ 1920 const unsigned char *s = (const unsigned char *)buffer; 1921 PyObject *unicode; 1922 if (size == 1) { 1923#ifdef Py_DEBUG 1924 assert((unsigned char)s[0] < 128); 1925#endif 1926 return get_latin1_char(s[0]); 1927 } 1928 unicode = 
PyUnicode_New(size, 127); 1929 if (!unicode) 1930 return NULL; 1931 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1932 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1933 return unicode; 1934} 1935 1936static Py_UCS4 1937kind_maxchar_limit(unsigned int kind) 1938{ 1939 switch (kind) { 1940 case PyUnicode_1BYTE_KIND: 1941 return 0x80; 1942 case PyUnicode_2BYTE_KIND: 1943 return 0x100; 1944 case PyUnicode_4BYTE_KIND: 1945 return 0x10000; 1946 default: 1947 assert(0 && "invalid kind"); 1948 return MAX_UNICODE; 1949 } 1950} 1951 1952Py_LOCAL_INLINE(Py_UCS4) 1953align_maxchar(Py_UCS4 maxchar) 1954{ 1955 if (maxchar <= 127) 1956 return 127; 1957 else if (maxchar <= 255) 1958 return 255; 1959 else if (maxchar <= 65535) 1960 return 65535; 1961 else 1962 return MAX_UNICODE; 1963} 1964 1965static PyObject* 1966_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) 1967{ 1968 PyObject *res; 1969 unsigned char max_char; 1970 1971 if (size == 0) 1972 _Py_RETURN_UNICODE_EMPTY(); 1973 assert(size > 0); 1974 if (size == 1) 1975 return get_latin1_char(u[0]); 1976 1977 max_char = ucs1lib_find_max_char(u, u + size); 1978 res = PyUnicode_New(size, max_char); 1979 if (!res) 1980 return NULL; 1981 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1982 assert(_PyUnicode_CheckConsistency(res, 1)); 1983 return res; 1984} 1985 1986static PyObject* 1987_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1988{ 1989 PyObject *res; 1990 Py_UCS2 max_char; 1991 1992 if (size == 0) 1993 _Py_RETURN_UNICODE_EMPTY(); 1994 assert(size > 0); 1995 if (size == 1) 1996 return unicode_char(u[0]); 1997 1998 max_char = ucs2lib_find_max_char(u, u + size); 1999 res = PyUnicode_New(size, max_char); 2000 if (!res) 2001 return NULL; 2002 if (max_char >= 256) 2003 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 2004 else { 2005 _PyUnicode_CONVERT_BYTES( 2006 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 2007 } 2008 assert(_PyUnicode_CheckConsistency(res, 1)); 2009 return res; 2010} 2011 
2012static PyObject* 2013_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 2014{ 2015 PyObject *res; 2016 Py_UCS4 max_char; 2017 2018 if (size == 0) 2019 _Py_RETURN_UNICODE_EMPTY(); 2020 assert(size > 0); 2021 if (size == 1) 2022 return unicode_char(u[0]); 2023 2024 max_char = ucs4lib_find_max_char(u, u + size); 2025 res = PyUnicode_New(size, max_char); 2026 if (!res) 2027 return NULL; 2028 if (max_char < 256) 2029 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2030 PyUnicode_1BYTE_DATA(res)); 2031 else if (max_char < 0x10000) 2032 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2033 PyUnicode_2BYTE_DATA(res)); 2034 else 2035 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2036 assert(_PyUnicode_CheckConsistency(res, 1)); 2037 return res; 2038} 2039 2040PyObject* 2041PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2042{ 2043 if (size < 0) { 2044 PyErr_SetString(PyExc_ValueError, "size must be positive"); 2045 return NULL; 2046 } 2047 switch (kind) { 2048 case PyUnicode_1BYTE_KIND: 2049 return _PyUnicode_FromUCS1(buffer, size); 2050 case PyUnicode_2BYTE_KIND: 2051 return _PyUnicode_FromUCS2(buffer, size); 2052 case PyUnicode_4BYTE_KIND: 2053 return _PyUnicode_FromUCS4(buffer, size); 2054 default: 2055 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2056 return NULL; 2057 } 2058} 2059 2060Py_UCS4 2061_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2062{ 2063 enum PyUnicode_Kind kind; 2064 void *startptr, *endptr; 2065 2066 assert(PyUnicode_IS_READY(unicode)); 2067 assert(0 <= start); 2068 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2069 assert(start <= end); 2070 2071 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2072 return PyUnicode_MAX_CHAR_VALUE(unicode); 2073 2074 if (start == end) 2075 return 127; 2076 2077 if (PyUnicode_IS_ASCII(unicode)) 2078 return 127; 2079 2080 kind = PyUnicode_KIND(unicode); 2081 startptr = PyUnicode_DATA(unicode); 2082 endptr = (char 
*)startptr + end * kind; 2083 startptr = (char *)startptr + start * kind; 2084 switch(kind) { 2085 case PyUnicode_1BYTE_KIND: 2086 return ucs1lib_find_max_char(startptr, endptr); 2087 case PyUnicode_2BYTE_KIND: 2088 return ucs2lib_find_max_char(startptr, endptr); 2089 case PyUnicode_4BYTE_KIND: 2090 return ucs4lib_find_max_char(startptr, endptr); 2091 default: 2092 assert(0); 2093 return 0; 2094 } 2095} 2096 2097/* Ensure that a string uses the most efficient storage, if it is not the 2098 case: create a new string with of the right kind. Write NULL into *p_unicode 2099 on error. */ 2100static void 2101unicode_adjust_maxchar(PyObject **p_unicode) 2102{ 2103 PyObject *unicode, *copy; 2104 Py_UCS4 max_char; 2105 Py_ssize_t len; 2106 unsigned int kind; 2107 2108 assert(p_unicode != NULL); 2109 unicode = *p_unicode; 2110 assert(PyUnicode_IS_READY(unicode)); 2111 if (PyUnicode_IS_ASCII(unicode)) 2112 return; 2113 2114 len = PyUnicode_GET_LENGTH(unicode); 2115 kind = PyUnicode_KIND(unicode); 2116 if (kind == PyUnicode_1BYTE_KIND) { 2117 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2118 max_char = ucs1lib_find_max_char(u, u + len); 2119 if (max_char >= 128) 2120 return; 2121 } 2122 else if (kind == PyUnicode_2BYTE_KIND) { 2123 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2124 max_char = ucs2lib_find_max_char(u, u + len); 2125 if (max_char >= 256) 2126 return; 2127 } 2128 else { 2129 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2130 assert(kind == PyUnicode_4BYTE_KIND); 2131 max_char = ucs4lib_find_max_char(u, u + len); 2132 if (max_char >= 0x10000) 2133 return; 2134 } 2135 copy = PyUnicode_New(len, max_char); 2136 if (copy != NULL) 2137 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2138 Py_DECREF(unicode); 2139 *p_unicode = copy; 2140} 2141 2142PyObject* 2143_PyUnicode_Copy(PyObject *unicode) 2144{ 2145 Py_ssize_t length; 2146 PyObject *copy; 2147 2148 if (!PyUnicode_Check(unicode)) { 2149 PyErr_BadInternalCall(); 2150 return NULL; 2151 } 2152 if 
(PyUnicode_READY(unicode) == -1) 2153 return NULL; 2154 2155 length = PyUnicode_GET_LENGTH(unicode); 2156 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2157 if (!copy) 2158 return NULL; 2159 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2160 2161 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2162 length * PyUnicode_KIND(unicode)); 2163 assert(_PyUnicode_CheckConsistency(copy, 1)); 2164 return copy; 2165} 2166 2167 2168/* Widen Unicode objects to larger buffers. Don't write terminating null 2169 character. Return NULL on error. */ 2170 2171void* 2172_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2173{ 2174 Py_ssize_t len; 2175 void *result; 2176 unsigned int skind; 2177 2178 if (PyUnicode_READY(s) == -1) 2179 return NULL; 2180 2181 len = PyUnicode_GET_LENGTH(s); 2182 skind = PyUnicode_KIND(s); 2183 if (skind >= kind) { 2184 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2185 return NULL; 2186 } 2187 switch (kind) { 2188 case PyUnicode_2BYTE_KIND: 2189 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 2190 if (!result) 2191 return PyErr_NoMemory(); 2192 assert(skind == PyUnicode_1BYTE_KIND); 2193 _PyUnicode_CONVERT_BYTES( 2194 Py_UCS1, Py_UCS2, 2195 PyUnicode_1BYTE_DATA(s), 2196 PyUnicode_1BYTE_DATA(s) + len, 2197 result); 2198 return result; 2199 case PyUnicode_4BYTE_KIND: 2200 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 2201 if (!result) 2202 return PyErr_NoMemory(); 2203 if (skind == PyUnicode_2BYTE_KIND) { 2204 _PyUnicode_CONVERT_BYTES( 2205 Py_UCS2, Py_UCS4, 2206 PyUnicode_2BYTE_DATA(s), 2207 PyUnicode_2BYTE_DATA(s) + len, 2208 result); 2209 } 2210 else { 2211 assert(skind == PyUnicode_1BYTE_KIND); 2212 _PyUnicode_CONVERT_BYTES( 2213 Py_UCS1, Py_UCS4, 2214 PyUnicode_1BYTE_DATA(s), 2215 PyUnicode_1BYTE_DATA(s) + len, 2216 result); 2217 } 2218 return result; 2219 default: 2220 break; 2221 } 2222 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2223 return NULL; 2224} 2225 2226static Py_UCS4* 
2227as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2228 int copy_null) 2229{ 2230 int kind; 2231 void *data; 2232 Py_ssize_t len, targetlen; 2233 if (PyUnicode_READY(string) == -1) 2234 return NULL; 2235 kind = PyUnicode_KIND(string); 2236 data = PyUnicode_DATA(string); 2237 len = PyUnicode_GET_LENGTH(string); 2238 targetlen = len; 2239 if (copy_null) 2240 targetlen++; 2241 if (!target) { 2242 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 2243 PyErr_NoMemory(); 2244 return NULL; 2245 } 2246 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 2247 if (!target) { 2248 PyErr_NoMemory(); 2249 return NULL; 2250 } 2251 } 2252 else { 2253 if (targetsize < targetlen) { 2254 PyErr_Format(PyExc_SystemError, 2255 "string is longer than the buffer"); 2256 if (copy_null && 0 < targetsize) 2257 target[0] = 0; 2258 return NULL; 2259 } 2260 } 2261 if (kind == PyUnicode_1BYTE_KIND) { 2262 Py_UCS1 *start = (Py_UCS1 *) data; 2263 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2264 } 2265 else if (kind == PyUnicode_2BYTE_KIND) { 2266 Py_UCS2 *start = (Py_UCS2 *) data; 2267 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2268 } 2269 else { 2270 assert(kind == PyUnicode_4BYTE_KIND); 2271 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2272 } 2273 if (copy_null) 2274 target[len] = 0; 2275 return target; 2276} 2277 2278Py_UCS4* 2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2280 int copy_null) 2281{ 2282 if (target == NULL || targetsize < 0) { 2283 PyErr_BadInternalCall(); 2284 return NULL; 2285 } 2286 return as_ucs4(string, target, targetsize, copy_null); 2287} 2288 2289Py_UCS4* 2290PyUnicode_AsUCS4Copy(PyObject *string) 2291{ 2292 return as_ucs4(string, NULL, 0, 1); 2293} 2294 2295#ifdef HAVE_WCHAR_H 2296 2297PyObject * 2298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size) 2299{ 2300 if (w == NULL) { 2301 if (size == 0) 2302 _Py_RETURN_UNICODE_EMPTY(); 2303 
PyErr_BadInternalCall(); 2304 return NULL; 2305 } 2306 2307 if (size == -1) { 2308 size = wcslen(w); 2309 } 2310 2311 return PyUnicode_FromUnicode(w, size); 2312} 2313 2314#endif /* HAVE_WCHAR_H */ 2315 2316static void 2317makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2318 char c) 2319{ 2320 *fmt++ = '%'; 2321 if (longflag) 2322 *fmt++ = 'l'; 2323 else if (longlongflag) { 2324 /* longlongflag should only ever be nonzero on machines with 2325 HAVE_LONG_LONG defined */ 2326#ifdef HAVE_LONG_LONG 2327 char *f = PY_FORMAT_LONG_LONG; 2328 while (*f) 2329 *fmt++ = *f++; 2330#else 2331 /* we shouldn't ever get here */ 2332 assert(0); 2333 *fmt++ = 'l'; 2334#endif 2335 } 2336 else if (size_tflag) { 2337 char *f = PY_FORMAT_SIZE_T; 2338 while (*f) 2339 *fmt++ = *f++; 2340 } 2341 *fmt++ = c; 2342 *fmt = '\0'; 2343} 2344 2345/* maximum number of characters required for output of %lld or %p. 2346 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2347 plus 1 for the sign. 53/22 is an upper bound for log10(256). 
*/ 2348#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2349 2350static int 2351unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, 2352 Py_ssize_t width, Py_ssize_t precision) 2353{ 2354 Py_ssize_t length, fill, arglen; 2355 Py_UCS4 maxchar; 2356 2357 if (PyUnicode_READY(str) == -1) 2358 return -1; 2359 2360 length = PyUnicode_GET_LENGTH(str); 2361 if ((precision == -1 || precision >= length) 2362 && width <= length) 2363 return _PyUnicodeWriter_WriteStr(writer, str); 2364 2365 if (precision != -1) 2366 length = Py_MIN(precision, length); 2367 2368 arglen = Py_MAX(length, width); 2369 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 2370 maxchar = _PyUnicode_FindMaxChar(str, 0, length); 2371 else 2372 maxchar = writer->maxchar; 2373 2374 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) 2375 return -1; 2376 2377 if (width > length) { 2378 fill = width - length; 2379 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) 2380 return -1; 2381 writer->pos += fill; 2382 } 2383 2384 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 2385 str, 0, length); 2386 writer->pos += length; 2387 return 0; 2388} 2389 2390static int 2391unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, 2392 Py_ssize_t width, Py_ssize_t precision) 2393{ 2394 /* UTF-8 */ 2395 Py_ssize_t length; 2396 PyObject *unicode; 2397 int res; 2398 2399 length = strlen(str); 2400 if (precision != -1) 2401 length = Py_MIN(length, precision); 2402 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); 2403 if (unicode == NULL) 2404 return -1; 2405 2406 res = unicode_fromformat_write_str(writer, unicode, width, -1); 2407 Py_DECREF(unicode); 2408 return res; 2409} 2410 2411static const char* 2412unicode_fromformat_arg(_PyUnicodeWriter *writer, 2413 const char *f, va_list *vargs) 2414{ 2415 const char *p; 2416 Py_ssize_t len; 2417 int zeropad; 2418 Py_ssize_t width; 2419 Py_ssize_t precision; 2420 int longflag; 
/* Process one conversion specification of a PyUnicode_FromFormatV()
   format string.

   `f` points to the '%' that starts the specification; the matching
   argument(s), if any, are fetched from `*vargs` and the formatted text is
   appended to `writer`.

   Return a pointer to the first character after the specification, or
   NULL with an exception set on error.  For an unknown conversion, the
   rest of the format string is copied verbatim and the returned pointer
   is the end of the format string. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember where the specification starts (used by the default case)
       and skip the '%' */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* check that width*10 + digit fits in a Py_ssize_t before
               computing it */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow check as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        /* the modifier is only consumed when an integer conversion
           (d, u or i) follows it */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* this conversion is the last item of the format string: stop
       overallocating the writer buffer */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single character given as its code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char fmt[10]; /* should be enough for "%0lld\0" */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* The number is formatted by sprintf() without width or
           precision; both are applied below.  The va_arg() type must
           match the length modifier that was parsed above. */
        if (*f == 'u') {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* %x always reads an int: no length modifier is passed on */
            makefmt(fmt, 0, 0, 0, 'x');
            len = sprintf(buffer, fmt, va_arg(*vargs, int));
        }
        else {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* the precision is a minimum number of characters here */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        if (width > precision) {
            /* pad up to the width with '0' or ' ' */
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        if (precision > len) {
            /* pad up to the precision with zeros */
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* shift the digits (and the null character) to make room
               for the "0x" prefix */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* a str object */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* two arguments are always consumed: a str object, used when it
           is not NULL, and a C string used otherwise */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* result of PyObject_Str() */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* result of PyObject_Repr() */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* result of PyObject_ASCII() */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* literal percent sign: no argument is consumed */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* skip the conversion character */
    f++;
    return f;
}
PyObject *obj = va_arg(*vargs, PyObject *); 2655 PyObject *str; 2656 assert(obj); 2657 str = PyObject_Str(obj); 2658 if (!str) 2659 return NULL; 2660 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { 2661 Py_DECREF(str); 2662 return NULL; 2663 } 2664 Py_DECREF(str); 2665 break; 2666 } 2667 2668 case 'R': 2669 { 2670 PyObject *obj = va_arg(*vargs, PyObject *); 2671 PyObject *repr; 2672 assert(obj); 2673 repr = PyObject_Repr(obj); 2674 if (!repr) 2675 return NULL; 2676 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { 2677 Py_DECREF(repr); 2678 return NULL; 2679 } 2680 Py_DECREF(repr); 2681 break; 2682 } 2683 2684 case 'A': 2685 { 2686 PyObject *obj = va_arg(*vargs, PyObject *); 2687 PyObject *ascii; 2688 assert(obj); 2689 ascii = PyObject_ASCII(obj); 2690 if (!ascii) 2691 return NULL; 2692 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { 2693 Py_DECREF(ascii); 2694 return NULL; 2695 } 2696 Py_DECREF(ascii); 2697 break; 2698 } 2699 2700 case '%': 2701 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 2702 return NULL; 2703 break; 2704 2705 default: 2706 /* if we stumble upon an unknown formatting code, copy the rest 2707 of the format string to the output string. (we cannot just 2708 skip the code, since there's no way to know what's in the 2709 argument list) */ 2710 len = strlen(p); 2711 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) 2712 return NULL; 2713 f = p+len; 2714 return f; 2715 } 2716 2717 f++; 2718 return f; 2719} 2720 2721PyObject * 2722PyUnicode_FromFormatV(const char *format, va_list vargs) 2723{ 2724 va_list vargs2; 2725 const char *f; 2726 _PyUnicodeWriter writer; 2727 2728 _PyUnicodeWriter_Init(&writer); 2729 writer.min_length = strlen(format) + 100; 2730 writer.overallocate = 1; 2731 2732 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). 2733 Copy it to be able to pass a reference to a subfunction. 
*/ 2734 Py_VA_COPY(vargs2, vargs); 2735 2736 for (f = format; *f; ) { 2737 if (*f == '%') { 2738 f = unicode_fromformat_arg(&writer, f, &vargs2); 2739 if (f == NULL) 2740 goto fail; 2741 } 2742 else { 2743 const char *p; 2744 Py_ssize_t len; 2745 2746 p = f; 2747 do 2748 { 2749 if ((unsigned char)*p > 127) { 2750 PyErr_Format(PyExc_ValueError, 2751 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2752 "string, got a non-ASCII byte: 0x%02x", 2753 (unsigned char)*p); 2754 return NULL; 2755 } 2756 p++; 2757 } 2758 while (*p != '\0' && *p != '%'); 2759 len = p - f; 2760 2761 if (*p == '\0') 2762 writer.overallocate = 0; 2763 2764 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) 2765 goto fail; 2766 2767 f = p; 2768 } 2769 } 2770 return _PyUnicodeWriter_Finish(&writer); 2771 2772 fail: 2773 _PyUnicodeWriter_Dealloc(&writer); 2774 return NULL; 2775} 2776 2777PyObject * 2778PyUnicode_FromFormat(const char *format, ...) 2779{ 2780 PyObject* ret; 2781 va_list vargs; 2782 2783#ifdef HAVE_STDARG_PROTOTYPES 2784 va_start(vargs, format); 2785#else 2786 va_start(vargs); 2787#endif 2788 ret = PyUnicode_FromFormatV(format, vargs); 2789 va_end(vargs); 2790 return ret; 2791} 2792 2793#ifdef HAVE_WCHAR_H 2794 2795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2796 convert a Unicode object to a wide character string. 2797 2798 - If w is NULL: return the number of wide characters (including the null 2799 character) required to convert the unicode object. Ignore size argument. 2800 2801 - Otherwise: return the number of wide characters (excluding the null 2802 character) written into w. Write at most size wide characters (including 2803 the null character). 
*/ 2804static Py_ssize_t 2805unicode_aswidechar(PyObject *unicode, 2806 wchar_t *w, 2807 Py_ssize_t size) 2808{ 2809 Py_ssize_t res; 2810 const wchar_t *wstr; 2811 2812 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2813 if (wstr == NULL) 2814 return -1; 2815 2816 if (w != NULL) { 2817 if (size > res) 2818 size = res + 1; 2819 else 2820 res = size; 2821 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2822 return res; 2823 } 2824 else 2825 return res + 1; 2826} 2827 2828Py_ssize_t 2829PyUnicode_AsWideChar(PyObject *unicode, 2830 wchar_t *w, 2831 Py_ssize_t size) 2832{ 2833 if (unicode == NULL) { 2834 PyErr_BadInternalCall(); 2835 return -1; 2836 } 2837 return unicode_aswidechar(unicode, w, size); 2838} 2839 2840wchar_t* 2841PyUnicode_AsWideCharString(PyObject *unicode, 2842 Py_ssize_t *size) 2843{ 2844 wchar_t* buffer; 2845 Py_ssize_t buflen; 2846 2847 if (unicode == NULL) { 2848 PyErr_BadInternalCall(); 2849 return NULL; 2850 } 2851 2852 buflen = unicode_aswidechar(unicode, NULL, 0); 2853 if (buflen == -1) 2854 return NULL; 2855 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2856 PyErr_NoMemory(); 2857 return NULL; 2858 } 2859 2860 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2861 if (buffer == NULL) { 2862 PyErr_NoMemory(); 2863 return NULL; 2864 } 2865 buflen = unicode_aswidechar(unicode, buffer, buflen); 2866 if (buflen == -1) { 2867 PyMem_FREE(buffer); 2868 return NULL; 2869 } 2870 if (size != NULL) 2871 *size = buflen; 2872 return buffer; 2873} 2874 2875#endif /* HAVE_WCHAR_H */ 2876 2877PyObject * 2878PyUnicode_FromOrdinal(int ordinal) 2879{ 2880 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2881 PyErr_SetString(PyExc_ValueError, 2882 "chr() arg not in range(0x110000)"); 2883 return NULL; 2884 } 2885 2886 return unicode_char((Py_UCS4)ordinal); 2887} 2888 2889PyObject * 2890PyUnicode_FromObject(PyObject *obj) 2891{ 2892 /* XXX Perhaps we should make this API an alias of 2893 PyObject_Str() instead ?! 
*/ 2894 if (PyUnicode_CheckExact(obj)) { 2895 if (PyUnicode_READY(obj) == -1) 2896 return NULL; 2897 Py_INCREF(obj); 2898 return obj; 2899 } 2900 if (PyUnicode_Check(obj)) { 2901 /* For a Unicode subtype that's not a Unicode object, 2902 return a true Unicode object with the same data. */ 2903 return _PyUnicode_Copy(obj); 2904 } 2905 PyErr_Format(PyExc_TypeError, 2906 "Can't convert '%.100s' object to str implicitly", 2907 Py_TYPE(obj)->tp_name); 2908 return NULL; 2909} 2910 2911PyObject * 2912PyUnicode_FromEncodedObject(PyObject *obj, 2913 const char *encoding, 2914 const char *errors) 2915{ 2916 Py_buffer buffer; 2917 PyObject *v; 2918 2919 if (obj == NULL) { 2920 PyErr_BadInternalCall(); 2921 return NULL; 2922 } 2923 2924 /* Decoding bytes objects is the most common case and should be fast */ 2925 if (PyBytes_Check(obj)) { 2926 if (PyBytes_GET_SIZE(obj) == 0) 2927 _Py_RETURN_UNICODE_EMPTY(); 2928 v = PyUnicode_Decode( 2929 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2930 encoding, errors); 2931 return v; 2932 } 2933 2934 if (PyUnicode_Check(obj)) { 2935 PyErr_SetString(PyExc_TypeError, 2936 "decoding str is not supported"); 2937 return NULL; 2938 } 2939 2940 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2941 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2942 PyErr_Format(PyExc_TypeError, 2943 "coercing to str: need bytes, bytearray " 2944 "or buffer-like object, %.80s found", 2945 Py_TYPE(obj)->tp_name); 2946 return NULL; 2947 } 2948 2949 if (buffer.len == 0) { 2950 PyBuffer_Release(&buffer); 2951 _Py_RETURN_UNICODE_EMPTY(); 2952 } 2953 2954 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2955 PyBuffer_Release(&buffer); 2956 return v; 2957} 2958 2959/* Convert encoding to lower case and replace '_' with '-' in order to 2960 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2961 1 on success. 
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result, including its terminating null character, is stored in
   `lower`, a buffer of `lower_len` bytes.

   Return 0 on error (encoding is longer than lower_len-1, or the buffer
   is empty), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* An empty buffer cannot even hold the terminating null character.
       Without this check, lower_len - 1 below would wrap around and the
       bound on the copy would be lost. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last byte of the buffer: reserved for the null character. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
PyMemoryView_FromBuffer(&info); 3036 if (buffer == NULL) 3037 goto onError; 3038 unicode = _PyCodec_DecodeText(buffer, encoding, errors); 3039 if (unicode == NULL) 3040 goto onError; 3041 if (!PyUnicode_Check(unicode)) { 3042 PyErr_Format(PyExc_TypeError, 3043 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3044 "use codecs.decode() to decode to arbitrary types", 3045 encoding, 3046 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3047 Py_DECREF(unicode); 3048 goto onError; 3049 } 3050 Py_DECREF(buffer); 3051 return unicode_result(unicode); 3052 3053 onError: 3054 Py_XDECREF(buffer); 3055 return NULL; 3056} 3057 3058PyObject * 3059PyUnicode_AsDecodedObject(PyObject *unicode, 3060 const char *encoding, 3061 const char *errors) 3062{ 3063 PyObject *v; 3064 3065 if (!PyUnicode_Check(unicode)) { 3066 PyErr_BadArgument(); 3067 goto onError; 3068 } 3069 3070 if (encoding == NULL) 3071 encoding = PyUnicode_GetDefaultEncoding(); 3072 3073 /* Decode via the codec registry */ 3074 v = PyCodec_Decode(unicode, encoding, errors); 3075 if (v == NULL) 3076 goto onError; 3077 return unicode_result(v); 3078 3079 onError: 3080 return NULL; 3081} 3082 3083PyObject * 3084PyUnicode_AsDecodedUnicode(PyObject *unicode, 3085 const char *encoding, 3086 const char *errors) 3087{ 3088 PyObject *v; 3089 3090 if (!PyUnicode_Check(unicode)) { 3091 PyErr_BadArgument(); 3092 goto onError; 3093 } 3094 3095 if (encoding == NULL) 3096 encoding = PyUnicode_GetDefaultEncoding(); 3097 3098 /* Decode via the codec registry */ 3099 v = PyCodec_Decode(unicode, encoding, errors); 3100 if (v == NULL) 3101 goto onError; 3102 if (!PyUnicode_Check(v)) { 3103 PyErr_Format(PyExc_TypeError, 3104 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3105 "use codecs.decode() to decode to arbitrary types", 3106 encoding, 3107 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3108 Py_DECREF(v); 3109 goto onError; 3110 } 3111 return unicode_result(v); 3112 3113 onError: 3114 return NULL; 3115} 
/* Find the first character of `wstr` that wcstombs() cannot encode.

   Each character is converted on its own (a surrogate pair is converted
   as one unit when wchar_t is 2 bytes wide).  Return the index of the
   first unit that fails, or 0 if every unit converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) followed by a null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *probe = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return probe - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
(strcmp(errors, "strict") == 0) { 3214 *surrogateescape = 0; 3215 return 0; 3216 } 3217 if (strcmp(errors, "surrogateescape") == 0) { 3218 *surrogateescape = 1; 3219 return 0; 3220 } 3221 PyErr_Format(PyExc_ValueError, 3222 "only 'strict' and 'surrogateescape' error handlers " 3223 "are supported, not '%s'", 3224 errors); 3225 return -1; 3226} 3227 3228PyObject * 3229PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3230{ 3231 Py_ssize_t wlen, wlen2; 3232 wchar_t *wstr; 3233 PyObject *bytes = NULL; 3234 char *errmsg; 3235 PyObject *reason = NULL; 3236 PyObject *exc; 3237 size_t error_pos; 3238 int surrogateescape; 3239 3240 if (locale_error_handler(errors, &surrogateescape) < 0) 3241 return NULL; 3242 3243 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3244 if (wstr == NULL) 3245 return NULL; 3246 3247 wlen2 = wcslen(wstr); 3248 if (wlen2 != wlen) { 3249 PyMem_Free(wstr); 3250 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3251 return NULL; 3252 } 3253 3254 if (surrogateescape) { 3255 /* "surrogateescape" error handler */ 3256 char *str; 3257 3258 str = Py_EncodeLocale(wstr, &error_pos); 3259 if (str == NULL) { 3260 if (error_pos == (size_t)-1) { 3261 PyErr_NoMemory(); 3262 PyMem_Free(wstr); 3263 return NULL; 3264 } 3265 else { 3266 goto encode_error; 3267 } 3268 } 3269 PyMem_Free(wstr); 3270 3271 bytes = PyBytes_FromString(str); 3272 PyMem_Free(str); 3273 } 3274 else { 3275 /* strict mode */ 3276 size_t len, len2; 3277 3278 len = wcstombs(NULL, wstr, 0); 3279 if (len == (size_t)-1) { 3280 error_pos = (size_t)-1; 3281 goto encode_error; 3282 } 3283 3284 bytes = PyBytes_FromStringAndSize(NULL, len); 3285 if (bytes == NULL) { 3286 PyMem_Free(wstr); 3287 return NULL; 3288 } 3289 3290 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3291 if (len2 == (size_t)-1 || len2 > len) { 3292 error_pos = (size_t)-1; 3293 goto encode_error; 3294 } 3295 PyMem_Free(wstr); 3296 } 3297 return bytes; 3298 3299encode_error: 3300 errmsg = 
strerror(errno); 3301 assert(errmsg != NULL); 3302 3303 if (error_pos == (size_t)-1) 3304 error_pos = wcstombs_errorpos(wstr); 3305 3306 PyMem_Free(wstr); 3307 Py_XDECREF(bytes); 3308 3309 if (errmsg != NULL) { 3310 size_t errlen; 3311 wstr = Py_DecodeLocale(errmsg, &errlen); 3312 if (wstr != NULL) { 3313 reason = PyUnicode_FromWideChar(wstr, errlen); 3314 PyMem_RawFree(wstr); 3315 } else 3316 errmsg = NULL; 3317 } 3318 if (errmsg == NULL) 3319 reason = PyUnicode_FromString( 3320 "wcstombs() encountered an unencodable " 3321 "wide character"); 3322 if (reason == NULL) 3323 return NULL; 3324 3325 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3326 "locale", unicode, 3327 (Py_ssize_t)error_pos, 3328 (Py_ssize_t)(error_pos+1), 3329 reason); 3330 Py_DECREF(reason); 3331 if (exc != NULL) { 3332 PyCodec_StrictErrors(exc); 3333 Py_XDECREF(exc); 3334 } 3335 return NULL; 3336} 3337 3338PyObject * 3339PyUnicode_EncodeFSDefault(PyObject *unicode) 3340{ 3341#ifdef HAVE_MBCS 3342 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 3343#elif defined(__APPLE__) 3344 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3345#else 3346 PyInterpreterState *interp = PyThreadState_GET()->interp; 3347 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3348 cannot use it to encode and decode filenames before it is loaded. Load 3349 the Python codec requires to encode at least its own filename. Use the C 3350 version of the locale codec until the codec registry is initialized and 3351 the Python codec is loaded. 3352 3353 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3354 cannot only rely on it: check also interp->fscodec_initialized for 3355 subinterpreters. 
*/ 3356 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3357 return PyUnicode_AsEncodedString(unicode, 3358 Py_FileSystemDefaultEncoding, 3359 "surrogateescape"); 3360 } 3361 else { 3362 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3363 } 3364#endif 3365} 3366 3367PyObject * 3368PyUnicode_AsEncodedString(PyObject *unicode, 3369 const char *encoding, 3370 const char *errors) 3371{ 3372 PyObject *v; 3373 char lower[11]; /* Enough for any encoding shortcut */ 3374 3375 if (!PyUnicode_Check(unicode)) { 3376 PyErr_BadArgument(); 3377 return NULL; 3378 } 3379 3380 /* Shortcuts for common default encodings */ 3381 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 3382 if ((strcmp(lower, "utf-8") == 0) || 3383 (strcmp(lower, "utf8") == 0)) 3384 { 3385 if (errors == NULL || strcmp(errors, "strict") == 0) 3386 return _PyUnicode_AsUTF8String(unicode, NULL); 3387 else 3388 return _PyUnicode_AsUTF8String(unicode, errors); 3389 } 3390 else if ((strcmp(lower, "latin-1") == 0) || 3391 (strcmp(lower, "latin1") == 0) || 3392 (strcmp(lower, "iso-8859-1") == 0) || 3393 (strcmp(lower, "iso8859-1") == 0)) 3394 return _PyUnicode_AsLatin1String(unicode, errors); 3395#ifdef HAVE_MBCS 3396 else if (strcmp(lower, "mbcs") == 0) 3397 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3398#endif 3399 else if (strcmp(lower, "ascii") == 0) 3400 return _PyUnicode_AsASCIIString(unicode, errors); 3401 } 3402 3403 /* Encode via the codec registry */ 3404 v = _PyCodec_EncodeText(unicode, encoding, errors); 3405 if (v == NULL) 3406 return NULL; 3407 3408 /* The normal path */ 3409 if (PyBytes_Check(v)) 3410 return v; 3411 3412 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3413 if (PyByteArray_Check(v)) { 3414 int error; 3415 PyObject *b; 3416 3417 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3418 "encoder %s returned bytearray instead of bytes; " 3419 "use codecs.encode() to encode to arbitrary types", 3420 encoding); 
/* Find the first byte sequence of `str` (`len` bytes) that mbrtowc()
   cannot decode.

   Return the byte offset of the first invalid or incomplete sequence, or
   0 if none is found.  Without HAVE_MBRTOWC the result is always 0. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t used = mbrtowc(&decoded, cursor, remaining, &state);

        /* Conversion error or incomplete character */
        if (used == (size_t)-1 || used == (size_t)-2)
            return cursor - str;
        /* Reached end of string */
        if (used == 0)
            break;
        cursor += used;
        remaining -= used;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3506PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3507 const char *errors) 3508{ 3509 wchar_t smallbuf[256]; 3510 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3511 wchar_t *wstr; 3512 size_t wlen, wlen2; 3513 PyObject *unicode; 3514 int surrogateescape; 3515 size_t error_pos; 3516 char *errmsg; 3517 PyObject *reason, *exc; 3518 3519 if (locale_error_handler(errors, &surrogateescape) < 0) 3520 return NULL; 3521 3522 if (str[len] != '\0' || (size_t)len != strlen(str)) { 3523 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3524 return NULL; 3525 } 3526 3527 if (surrogateescape) { 3528 /* "surrogateescape" error handler */ 3529 wstr = Py_DecodeLocale(str, &wlen); 3530 if (wstr == NULL) { 3531 if (wlen == (size_t)-1) 3532 PyErr_NoMemory(); 3533 else 3534 PyErr_SetFromErrno(PyExc_OSError); 3535 return NULL; 3536 } 3537 3538 unicode = PyUnicode_FromWideChar(wstr, wlen); 3539 PyMem_RawFree(wstr); 3540 } 3541 else { 3542 /* strict mode */ 3543#ifndef HAVE_BROKEN_MBSTOWCS 3544 wlen = mbstowcs(NULL, str, 0); 3545#else 3546 wlen = len; 3547#endif 3548 if (wlen == (size_t)-1) 3549 goto decode_error; 3550 if (wlen+1 <= smallbuf_len) { 3551 wstr = smallbuf; 3552 } 3553 else { 3554 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) 3555 return PyErr_NoMemory(); 3556 3557 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); 3558 if (!wstr) 3559 return PyErr_NoMemory(); 3560 } 3561 3562 wlen2 = mbstowcs(wstr, str, wlen+1); 3563 if (wlen2 == (size_t)-1) { 3564 if (wstr != smallbuf) 3565 PyMem_Free(wstr); 3566 goto decode_error; 3567 } 3568#ifdef HAVE_BROKEN_MBSTOWCS 3569 assert(wlen2 == wlen); 3570#endif 3571 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3572 if (wstr != smallbuf) 3573 PyMem_Free(wstr); 3574 } 3575 return unicode; 3576 3577decode_error: 3578 errmsg = strerror(errno); 3579 assert(errmsg != NULL); 3580 3581 error_pos = mbstowcs_errorpos(str, len); 3582 if (errmsg != NULL) { 3583 size_t errlen; 3584 wstr = Py_DecodeLocale(errmsg, &errlen); 3585 
if (wstr != NULL) { 3586 reason = PyUnicode_FromWideChar(wstr, errlen); 3587 PyMem_RawFree(wstr); 3588 } else 3589 errmsg = NULL; 3590 } 3591 if (errmsg == NULL) 3592 reason = PyUnicode_FromString( 3593 "mbstowcs() encountered an invalid multibyte sequence"); 3594 if (reason == NULL) 3595 return NULL; 3596 3597 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3598 "locale", str, len, 3599 (Py_ssize_t)error_pos, 3600 (Py_ssize_t)(error_pos+1), 3601 reason); 3602 Py_DECREF(reason); 3603 if (exc != NULL) { 3604 PyCodec_StrictErrors(exc); 3605 Py_XDECREF(exc); 3606 } 3607 return NULL; 3608} 3609 3610PyObject* 3611PyUnicode_DecodeLocale(const char *str, const char *errors) 3612{ 3613 Py_ssize_t size = (Py_ssize_t)strlen(str); 3614 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3615} 3616 3617 3618PyObject* 3619PyUnicode_DecodeFSDefault(const char *s) { 3620 Py_ssize_t size = (Py_ssize_t)strlen(s); 3621 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3622} 3623 3624PyObject* 3625PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3626{ 3627#ifdef HAVE_MBCS 3628 return PyUnicode_DecodeMBCS(s, size, NULL); 3629#elif defined(__APPLE__) 3630 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3631#else 3632 PyInterpreterState *interp = PyThreadState_GET()->interp; 3633 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3634 cannot use it to encode and decode filenames before it is loaded. Load 3635 the Python codec requires to encode at least its own filename. Use the C 3636 version of the locale codec until the codec registry is initialized and 3637 the Python codec is loaded. 3638 3639 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3640 cannot only rely on it: check also interp->fscodec_initialized for 3641 subinterpreters. 
*/ 3642 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3643 return PyUnicode_Decode(s, size, 3644 Py_FileSystemDefaultEncoding, 3645 "surrogateescape"); 3646 } 3647 else { 3648 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3649 } 3650#endif 3651} 3652 3653 3654int 3655_PyUnicode_HasNULChars(PyObject* str) 3656{ 3657 Py_ssize_t pos; 3658 3659 if (PyUnicode_READY(str) == -1) 3660 return -1; 3661 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str), 3662 PyUnicode_GET_LENGTH(str), '\0', 1); 3663 if (pos == -1) 3664 return 0; 3665 else 3666 return 1; 3667} 3668 3669int 3670PyUnicode_FSConverter(PyObject* arg, void* addr) 3671{ 3672 PyObject *output = NULL; 3673 Py_ssize_t size; 3674 void *data; 3675 if (arg == NULL) { 3676 Py_DECREF(*(PyObject**)addr); 3677 return 1; 3678 } 3679 if (PyBytes_Check(arg)) { 3680 output = arg; 3681 Py_INCREF(output); 3682 } 3683 else { 3684 arg = PyUnicode_FromObject(arg); 3685 if (!arg) 3686 return 0; 3687 output = PyUnicode_EncodeFSDefault(arg); 3688 Py_DECREF(arg); 3689 if (!output) 3690 return 0; 3691 if (!PyBytes_Check(output)) { 3692 Py_DECREF(output); 3693 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3694 return 0; 3695 } 3696 } 3697 size = PyBytes_GET_SIZE(output); 3698 data = PyBytes_AS_STRING(output); 3699 if ((size_t)size != strlen(data)) { 3700 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3701 Py_DECREF(output); 3702 return 0; 3703 } 3704 *(PyObject**)addr = output; 3705 return Py_CLEANUP_SUPPORTED; 3706} 3707 3708 3709int 3710PyUnicode_FSDecoder(PyObject* arg, void* addr) 3711{ 3712 PyObject *output = NULL; 3713 if (arg == NULL) { 3714 Py_DECREF(*(PyObject**)addr); 3715 return 1; 3716 } 3717 if (PyUnicode_Check(arg)) { 3718 if (PyUnicode_READY(arg) == -1) 3719 return 0; 3720 output = arg; 3721 Py_INCREF(output); 3722 } 3723 else { 3724 arg = PyBytes_FromObject(arg); 3725 if (!arg) 3726 return 0; 3727 output = 
PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3728 PyBytes_GET_SIZE(arg)); 3729 Py_DECREF(arg); 3730 if (!output) 3731 return 0; 3732 if (!PyUnicode_Check(output)) { 3733 Py_DECREF(output); 3734 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3735 return 0; 3736 } 3737 } 3738 if (PyUnicode_READY(output) == -1) { 3739 Py_DECREF(output); 3740 return 0; 3741 } 3742 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3743 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3744 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3745 Py_DECREF(output); 3746 return 0; 3747 } 3748 *(PyObject**)addr = output; 3749 return Py_CLEANUP_SUPPORTED; 3750} 3751 3752 3753char* 3754PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3755{ 3756 PyObject *bytes; 3757 3758 if (!PyUnicode_Check(unicode)) { 3759 PyErr_BadArgument(); 3760 return NULL; 3761 } 3762 if (PyUnicode_READY(unicode) == -1) 3763 return NULL; 3764 3765 if (PyUnicode_UTF8(unicode) == NULL) { 3766 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3767 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3768 if (bytes == NULL) 3769 return NULL; 3770 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3771 if (_PyUnicode_UTF8(unicode) == NULL) { 3772 PyErr_NoMemory(); 3773 Py_DECREF(bytes); 3774 return NULL; 3775 } 3776 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3777 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3778 PyBytes_AS_STRING(bytes), 3779 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3780 Py_DECREF(bytes); 3781 } 3782 3783 if (psize) 3784 *psize = PyUnicode_UTF8_LENGTH(unicode); 3785 return PyUnicode_UTF8(unicode); 3786} 3787 3788char* 3789PyUnicode_AsUTF8(PyObject *unicode) 3790{ 3791 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3792} 3793 3794Py_UNICODE * 3795PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3796{ 3797 const unsigned char *one_byte; 3798#if SIZEOF_WCHAR_T == 4 3799 const Py_UCS2 *two_bytes; 3800#else 3801 
const Py_UCS4 *four_bytes; 3802 const Py_UCS4 *ucs4_end; 3803 Py_ssize_t num_surrogates; 3804#endif 3805 wchar_t *w; 3806 wchar_t *wchar_end; 3807 3808 if (!PyUnicode_Check(unicode)) { 3809 PyErr_BadArgument(); 3810 return NULL; 3811 } 3812 if (_PyUnicode_WSTR(unicode) == NULL) { 3813 /* Non-ASCII compact unicode object */ 3814 assert(_PyUnicode_KIND(unicode) != 0); 3815 assert(PyUnicode_IS_READY(unicode)); 3816 3817 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3818#if SIZEOF_WCHAR_T == 2 3819 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3820 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3821 num_surrogates = 0; 3822 3823 for (; four_bytes < ucs4_end; ++four_bytes) { 3824 if (*four_bytes > 0xFFFF) 3825 ++num_surrogates; 3826 } 3827 3828 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3829 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3830 if (!_PyUnicode_WSTR(unicode)) { 3831 PyErr_NoMemory(); 3832 return NULL; 3833 } 3834 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3835 3836 w = _PyUnicode_WSTR(unicode); 3837 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3838 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3839 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3840 if (*four_bytes > 0xFFFF) { 3841 assert(*four_bytes <= MAX_UNICODE); 3842 /* encode surrogate pair in this case */ 3843 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3844 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3845 } 3846 else 3847 *w = *four_bytes; 3848 3849 if (w > wchar_end) { 3850 assert(0 && "Miscalculated string end"); 3851 } 3852 } 3853 *w = 0; 3854#else 3855 /* sizeof(wchar_t) == 4 */ 3856 Py_FatalError("Impossible unicode object state, wstr and str " 3857 "should share memory already."); 3858 return NULL; 3859#endif 3860 } 3861 else { 3862 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3863 (_PyUnicode_LENGTH(unicode) + 1)); 3864 if (!_PyUnicode_WSTR(unicode)) { 3865 
PyErr_NoMemory(); 3866 return NULL; 3867 } 3868 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3869 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3870 w = _PyUnicode_WSTR(unicode); 3871 wchar_end = w + _PyUnicode_LENGTH(unicode); 3872 3873 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3874 one_byte = PyUnicode_1BYTE_DATA(unicode); 3875 for (; w < wchar_end; ++one_byte, ++w) 3876 *w = *one_byte; 3877 /* null-terminate the wstr */ 3878 *w = 0; 3879 } 3880 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3881#if SIZEOF_WCHAR_T == 4 3882 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3883 for (; w < wchar_end; ++two_bytes, ++w) 3884 *w = *two_bytes; 3885 /* null-terminate the wstr */ 3886 *w = 0; 3887#else 3888 /* sizeof(wchar_t) == 2 */ 3889 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3890 _PyUnicode_WSTR(unicode) = NULL; 3891 Py_FatalError("Impossible unicode object state, wstr " 3892 "and str should share memory already."); 3893 return NULL; 3894#endif 3895 } 3896 else { 3897 assert(0 && "This should never happen."); 3898 } 3899 } 3900 } 3901 if (size != NULL) 3902 *size = PyUnicode_WSTR_LENGTH(unicode); 3903 return _PyUnicode_WSTR(unicode); 3904} 3905 3906Py_UNICODE * 3907PyUnicode_AsUnicode(PyObject *unicode) 3908{ 3909 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3910} 3911 3912 3913Py_ssize_t 3914PyUnicode_GetSize(PyObject *unicode) 3915{ 3916 if (!PyUnicode_Check(unicode)) { 3917 PyErr_BadArgument(); 3918 goto onError; 3919 } 3920 return PyUnicode_GET_SIZE(unicode); 3921 3922 onError: 3923 return -1; 3924} 3925 3926Py_ssize_t 3927PyUnicode_GetLength(PyObject *unicode) 3928{ 3929 if (!PyUnicode_Check(unicode)) { 3930 PyErr_BadArgument(); 3931 return -1; 3932 } 3933 if (PyUnicode_READY(unicode) == -1) 3934 return -1; 3935 return PyUnicode_GET_LENGTH(unicode); 3936} 3937 3938Py_UCS4 3939PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3940{ 3941 void *data; 3942 int kind; 3943 3944 if (!PyUnicode_Check(unicode) || 
PyUnicode_READY(unicode) == -1) { 3945 PyErr_BadArgument(); 3946 return (Py_UCS4)-1; 3947 } 3948 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3949 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3950 return (Py_UCS4)-1; 3951 } 3952 data = PyUnicode_DATA(unicode); 3953 kind = PyUnicode_KIND(unicode); 3954 return PyUnicode_READ(kind, data, index); 3955} 3956 3957int 3958PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3959{ 3960 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3961 PyErr_BadArgument(); 3962 return -1; 3963 } 3964 assert(PyUnicode_IS_READY(unicode)); 3965 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3966 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3967 return -1; 3968 } 3969 if (unicode_check_modifiable(unicode)) 3970 return -1; 3971 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 3972 PyErr_SetString(PyExc_ValueError, "character out of range"); 3973 return -1; 3974 } 3975 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3976 index, ch); 3977 return 0; 3978} 3979 3980const char * 3981PyUnicode_GetDefaultEncoding(void) 3982{ 3983 return "utf-8"; 3984} 3985 3986/* create or adjust a UnicodeDecodeError */ 3987static void 3988make_decode_exception(PyObject **exceptionObject, 3989 const char *encoding, 3990 const char *input, Py_ssize_t length, 3991 Py_ssize_t startpos, Py_ssize_t endpos, 3992 const char *reason) 3993{ 3994 if (*exceptionObject == NULL) { 3995 *exceptionObject = PyUnicodeDecodeError_Create( 3996 encoding, input, length, startpos, endpos, reason); 3997 } 3998 else { 3999 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 4000 goto onError; 4001 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 4002 goto onError; 4003 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4004 goto onError; 4005 } 4006 return; 4007 4008onError: 4009 Py_CLEAR(*exceptionObject); 4010} 4011 4012#ifdef HAVE_MBCS 4013/* 
error handling callback helper: 4014 build arguments, call the callback and check the arguments, 4015 if no exception occurred, copy the replacement to the output 4016 and adjust various state variables. 4017 return 0 on success, -1 on error 4018*/ 4019 4020static int 4021unicode_decode_call_errorhandler_wchar( 4022 const char *errors, PyObject **errorHandler, 4023 const char *encoding, const char *reason, 4024 const char **input, const char **inend, Py_ssize_t *startinpos, 4025 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4026 PyObject **output, Py_ssize_t *outpos) 4027{ 4028 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4029 4030 PyObject *restuple = NULL; 4031 PyObject *repunicode = NULL; 4032 Py_ssize_t outsize; 4033 Py_ssize_t insize; 4034 Py_ssize_t requiredsize; 4035 Py_ssize_t newpos; 4036 PyObject *inputobj = NULL; 4037 wchar_t *repwstr; 4038 Py_ssize_t repwlen; 4039 4040 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND); 4041 outsize = _PyUnicode_WSTR_LENGTH(*output); 4042 4043 if (*errorHandler == NULL) { 4044 *errorHandler = PyCodec_LookupError(errors); 4045 if (*errorHandler == NULL) 4046 goto onError; 4047 } 4048 4049 make_decode_exception(exceptionObject, 4050 encoding, 4051 *input, *inend - *input, 4052 *startinpos, *endinpos, 4053 reason); 4054 if (*exceptionObject == NULL) 4055 goto onError; 4056 4057 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4058 if (restuple == NULL) 4059 goto onError; 4060 if (!PyTuple_Check(restuple)) { 4061 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4062 goto onError; 4063 } 4064 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4065 goto onError; 4066 4067 /* Copy back the bytes variables, which might have been modified by the 4068 callback */ 4069 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4070 if (!inputobj) 4071 goto onError; 4072 if (!PyBytes_Check(inputobj)) { 
4073 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4074 } 4075 *input = PyBytes_AS_STRING(inputobj); 4076 insize = PyBytes_GET_SIZE(inputobj); 4077 *inend = *input + insize; 4078 /* we can DECREF safely, as the exception has another reference, 4079 so the object won't go away. */ 4080 Py_DECREF(inputobj); 4081 4082 if (newpos<0) 4083 newpos = insize+newpos; 4084 if (newpos<0 || newpos>insize) { 4085 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4086 goto onError; 4087 } 4088 4089 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4090 if (repwstr == NULL) 4091 goto onError; 4092 /* need more space? (at least enough for what we 4093 have+the replacement+the rest of the string (starting 4094 at the new input position), so we won't have to check space 4095 when there are no errors in the rest of the string) */ 4096 requiredsize = *outpos; 4097 if (requiredsize > PY_SSIZE_T_MAX - repwlen) 4098 goto overflow; 4099 requiredsize += repwlen; 4100 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 4101 goto overflow; 4102 requiredsize += insize - newpos; 4103 if (requiredsize > outsize) { 4104 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 4105 requiredsize = 2*outsize; 4106 if (unicode_resize(output, requiredsize) < 0) 4107 goto onError; 4108 } 4109 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4110 *outpos += repwlen; 4111 *endinpos = newpos; 4112 *inptr = *input + newpos; 4113 4114 /* we made it! 
*/ 4115 Py_XDECREF(restuple); 4116 return 0; 4117 4118 overflow: 4119 PyErr_SetString(PyExc_OverflowError, 4120 "decoded result is too long for a Python string"); 4121 4122 onError: 4123 Py_XDECREF(restuple); 4124 return -1; 4125} 4126#endif /* HAVE_MBCS */ 4127 4128static int 4129unicode_decode_call_errorhandler_writer( 4130 const char *errors, PyObject **errorHandler, 4131 const char *encoding, const char *reason, 4132 const char **input, const char **inend, Py_ssize_t *startinpos, 4133 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4134 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4135{ 4136 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4137 4138 PyObject *restuple = NULL; 4139 PyObject *repunicode = NULL; 4140 Py_ssize_t insize; 4141 Py_ssize_t newpos; 4142 Py_ssize_t replen; 4143 PyObject *inputobj = NULL; 4144 4145 if (*errorHandler == NULL) { 4146 *errorHandler = PyCodec_LookupError(errors); 4147 if (*errorHandler == NULL) 4148 goto onError; 4149 } 4150 4151 make_decode_exception(exceptionObject, 4152 encoding, 4153 *input, *inend - *input, 4154 *startinpos, *endinpos, 4155 reason); 4156 if (*exceptionObject == NULL) 4157 goto onError; 4158 4159 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4160 if (restuple == NULL) 4161 goto onError; 4162 if (!PyTuple_Check(restuple)) { 4163 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4164 goto onError; 4165 } 4166 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4167 goto onError; 4168 4169 /* Copy back the bytes variables, which might have been modified by the 4170 callback */ 4171 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4172 if (!inputobj) 4173 goto onError; 4174 if (!PyBytes_Check(inputobj)) { 4175 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4176 } 4177 *input = PyBytes_AS_STRING(inputobj); 4178 insize = 
PyBytes_GET_SIZE(inputobj); 4179 *inend = *input + insize; 4180 /* we can DECREF safely, as the exception has another reference, 4181 so the object won't go away. */ 4182 Py_DECREF(inputobj); 4183 4184 if (newpos<0) 4185 newpos = insize+newpos; 4186 if (newpos<0 || newpos>insize) { 4187 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4188 goto onError; 4189 } 4190 4191 if (PyUnicode_READY(repunicode) < 0) 4192 goto onError; 4193 replen = PyUnicode_GET_LENGTH(repunicode); 4194 writer->min_length += replen; 4195 if (replen > 1) 4196 writer->overallocate = 1; 4197 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4198 goto onError; 4199 4200 *endinpos = newpos; 4201 *inptr = *input + newpos; 4202 4203 /* we made it! */ 4204 Py_XDECREF(restuple); 4205 return 0; 4206 4207 onError: 4208 Py_XDECREF(restuple); 4209 return -1; 4210} 4211 4212/* --- UTF-7 Codec -------------------------------------------------------- */ 4213 4214/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4215 4216/* Three simple macros defining base-64. */ 4217 4218/* Is c a base-64 character? */ 4219 4220#define IS_BASE64(c) \ 4221 (((c) >= 'A' && (c) <= 'Z') || \ 4222 ((c) >= 'a' && (c) <= 'z') || \ 4223 ((c) >= '0' && (c) <= '9') || \ 4224 (c) == '+' || (c) == '/') 4225 4226/* given that c is a base-64 character, what is its base-64 value? */ 4227 4228#define FROM_BASE64(c) \ 4229 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4230 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4231 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4232 (c) == '+' ? 62 : 63) 4233 4234/* What is the base-64 character of the bottom 6 bits of n? */ 4235 4236#define TO_BASE64(n) \ 4237 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4238 4239/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4240 * decoded as itself. 
We are permissive on decoding; the only ASCII 4241 * byte not decoding to itself is the + which begins a base64 4242 * string. */ 4243 4244#define DECODE_DIRECT(c) \ 4245 ((c) <= 127 && (c) != '+') 4246 4247/* The UTF-7 encoder treats ASCII characters differently according to 4248 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4249 * the above). See RFC2152. This array identifies these different 4250 * sets: 4251 * 0 : "Set D" 4252 * alphanumeric and '(),-./:? 4253 * 1 : "Set O" 4254 * !"#$%&*;<=>@[]^_`{|} 4255 * 2 : "whitespace" 4256 * ht nl cr sp 4257 * 3 : special (must be base64 encoded) 4258 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 4259 */ 4260 4261static 4262char utf7_category[128] = { 4263/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 4264 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4265/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 4266 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4267/* sp ! " # $ % & ' ( ) * + , - . / */ 4268 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 4269/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 4270 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4271/* @ A B C D E F G H I J K L M N O */ 4272 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4273/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4274 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4275/* ` a b c d e f g h i j k l m n o */ 4276 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4277/* p q r s t u v w x y z { | } ~ del */ 4278 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4279}; 4280 4281/* ENCODE_DIRECT: this character should be encoded as itself. The 4282 * answer depends on whether we are encoding set O as itself, and also 4283 * on whether we are encoding whitespace as itself. RFC2152 makes it 4284 * clear that the answers to these questions vary between 4285 * applications, so this code needs to be flexible. 
*/

/* True when c is a non-NUL ASCII character that may be written as itself:
   category 0 always, category 2 when directWS is true, category 1 when
   directO is true (see utf7_category). */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* Stateless UTF-7 decoding: PyUnicode_DecodeUTF7Stateful() with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate). */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;              /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;     /* writer.pos when the section started */
    unsigned int base64bits = 0;  /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;        /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: written as is */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* the error handler may replace the input buffer and move s */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler asked to resume before the end of the input */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* report only the input up to the '+' that opened the
               unfinished shift sequence */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* build the result from the first shiftOutStart characters
                   instead of finishing the writer */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}


/* Encode the str object `str` to UTF-7 and return a new bytes object.
   A non-zero base64SetO / base64WhiteSpace requests base-64 encoding of
   the "Set O" / whitespace characters instead of writing them directly.
   `errors` is not used by this function.  Returns NULL with an exception
   set on error. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;
    Py_ssize_t i;
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    char * out;
    char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* flush the pending bits and close an open shift sequence */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    /* shrink the bytes object to the number of bytes written */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
/* Py_UNICODE variant: build a temporary str object from the buffer and
   encode it with _PyUnicode_EncodeUTF7(). */
PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                     Py_ssize_t size,
                     int base64SetO,
                     int base64WhiteSpace,
                     const char *errors)
{
    PyObject *result;
    PyObject *tmp = PyUnicode_FromUnicode(s, size);
    if (tmp == NULL)
        return NULL;
    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
                                   base64WhiteSpace, errors);
    Py_DECREF(tmp);
    return result;
}

#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT

/* --- UTF-8 Codec -------------------------------------------------------- */

/* Stateless UTF-8 decoding: PyUnicode_DecodeUTF8Stateful() with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}

/* stringlib/codecs.h is included once per character width; the
   asciilib_ / ucs1lib_ / ucs2lib_ / ucs4lib_ functions used below come
   from these inclusions */
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

/* Copy the leading run of ASCII bytes (high bit clear) of [start, end)
   to dest and return the number of bytes copied. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* test and copy one 'long' at a time */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* remaining bytes, one at a time */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* generic path: find the end of the ASCII run, then copy it at once */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}

/* Decode `size` bytes of UTF-8 from `s` and return a new str object, or
   NULL with an exception set.  Decoding errors are passed to the `errors`
   handler.  When `consumed` is not NULL, an incomplete sequence at the end
   of the input is not an error: decoding stops there and *consumed
   receives the number of bytes decoded. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* copy the leading ASCII run directly into the writer buffer */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;
        /* call the decoder matching the current width of the buffer */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* values 0-4 select one of the cases below; any other value is
           written to the writer as a character */
        switch (ch) {
        case 0:
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}

#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            assert(Py_UNICODE_IS_SURROGATE(ch));
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */

/* Primary internal function which creates utf8 encoded bytes objects.

   Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space needed at the end.
Else allocate the 4883 maximum possible needed (4 result bytes per Unicode character), and return 4884 the excess memory at the end. 4885*/ 4886PyObject * 4887_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4888{ 4889 enum PyUnicode_Kind kind; 4890 void *data; 4891 Py_ssize_t size; 4892 4893 if (!PyUnicode_Check(unicode)) { 4894 PyErr_BadArgument(); 4895 return NULL; 4896 } 4897 4898 if (PyUnicode_READY(unicode) == -1) 4899 return NULL; 4900 4901 if (PyUnicode_UTF8(unicode)) 4902 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4903 PyUnicode_UTF8_LENGTH(unicode)); 4904 4905 kind = PyUnicode_KIND(unicode); 4906 data = PyUnicode_DATA(unicode); 4907 size = PyUnicode_GET_LENGTH(unicode); 4908 4909 switch (kind) { 4910 default: 4911 assert(0); 4912 case PyUnicode_1BYTE_KIND: 4913 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4914 assert(!PyUnicode_IS_ASCII(unicode)); 4915 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4916 case PyUnicode_2BYTE_KIND: 4917 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4918 case PyUnicode_4BYTE_KIND: 4919 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4920 } 4921} 4922 4923PyObject * 4924PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4925 Py_ssize_t size, 4926 const char *errors) 4927{ 4928 PyObject *v, *unicode; 4929 4930 unicode = PyUnicode_FromUnicode(s, size); 4931 if (unicode == NULL) 4932 return NULL; 4933 v = _PyUnicode_AsUTF8String(unicode, errors); 4934 Py_DECREF(unicode); 4935 return v; 4936} 4937 4938PyObject * 4939PyUnicode_AsUTF8String(PyObject *unicode) 4940{ 4941 return _PyUnicode_AsUTF8String(unicode, NULL); 4942} 4943 4944/* --- UTF-32 Codec ------------------------------------------------------- */ 4945 4946PyObject * 4947PyUnicode_DecodeUTF32(const char *s, 4948 Py_ssize_t size, 4949 const char *errors, 4950 int *byteorder) 4951{ 4952 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4953} 4954 4955PyObject * 
4956PyUnicode_DecodeUTF32Stateful(const char *s, 4957 Py_ssize_t size, 4958 const char *errors, 4959 int *byteorder, 4960 Py_ssize_t *consumed) 4961{ 4962 const char *starts = s; 4963 Py_ssize_t startinpos; 4964 Py_ssize_t endinpos; 4965 _PyUnicodeWriter writer; 4966 const unsigned char *q, *e; 4967 int le, bo = 0; /* assume native ordering by default */ 4968 const char *encoding; 4969 const char *errmsg = ""; 4970 PyObject *errorHandler = NULL; 4971 PyObject *exc = NULL; 4972 4973 q = (unsigned char *)s; 4974 e = q + size; 4975 4976 if (byteorder) 4977 bo = *byteorder; 4978 4979 /* Check for BOM marks (U+FEFF) in the input and adjust current 4980 byte order setting accordingly. In native mode, the leading BOM 4981 mark is skipped, in all other modes, it is copied to the output 4982 stream as-is (giving a ZWNBSP character). */ 4983 if (bo == 0 && size >= 4) { 4984 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 4985 if (bom == 0x0000FEFF) { 4986 bo = -1; 4987 q += 4; 4988 } 4989 else if (bom == 0xFFFE0000) { 4990 bo = 1; 4991 q += 4; 4992 } 4993 if (byteorder) 4994 *byteorder = bo; 4995 } 4996 4997 if (q == e) { 4998 if (consumed) 4999 *consumed = size; 5000 _Py_RETURN_UNICODE_EMPTY(); 5001 } 5002 5003#ifdef WORDS_BIGENDIAN 5004 le = bo < 0; 5005#else 5006 le = bo <= 0; 5007#endif 5008 encoding = le ? 
"utf-32-le" : "utf-32-be"; 5009 5010 _PyUnicodeWriter_Init(&writer); 5011 writer.min_length = (e - q + 3) / 4; 5012 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5013 goto onError; 5014 5015 while (1) { 5016 Py_UCS4 ch = 0; 5017 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 5018 5019 if (e - q >= 4) { 5020 enum PyUnicode_Kind kind = writer.kind; 5021 void *data = writer.data; 5022 const unsigned char *last = e - 4; 5023 Py_ssize_t pos = writer.pos; 5024 if (le) { 5025 do { 5026 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5027 if (ch > maxch) 5028 break; 5029 if (kind != PyUnicode_1BYTE_KIND && 5030 Py_UNICODE_IS_SURROGATE(ch)) 5031 break; 5032 PyUnicode_WRITE(kind, data, pos++, ch); 5033 q += 4; 5034 } while (q <= last); 5035 } 5036 else { 5037 do { 5038 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 5039 if (ch > maxch) 5040 break; 5041 if (kind != PyUnicode_1BYTE_KIND && 5042 Py_UNICODE_IS_SURROGATE(ch)) 5043 break; 5044 PyUnicode_WRITE(kind, data, pos++, ch); 5045 q += 4; 5046 } while (q <= last); 5047 } 5048 writer.pos = pos; 5049 } 5050 5051 if (Py_UNICODE_IS_SURROGATE(ch)) { 5052 errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)"; 5053 startinpos = ((const char *)q) - starts; 5054 endinpos = startinpos + 4; 5055 } 5056 else if (ch <= maxch) { 5057 if (q == e || consumed) 5058 break; 5059 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5060 errmsg = "truncated data"; 5061 startinpos = ((const char *)q) - starts; 5062 endinpos = ((const char *)e) - starts; 5063 } 5064 else { 5065 if (ch < 0x110000) { 5066 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5067 goto onError; 5068 q += 4; 5069 continue; 5070 } 5071 errmsg = "codepoint not in range(0x110000)"; 5072 startinpos = ((const char *)q) - starts; 5073 endinpos = startinpos + 4; 5074 } 5075 5076 /* The remaining input chars are ignored if the callback 5077 chooses to skip the input */ 5078 if (unicode_decode_call_errorhandler_writer( 5079 errors, &errorHandler, 5080 encoding, errmsg, 5081 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5082 &writer)) 5083 goto onError; 5084 } 5085 5086 if (consumed) 5087 *consumed = (const char *)q-starts; 5088 5089 Py_XDECREF(errorHandler); 5090 Py_XDECREF(exc); 5091 return _PyUnicodeWriter_Finish(&writer); 5092 5093 onError: 5094 _PyUnicodeWriter_Dealloc(&writer); 5095 Py_XDECREF(errorHandler); 5096 Py_XDECREF(exc); 5097 return NULL; 5098} 5099 5100PyObject * 5101_PyUnicode_EncodeUTF32(PyObject *str, 5102 const char *errors, 5103 int byteorder) 5104{ 5105 int kind; 5106 void *data; 5107 Py_ssize_t len; 5108 PyObject *v; 5109 unsigned char *p; 5110 Py_ssize_t nsize, i; 5111 /* Offsets from p for storing byte pairs in the right order. 
*/ 5112#if PY_LITTLE_ENDIAN 5113 int iorder[] = {0, 1, 2, 3}; 5114#else 5115 int iorder[] = {3, 2, 1, 0}; 5116#endif 5117 const char *encoding; 5118 PyObject *errorHandler = NULL; 5119 PyObject *exc = NULL; 5120 PyObject *rep = NULL; 5121 5122#define STORECHAR(CH) \ 5123 do { \ 5124 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5125 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5126 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5127 p[iorder[0]] = (CH) & 0xff; \ 5128 p += 4; \ 5129 } while(0) 5130 5131 if (!PyUnicode_Check(str)) { 5132 PyErr_BadArgument(); 5133 return NULL; 5134 } 5135 if (PyUnicode_READY(str) == -1) 5136 return NULL; 5137 kind = PyUnicode_KIND(str); 5138 data = PyUnicode_DATA(str); 5139 len = PyUnicode_GET_LENGTH(str); 5140 5141 nsize = len + (byteorder == 0); 5142 if (nsize > PY_SSIZE_T_MAX / 4) 5143 return PyErr_NoMemory(); 5144 v = PyBytes_FromStringAndSize(NULL, nsize * 4); 5145 if (v == NULL) 5146 return NULL; 5147 5148 p = (unsigned char *)PyBytes_AS_STRING(v); 5149 if (byteorder == 0) 5150 STORECHAR(0xFEFF); 5151 if (len == 0) 5152 return v; 5153 5154 if (byteorder == -1) { 5155 /* force LE */ 5156 iorder[0] = 0; 5157 iorder[1] = 1; 5158 iorder[2] = 2; 5159 iorder[3] = 3; 5160 encoding = "utf-32-le"; 5161 } 5162 else if (byteorder == 1) { 5163 /* force BE */ 5164 iorder[0] = 3; 5165 iorder[1] = 2; 5166 iorder[2] = 1; 5167 iorder[3] = 0; 5168 encoding = "utf-32-be"; 5169 } 5170 else 5171 encoding = "utf-32"; 5172 5173 if (kind == PyUnicode_1BYTE_KIND) { 5174 for (i = 0; i < len; i++) 5175 STORECHAR(PyUnicode_READ(kind, data, i)); 5176 return v; 5177 } 5178 5179 for (i = 0; i < len;) { 5180 Py_ssize_t repsize, moreunits; 5181 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5182 i++; 5183 assert(ch <= MAX_UNICODE); 5184 if (!Py_UNICODE_IS_SURROGATE(ch)) { 5185 STORECHAR(ch); 5186 continue; 5187 } 5188 5189 rep = unicode_encode_call_errorhandler( 5190 errors, &errorHandler, 5191 encoding, "surrogates not allowed", 5192 str, &exc, i-1, i, &i); 5193 5194 if (!rep) 5195 goto 
error; 5196 5197 if (PyBytes_Check(rep)) { 5198 repsize = PyBytes_GET_SIZE(rep); 5199 if (repsize & 3) { 5200 raise_encode_exception(&exc, encoding, 5201 str, i - 1, i, 5202 "surrogates not allowed"); 5203 goto error; 5204 } 5205 moreunits = repsize / 4; 5206 } 5207 else { 5208 assert(PyUnicode_Check(rep)); 5209 if (PyUnicode_READY(rep) < 0) 5210 goto error; 5211 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5212 if (!PyUnicode_IS_ASCII(rep)) { 5213 raise_encode_exception(&exc, encoding, 5214 str, i - 1, i, 5215 "surrogates not allowed"); 5216 goto error; 5217 } 5218 } 5219 5220 /* four bytes are reserved for each surrogate */ 5221 if (moreunits > 1) { 5222 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v); 5223 Py_ssize_t morebytes = 4 * (moreunits - 1); 5224 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5225 /* integer overflow */ 5226 PyErr_NoMemory(); 5227 goto error; 5228 } 5229 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5230 goto error; 5231 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos; 5232 } 5233 5234 if (PyBytes_Check(rep)) { 5235 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize); 5236 p += repsize; 5237 } else /* rep is unicode */ { 5238 const Py_UCS1 *repdata; 5239 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5240 repdata = PyUnicode_1BYTE_DATA(rep); 5241 while (repsize--) { 5242 Py_UCS4 ch = *repdata++; 5243 STORECHAR(ch); 5244 } 5245 } 5246 5247 Py_CLEAR(rep); 5248 } 5249 5250 /* Cut back to size actually needed. This is necessary for, for example, 5251 encoding of a string containing isolated surrogates and the 'ignore' 5252 handler is used. 
*/
    nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
#undef STORECHAR
}

/* Encode the `size` Py_UNICODE units at `s` as UTF-32 by building a
   temporary str object and passing it, with `errors` and `byteorder`,
   to _PyUnicode_EncodeUTF32().  Returns NULL if the temporary object
   cannot be created. */
PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE *s,
                      Py_ssize_t size,
                      const char *errors,
                      int byteorder)
{
    PyObject *result;
    PyObject *tmp = PyUnicode_FromUnicode(s, size);
    if (tmp == NULL)
        return NULL;
    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
    Py_DECREF(tmp);
    return result;
}

/* Same as _PyUnicode_EncodeUTF32() with `errors` NULL and `byteorder` 0. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
}

/* --- UTF-16 Codec ------------------------------------------------------- */

/* Same as PyUnicode_DecodeUTF16Stateful() with `consumed` set to NULL. */
PyObject *
PyUnicode_DecodeUTF16(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
}

/* Decode `size` bytes of UTF-16 data at `s` into a new str object.

   byteorder: may be NULL.  Otherwise *byteorder is the initial byte order;
       a value of 0 enables BOM detection, and the detected order (-1 for
       a little-endian BOM, 1 for a big-endian BOM, 0 if none was found)
       is written back.
   consumed: may be NULL.  If set, incomplete trailing data is not an
       error; decoding stops there and *consumed receives the number of
       bytes used.
   errors: passed to unicode_decode_call_errorhandler_writer().

   Returns NULL with the writer released on failure. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* the first two bytes are read as a little-endian 16-bit value */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        /* Pick the stringlib decoder matching the writer's current buffer
           kind; ch stays 0 when fewer than two bytes remain. */
        if (e - q >= 2) {
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        /* 0..3 are handled as status codes; any other value is a decoded
           character that is appended through the writer. */
        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* step back over the two bytes already read */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}

PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    unsigned short *out;
    Py_ssize_t pairs;
#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep
= NULL; 5475 5476 if (!PyUnicode_Check(str)) { 5477 PyErr_BadArgument(); 5478 return NULL; 5479 } 5480 if (PyUnicode_READY(str) == -1) 5481 return NULL; 5482 kind = PyUnicode_KIND(str); 5483 data = PyUnicode_DATA(str); 5484 len = PyUnicode_GET_LENGTH(str); 5485 5486 pairs = 0; 5487 if (kind == PyUnicode_4BYTE_KIND) { 5488 const Py_UCS4 *in = (const Py_UCS4 *)data; 5489 const Py_UCS4 *end = in + len; 5490 while (in < end) 5491 if (*in++ >= 0x10000) 5492 pairs++; 5493 } 5494 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5495 return PyErr_NoMemory(); 5496 nsize = len + pairs + (byteorder == 0); 5497 v = PyBytes_FromStringAndSize(NULL, nsize * 2); 5498 if (v == NULL) 5499 return NULL; 5500 5501 /* output buffer is 2-bytes aligned */ 5502 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5503 out = (unsigned short *)PyBytes_AS_STRING(v); 5504 if (byteorder == 0) 5505 *out++ = 0xFEFF; 5506 if (len == 0) 5507 goto done; 5508 5509 if (kind == PyUnicode_1BYTE_KIND) { 5510 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5511 goto done; 5512 } 5513 5514 if (byteorder < 0) 5515 encoding = "utf-16-le"; 5516 else if (byteorder > 0) 5517 encoding = "utf-16-be"; 5518 else 5519 encoding = "utf-16"; 5520 5521 pos = 0; 5522 while (pos < len) { 5523 Py_ssize_t repsize, moreunits; 5524 5525 if (kind == PyUnicode_2BYTE_KIND) { 5526 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, 5527 &out, native_ordering); 5528 } 5529 else { 5530 assert(kind == PyUnicode_4BYTE_KIND); 5531 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, 5532 &out, native_ordering); 5533 } 5534 if (pos == len) 5535 break; 5536 5537 rep = unicode_encode_call_errorhandler( 5538 errors, &errorHandler, 5539 encoding, "surrogates not allowed", 5540 str, &exc, pos, pos + 1, &pos); 5541 if (!rep) 5542 goto error; 5543 5544 if (PyBytes_Check(rep)) { 5545 repsize = PyBytes_GET_SIZE(rep); 5546 if (repsize & 1) { 5547 raise_encode_exception(&exc, encoding, 
5548 str, pos - 1, pos, 5549 "surrogates not allowed"); 5550 goto error; 5551 } 5552 moreunits = repsize / 2; 5553 } 5554 else { 5555 assert(PyUnicode_Check(rep)); 5556 if (PyUnicode_READY(rep) < 0) 5557 goto error; 5558 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5559 if (!PyUnicode_IS_ASCII(rep)) { 5560 raise_encode_exception(&exc, encoding, 5561 str, pos - 1, pos, 5562 "surrogates not allowed"); 5563 goto error; 5564 } 5565 } 5566 5567 /* two bytes are reserved for each surrogate */ 5568 if (moreunits > 1) { 5569 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); 5570 Py_ssize_t morebytes = 2 * (moreunits - 1); 5571 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5572 /* integer overflow */ 5573 PyErr_NoMemory(); 5574 goto error; 5575 } 5576 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5577 goto error; 5578 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; 5579 } 5580 5581 if (PyBytes_Check(rep)) { 5582 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); 5583 out += moreunits; 5584 } else /* rep is unicode */ { 5585 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5586 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5587 &out, native_ordering); 5588 } 5589 5590 Py_CLEAR(rep); 5591 } 5592 5593 /* Cut back to size actually needed. This is necessary for, for example, 5594 encoding of a string containing isolated surrogates and the 'ignore' handler 5595 is used. 
*/ 5596 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5597 if (nsize != PyBytes_GET_SIZE(v)) 5598 _PyBytes_Resize(&v, nsize); 5599 Py_XDECREF(errorHandler); 5600 Py_XDECREF(exc); 5601 done: 5602 return v; 5603 error: 5604 Py_XDECREF(rep); 5605 Py_XDECREF(errorHandler); 5606 Py_XDECREF(exc); 5607 Py_XDECREF(v); 5608 return NULL; 5609#undef STORECHAR 5610} 5611 5612PyObject * 5613PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5614 Py_ssize_t size, 5615 const char *errors, 5616 int byteorder) 5617{ 5618 PyObject *result; 5619 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5620 if (tmp == NULL) 5621 return NULL; 5622 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5623 Py_DECREF(tmp); 5624 return result; 5625} 5626 5627PyObject * 5628PyUnicode_AsUTF16String(PyObject *unicode) 5629{ 5630 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5631} 5632 5633/* --- Unicode Escape Codec ----------------------------------------------- */ 5634 5635/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5636 if all the escapes in the string make it still a valid ASCII string. 5637 Returns -1 if any escapes were found which cause the string to 5638 pop out of ASCII range. Otherwise returns the length of the 5639 required buffer to hold the string. 5640 */ 5641static Py_ssize_t 5642length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5643{ 5644 const unsigned char *p = (const unsigned char *)s; 5645 const unsigned char *end = p + size; 5646 Py_ssize_t length = 0; 5647 5648 if (size < 0) 5649 return -1; 5650 5651 for (; p < end; ++p) { 5652 if (*p > 127) { 5653 /* Non-ASCII */ 5654 return -1; 5655 } 5656 else if (*p != '\\') { 5657 /* Normal character */ 5658 ++length; 5659 } 5660 else { 5661 /* Backslash-escape, check next char */ 5662 ++p; 5663 /* Escape sequence reaches till end of string or 5664 non-ASCII follow-up. 
*/ 5665 if (p >= end || *p > 127) 5666 return -1; 5667 switch (*p) { 5668 case '\n': 5669 /* backslash + \n result in zero characters */ 5670 break; 5671 case '\\': case '\'': case '\"': 5672 case 'b': case 'f': case 't': 5673 case 'n': case 'r': case 'v': case 'a': 5674 ++length; 5675 break; 5676 case '0': case '1': case '2': case '3': 5677 case '4': case '5': case '6': case '7': 5678 case 'x': case 'u': case 'U': case 'N': 5679 /* these do not guarantee ASCII characters */ 5680 return -1; 5681 default: 5682 /* count the backslash + the other character */ 5683 length += 2; 5684 } 5685 } 5686 } 5687 return length; 5688} 5689 5690static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5691 5692PyObject * 5693PyUnicode_DecodeUnicodeEscape(const char *s, 5694 Py_ssize_t size, 5695 const char *errors) 5696{ 5697 const char *starts = s; 5698 Py_ssize_t startinpos; 5699 Py_ssize_t endinpos; 5700 _PyUnicodeWriter writer; 5701 const char *end; 5702 char* message; 5703 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5704 PyObject *errorHandler = NULL; 5705 PyObject *exc = NULL; 5706 Py_ssize_t len; 5707 5708 len = length_of_escaped_ascii_string(s, size); 5709 if (len == 0) 5710 _Py_RETURN_UNICODE_EMPTY(); 5711 5712 /* After length_of_escaped_ascii_string() there are two alternatives, 5713 either the string is pure ASCII with named escapes like \n, etc. 5714 and we determined it's exact size (common case) 5715 or it contains \x, \u, ... escape sequences. then we create a 5716 legacy wchar string and resize it at the end of this function. */ 5717 _PyUnicodeWriter_Init(&writer); 5718 if (len > 0) { 5719 writer.min_length = len; 5720 } 5721 else { 5722 /* Escaped strings will always be longer than the resulting 5723 Unicode string, so we start with size here and then reduce the 5724 length after conversion to the true value. 
5725 (but if the error callback returns a long replacement string 5726 we'll have to allocate more space) */ 5727 writer.min_length = size; 5728 } 5729 5730 if (size == 0) 5731 return _PyUnicodeWriter_Finish(&writer); 5732 end = s + size; 5733 5734 while (s < end) { 5735 unsigned char c; 5736 Py_UCS4 x; 5737 int digits; 5738 5739 /* Non-escape characters are interpreted as Unicode ordinals */ 5740 if (*s != '\\') { 5741 x = (unsigned char)*s; 5742 s++; 5743 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 5744 goto onError; 5745 continue; 5746 } 5747 5748 startinpos = s-starts; 5749 /* \ - Escapes */ 5750 s++; 5751 c = *s++; 5752 if (s > end) 5753 c = '\0'; /* Invalid after \ */ 5754 5755 switch (c) { 5756 5757 /* \x escapes */ 5758#define WRITECHAR(ch) \ 5759 do { \ 5760 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \ 5761 goto onError; \ 5762 } while(0) 5763 5764 case '\n': break; 5765 case '\\': WRITECHAR('\\'); break; 5766 case '\'': WRITECHAR('\''); break; 5767 case '\"': WRITECHAR('\"'); break; 5768 case 'b': WRITECHAR('\b'); break; 5769 /* FF */ 5770 case 'f': WRITECHAR('\014'); break; 5771 case 't': WRITECHAR('\t'); break; 5772 case 'n': WRITECHAR('\n'); break; 5773 case 'r': WRITECHAR('\r'); break; 5774 /* VT */ 5775 case 'v': WRITECHAR('\013'); break; 5776 /* BEL, not classic C */ 5777 case 'a': WRITECHAR('\007'); break; 5778 5779 /* \OOO (octal) escapes */ 5780 case '0': case '1': case '2': case '3': 5781 case '4': case '5': case '6': case '7': 5782 x = s[-1] - '0'; 5783 if (s < end && '0' <= *s && *s <= '7') { 5784 x = (x<<3) + *s++ - '0'; 5785 if (s < end && '0' <= *s && *s <= '7') 5786 x = (x<<3) + *s++ - '0'; 5787 } 5788 WRITECHAR(x); 5789 break; 5790 5791 /* hex escapes */ 5792 /* \xXX */ 5793 case 'x': 5794 digits = 2; 5795 message = "truncated \\xXX escape"; 5796 goto hexescape; 5797 5798 /* \uXXXX */ 5799 case 'u': 5800 digits = 4; 5801 message = "truncated \\uXXXX escape"; 5802 goto hexescape; 5803 5804 /* \UXXXXXXXX */ 5805 case 
'U': 5806 digits = 8; 5807 message = "truncated \\UXXXXXXXX escape"; 5808 hexescape: 5809 chr = 0; 5810 if (end - s < digits) { 5811 /* count only hex digits */ 5812 for (; s < end; ++s) { 5813 c = (unsigned char)*s; 5814 if (!Py_ISXDIGIT(c)) 5815 goto error; 5816 } 5817 goto error; 5818 } 5819 for (; digits--; ++s) { 5820 c = (unsigned char)*s; 5821 if (!Py_ISXDIGIT(c)) 5822 goto error; 5823 chr = (chr<<4) & ~0xF; 5824 if (c >= '0' && c <= '9') 5825 chr += c - '0'; 5826 else if (c >= 'a' && c <= 'f') 5827 chr += 10 + c - 'a'; 5828 else 5829 chr += 10 + c - 'A'; 5830 } 5831 if (chr == 0xffffffff && PyErr_Occurred()) 5832 /* _decoding_error will have already written into the 5833 target buffer. */ 5834 break; 5835 store: 5836 /* when we get here, chr is a 32-bit unicode character */ 5837 message = "illegal Unicode character"; 5838 if (chr > MAX_UNICODE) 5839 goto error; 5840 WRITECHAR(chr); 5841 break; 5842 5843 /* \N{name} */ 5844 case 'N': 5845 message = "malformed \\N character escape"; 5846 if (ucnhash_CAPI == NULL) { 5847 /* load the unicode data module */ 5848 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5849 PyUnicodeData_CAPSULE_NAME, 1); 5850 if (ucnhash_CAPI == NULL) 5851 goto ucnhashError; 5852 } 5853 if (*s == '{') { 5854 const char *start = s+1; 5855 /* look for the closing brace */ 5856 while (*s != '}' && s < end) 5857 s++; 5858 if (s > start && s < end && *s == '}') { 5859 /* found a name. 
look it up in the unicode database */ 5860 message = "unknown Unicode character name"; 5861 s++; 5862 if (s - start - 1 <= INT_MAX && 5863 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5864 &chr, 0)) 5865 goto store; 5866 } 5867 } 5868 goto error; 5869 5870 default: 5871 if (s > end) { 5872 message = "\\ at end of string"; 5873 s--; 5874 goto error; 5875 } 5876 else { 5877 WRITECHAR('\\'); 5878 WRITECHAR((unsigned char)s[-1]); 5879 } 5880 break; 5881 } 5882 continue; 5883 5884 error: 5885 endinpos = s-starts; 5886 if (unicode_decode_call_errorhandler_writer( 5887 errors, &errorHandler, 5888 "unicodeescape", message, 5889 &starts, &end, &startinpos, &endinpos, &exc, &s, 5890 &writer)) 5891 goto onError; 5892 continue; 5893 } 5894#undef WRITECHAR 5895 5896 Py_XDECREF(errorHandler); 5897 Py_XDECREF(exc); 5898 return _PyUnicodeWriter_Finish(&writer); 5899 5900 ucnhashError: 5901 PyErr_SetString( 5902 PyExc_UnicodeError, 5903 "\\N escapes not supported (can't load unicodedata module)" 5904 ); 5905 _PyUnicodeWriter_Dealloc(&writer); 5906 Py_XDECREF(errorHandler); 5907 Py_XDECREF(exc); 5908 return NULL; 5909 5910 onError: 5911 _PyUnicodeWriter_Dealloc(&writer); 5912 Py_XDECREF(errorHandler); 5913 Py_XDECREF(exc); 5914 return NULL; 5915} 5916 5917/* Return a Unicode-Escape string version of the Unicode object. 5918 5919 If quotes is true, the string is enclosed in u"" or u'' quotes as 5920 appropriate. 5921 5922*/ 5923 5924PyObject * 5925PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5926{ 5927 Py_ssize_t i, len; 5928 PyObject *repr; 5929 char *p; 5930 int kind; 5931 void *data; 5932 Py_ssize_t expandsize = 0; 5933 5934 /* Initial allocation is based on the longest-possible character 5935 escape. 5936 5937 For UCS1 strings it's '\xxx', 4 bytes per source character. 5938 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 5939 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 
5940 */ 5941 5942 if (!PyUnicode_Check(unicode)) { 5943 PyErr_BadArgument(); 5944 return NULL; 5945 } 5946 if (PyUnicode_READY(unicode) == -1) 5947 return NULL; 5948 len = PyUnicode_GET_LENGTH(unicode); 5949 kind = PyUnicode_KIND(unicode); 5950 data = PyUnicode_DATA(unicode); 5951 switch (kind) { 5952 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5953 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5954 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5955 } 5956 5957 if (len == 0) 5958 return PyBytes_FromStringAndSize(NULL, 0); 5959 5960 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5961 return PyErr_NoMemory(); 5962 5963 repr = PyBytes_FromStringAndSize(NULL, 5964 2 5965 + expandsize*len 5966 + 1); 5967 if (repr == NULL) 5968 return NULL; 5969 5970 p = PyBytes_AS_STRING(repr); 5971 5972 for (i = 0; i < len; i++) { 5973 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5974 5975 /* Escape backslashes */ 5976 if (ch == '\\') { 5977 *p++ = '\\'; 5978 *p++ = (char) ch; 5979 continue; 5980 } 5981 5982 /* Map 21-bit characters to '\U00xxxxxx' */ 5983 else if (ch >= 0x10000) { 5984 assert(ch <= MAX_UNICODE); 5985 *p++ = '\\'; 5986 *p++ = 'U'; 5987 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5988 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5989 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5990 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5991 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5992 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5993 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5994 *p++ = Py_hexdigits[ch & 0x0000000F]; 5995 continue; 5996 } 5997 5998 /* Map 16-bit characters to '\uxxxx' */ 5999 if (ch >= 256) { 6000 *p++ = '\\'; 6001 *p++ = 'u'; 6002 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 6003 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 6004 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6005 *p++ = Py_hexdigits[ch & 0x000F]; 6006 } 6007 6008 /* Map special whitespace to '\t', \n', '\r' */ 6009 else if (ch == '\t') { 6010 *p++ = '\\'; 6011 *p++ = 't'; 6012 
} 6013 else if (ch == '\n') { 6014 *p++ = '\\'; 6015 *p++ = 'n'; 6016 } 6017 else if (ch == '\r') { 6018 *p++ = '\\'; 6019 *p++ = 'r'; 6020 } 6021 6022 /* Map non-printable US ASCII to '\xhh' */ 6023 else if (ch < ' ' || ch >= 0x7F) { 6024 *p++ = '\\'; 6025 *p++ = 'x'; 6026 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6027 *p++ = Py_hexdigits[ch & 0x000F]; 6028 } 6029 6030 /* Copy everything else as-is */ 6031 else 6032 *p++ = (char) ch; 6033 } 6034 6035 assert(p - PyBytes_AS_STRING(repr) > 0); 6036 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 6037 return NULL; 6038 return repr; 6039} 6040 6041PyObject * 6042PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 6043 Py_ssize_t size) 6044{ 6045 PyObject *result; 6046 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6047 if (tmp == NULL) 6048 return NULL; 6049 result = PyUnicode_AsUnicodeEscapeString(tmp); 6050 Py_DECREF(tmp); 6051 return result; 6052} 6053 6054/* --- Raw Unicode Escape Codec ------------------------------------------- */ 6055 6056PyObject * 6057PyUnicode_DecodeRawUnicodeEscape(const char *s, 6058 Py_ssize_t size, 6059 const char *errors) 6060{ 6061 const char *starts = s; 6062 Py_ssize_t startinpos; 6063 Py_ssize_t endinpos; 6064 _PyUnicodeWriter writer; 6065 const char *end; 6066 const char *bs; 6067 PyObject *errorHandler = NULL; 6068 PyObject *exc = NULL; 6069 6070 if (size == 0) 6071 _Py_RETURN_UNICODE_EMPTY(); 6072 6073 /* Escaped strings will always be longer than the resulting 6074 Unicode string, so we start with size here and then reduce the 6075 length after conversion to the true value. 
(But decoding error 6076 handler might have to resize the string) */ 6077 _PyUnicodeWriter_Init(&writer); 6078 writer.min_length = size; 6079 6080 end = s + size; 6081 while (s < end) { 6082 unsigned char c; 6083 Py_UCS4 x; 6084 int i; 6085 int count; 6086 6087 /* Non-escape characters are interpreted as Unicode ordinals */ 6088 if (*s != '\\') { 6089 x = (unsigned char)*s++; 6090 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6091 goto onError; 6092 continue; 6093 } 6094 startinpos = s-starts; 6095 6096 /* \u-escapes are only interpreted iff the number of leading 6097 backslashes if odd */ 6098 bs = s; 6099 for (;s < end;) { 6100 if (*s != '\\') 6101 break; 6102 x = (unsigned char)*s++; 6103 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6104 goto onError; 6105 } 6106 if (((s - bs) & 1) == 0 || 6107 s >= end || 6108 (*s != 'u' && *s != 'U')) { 6109 continue; 6110 } 6111 writer.pos--; 6112 count = *s=='u' ? 4 : 8; 6113 s++; 6114 6115 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6116 for (x = 0, i = 0; i < count; ++i, ++s) { 6117 c = (unsigned char)*s; 6118 if (!Py_ISXDIGIT(c)) { 6119 endinpos = s-starts; 6120 if (unicode_decode_call_errorhandler_writer( 6121 errors, &errorHandler, 6122 "rawunicodeescape", "truncated \\uXXXX", 6123 &starts, &end, &startinpos, &endinpos, &exc, &s, 6124 &writer)) 6125 goto onError; 6126 goto nextByte; 6127 } 6128 x = (x<<4) & ~0xF; 6129 if (c >= '0' && c <= '9') 6130 x += c - '0'; 6131 else if (c >= 'a' && c <= 'f') 6132 x += 10 + c - 'a'; 6133 else 6134 x += 10 + c - 'A'; 6135 } 6136 if (x <= MAX_UNICODE) { 6137 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6138 goto onError; 6139 } 6140 else { 6141 endinpos = s-starts; 6142 if (unicode_decode_call_errorhandler_writer( 6143 errors, &errorHandler, 6144 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6145 &starts, &end, &startinpos, &endinpos, &exc, &s, 6146 &writer)) 6147 goto onError; 6148 } 6149 nextByte: 6150 ; 6151 } 6152 Py_XDECREF(errorHandler); 6153 
Py_XDECREF(exc); 6154 return _PyUnicodeWriter_Finish(&writer); 6155 6156 onError: 6157 _PyUnicodeWriter_Dealloc(&writer); 6158 Py_XDECREF(errorHandler); 6159 Py_XDECREF(exc); 6160 return NULL; 6161} 6162 6163 6164PyObject * 6165PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6166{ 6167 PyObject *repr; 6168 char *p; 6169 char *q; 6170 Py_ssize_t expandsize, pos; 6171 int kind; 6172 void *data; 6173 Py_ssize_t len; 6174 6175 if (!PyUnicode_Check(unicode)) { 6176 PyErr_BadArgument(); 6177 return NULL; 6178 } 6179 if (PyUnicode_READY(unicode) == -1) 6180 return NULL; 6181 kind = PyUnicode_KIND(unicode); 6182 data = PyUnicode_DATA(unicode); 6183 len = PyUnicode_GET_LENGTH(unicode); 6184 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6185 bytes, and 1 byte characters 4. */ 6186 expandsize = kind * 2 + 2; 6187 6188 if (len > PY_SSIZE_T_MAX / expandsize) 6189 return PyErr_NoMemory(); 6190 6191 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6192 if (repr == NULL) 6193 return NULL; 6194 if (len == 0) 6195 return repr; 6196 6197 p = q = PyBytes_AS_STRING(repr); 6198 for (pos = 0; pos < len; pos++) { 6199 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6200 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6201 if (ch >= 0x10000) { 6202 assert(ch <= MAX_UNICODE); 6203 *p++ = '\\'; 6204 *p++ = 'U'; 6205 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6206 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6207 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6208 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6209 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6210 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6211 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6212 *p++ = Py_hexdigits[ch & 15]; 6213 } 6214 /* Map 16-bit characters to '\uxxxx' */ 6215 else if (ch >= 256) { 6216 *p++ = '\\'; 6217 *p++ = 'u'; 6218 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6219 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6220 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6221 *p++ = Py_hexdigits[ch & 15]; 6222 } 6223 /* Copy everything else 
as-is */ 6224 else 6225 *p++ = (char) ch; 6226 } 6227 6228 assert(p > q); 6229 if (_PyBytes_Resize(&repr, p - q) < 0) 6230 return NULL; 6231 return repr; 6232} 6233 6234PyObject * 6235PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6236 Py_ssize_t size) 6237{ 6238 PyObject *result; 6239 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6240 if (tmp == NULL) 6241 return NULL; 6242 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6243 Py_DECREF(tmp); 6244 return result; 6245} 6246 6247/* --- Unicode Internal Codec ------------------------------------------- */ 6248 6249PyObject * 6250_PyUnicode_DecodeUnicodeInternal(const char *s, 6251 Py_ssize_t size, 6252 const char *errors) 6253{ 6254 const char *starts = s; 6255 Py_ssize_t startinpos; 6256 Py_ssize_t endinpos; 6257 _PyUnicodeWriter writer; 6258 const char *end; 6259 const char *reason; 6260 PyObject *errorHandler = NULL; 6261 PyObject *exc = NULL; 6262 6263 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6264 "unicode_internal codec has been deprecated", 6265 1)) 6266 return NULL; 6267 6268 if (size == 0) 6269 _Py_RETURN_UNICODE_EMPTY(); 6270 6271 _PyUnicodeWriter_Init(&writer); 6272 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { 6273 PyErr_NoMemory(); 6274 goto onError; 6275 } 6276 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; 6277 6278 end = s + size; 6279 while (s < end) { 6280 Py_UNICODE uch; 6281 Py_UCS4 ch; 6282 if (end - s < Py_UNICODE_SIZE) { 6283 endinpos = end-starts; 6284 reason = "truncated input"; 6285 goto error; 6286 } 6287 /* We copy the raw representation one byte at a time because the 6288 pointer may be unaligned (see test_codeccallbacks). */ 6289 ((char *) &uch)[0] = s[0]; 6290 ((char *) &uch)[1] = s[1]; 6291#ifdef Py_UNICODE_WIDE 6292 ((char *) &uch)[2] = s[2]; 6293 ((char *) &uch)[3] = s[3]; 6294#endif 6295 ch = uch; 6296#ifdef Py_UNICODE_WIDE 6297 /* We have to sanity check the raw data, otherwise doom looms for 6298 some malformed UCS-4 data. 
*/ 6299 if (ch > 0x10ffff) { 6300 endinpos = s - starts + Py_UNICODE_SIZE; 6301 reason = "illegal code point (> 0x10FFFF)"; 6302 goto error; 6303 } 6304#endif 6305 s += Py_UNICODE_SIZE; 6306#ifndef Py_UNICODE_WIDE 6307 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE) 6308 { 6309 Py_UNICODE uch2; 6310 ((char *) &uch2)[0] = s[0]; 6311 ((char *) &uch2)[1] = s[1]; 6312 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6313 { 6314 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6315 s += Py_UNICODE_SIZE; 6316 } 6317 } 6318#endif 6319 6320 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 6321 goto onError; 6322 continue; 6323 6324 error: 6325 startinpos = s - starts; 6326 if (unicode_decode_call_errorhandler_writer( 6327 errors, &errorHandler, 6328 "unicode_internal", reason, 6329 &starts, &end, &startinpos, &endinpos, &exc, &s, 6330 &writer)) 6331 goto onError; 6332 } 6333 6334 Py_XDECREF(errorHandler); 6335 Py_XDECREF(exc); 6336 return _PyUnicodeWriter_Finish(&writer); 6337 6338 onError: 6339 _PyUnicodeWriter_Dealloc(&writer); 6340 Py_XDECREF(errorHandler); 6341 Py_XDECREF(exc); 6342 return NULL; 6343} 6344 6345/* --- Latin-1 Codec ------------------------------------------------------ */ 6346 6347PyObject * 6348PyUnicode_DecodeLatin1(const char *s, 6349 Py_ssize_t size, 6350 const char *errors) 6351{ 6352 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6353 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6354} 6355 6356/* create or adjust a UnicodeEncodeError */ 6357static void 6358make_encode_exception(PyObject **exceptionObject, 6359 const char *encoding, 6360 PyObject *unicode, 6361 Py_ssize_t startpos, Py_ssize_t endpos, 6362 const char *reason) 6363{ 6364 if (*exceptionObject == NULL) { 6365 *exceptionObject = PyObject_CallFunction( 6366 PyExc_UnicodeEncodeError, "sOnns", 6367 encoding, unicode, startpos, endpos, reason); 6368 } 6369 else { 6370 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6371 goto onError; 6372 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6373 goto onError; 6374 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6375 goto onError; 6376 return; 6377 onError: 6378 Py_CLEAR(*exceptionObject); 6379 } 6380} 6381 6382/* raises a UnicodeEncodeError */ 6383static void 6384raise_encode_exception(PyObject **exceptionObject, 6385 const char *encoding, 6386 PyObject *unicode, 6387 Py_ssize_t startpos, Py_ssize_t endpos, 6388 const char *reason) 6389{ 6390 make_encode_exception(exceptionObject, 6391 encoding, unicode, startpos, endpos, reason); 6392 if (*exceptionObject != NULL) 6393 PyCodec_StrictErrors(*exceptionObject); 6394} 6395 6396/* error handling callback helper: 6397 build arguments, call the callback and check the arguments, 6398 put the result into newpos and return the replacement string, which 6399 has to be freed by the caller */ 6400static PyObject * 6401unicode_encode_call_errorhandler(const char *errors, 6402 PyObject **errorHandler, 6403 const char *encoding, const char *reason, 6404 PyObject *unicode, PyObject **exceptionObject, 6405 Py_ssize_t startpos, Py_ssize_t endpos, 6406 Py_ssize_t *newpos) 6407{ 6408 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6409 Py_ssize_t len; 6410 PyObject *restuple; 6411 PyObject *resunicode; 6412 6413 if (*errorHandler == NULL) { 6414 *errorHandler = 
PyCodec_LookupError(errors); 6415 if (*errorHandler == NULL) 6416 return NULL; 6417 } 6418 6419 if (PyUnicode_READY(unicode) == -1) 6420 return NULL; 6421 len = PyUnicode_GET_LENGTH(unicode); 6422 6423 make_encode_exception(exceptionObject, 6424 encoding, unicode, startpos, endpos, reason); 6425 if (*exceptionObject == NULL) 6426 return NULL; 6427 6428 restuple = PyObject_CallFunctionObjArgs( 6429 *errorHandler, *exceptionObject, NULL); 6430 if (restuple == NULL) 6431 return NULL; 6432 if (!PyTuple_Check(restuple)) { 6433 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6434 Py_DECREF(restuple); 6435 return NULL; 6436 } 6437 if (!PyArg_ParseTuple(restuple, argparse, 6438 &resunicode, newpos)) { 6439 Py_DECREF(restuple); 6440 return NULL; 6441 } 6442 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6443 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6444 Py_DECREF(restuple); 6445 return NULL; 6446 } 6447 if (*newpos<0) 6448 *newpos = len + *newpos; 6449 if (*newpos<0 || *newpos>len) { 6450 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6451 Py_DECREF(restuple); 6452 return NULL; 6453 } 6454 Py_INCREF(resunicode); 6455 Py_DECREF(restuple); 6456 return resunicode; 6457} 6458 6459static PyObject * 6460unicode_encode_ucs1(PyObject *unicode, 6461 const char *errors, 6462 unsigned int limit) 6463{ 6464 /* input state */ 6465 Py_ssize_t pos=0, size; 6466 int kind; 6467 void *data; 6468 /* output object */ 6469 PyObject *res; 6470 /* pointer into the output */ 6471 char *str; 6472 /* current output position */ 6473 Py_ssize_t ressize; 6474 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6475 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6476 PyObject *errorHandler = NULL; 6477 PyObject *exc = NULL; 6478 /* the following variable is used for caching string comparisons 6479 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6480 int known_errorHandler = -1; 6481 6482 if (PyUnicode_READY(unicode) == -1) 6483 return NULL; 6484 size = PyUnicode_GET_LENGTH(unicode); 6485 kind = PyUnicode_KIND(unicode); 6486 data = PyUnicode_DATA(unicode); 6487 /* allocate enough for a simple encoding without 6488 replacements, if we need more, we'll resize */ 6489 if (size == 0) 6490 return PyBytes_FromStringAndSize(NULL, 0); 6491 res = PyBytes_FromStringAndSize(NULL, size); 6492 if (res == NULL) 6493 return NULL; 6494 str = PyBytes_AS_STRING(res); 6495 ressize = size; 6496 6497 while (pos < size) { 6498 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6499 6500 /* can we encode this? */ 6501 if (c<limit) { 6502 /* no overflow check, because we know that the space is enough */ 6503 *str++ = (char)c; 6504 ++pos; 6505 } 6506 else { 6507 Py_ssize_t requiredsize; 6508 PyObject *repunicode; 6509 Py_ssize_t repsize, newpos, respos, i; 6510 /* startpos for collecting unencodable chars */ 6511 Py_ssize_t collstart = pos; 6512 Py_ssize_t collend = pos; 6513 /* find all unecodable characters */ 6514 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) 6515 ++collend; 6516 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6517 if (known_errorHandler==-1) { 6518 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6519 known_errorHandler = 1; 6520 else if (!strcmp(errors, "replace")) 6521 known_errorHandler = 2; 6522 else if (!strcmp(errors, "ignore")) 6523 known_errorHandler = 3; 6524 else if (!strcmp(errors, "xmlcharrefreplace")) 6525 known_errorHandler = 4; 6526 else 6527 known_errorHandler = 0; 6528 } 6529 switch (known_errorHandler) { 6530 case 1: /* strict */ 6531 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6532 goto onError; 6533 case 2: /* replace */ 6534 while (collstart++ < collend) 6535 *str++ = '?'; /* fall through */ 6536 case 3: /* ignore */ 6537 pos = collend; 6538 break; 6539 case 4: /* xmlcharrefreplace */ 6540 respos = str - PyBytes_AS_STRING(res); 6541 requiredsize = respos; 6542 /* determine replacement size */ 6543 for (i = collstart; i < collend; ++i) { 6544 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6545 Py_ssize_t incr; 6546 if (ch < 10) 6547 incr = 2+1+1; 6548 else if (ch < 100) 6549 incr = 2+2+1; 6550 else if (ch < 1000) 6551 incr = 2+3+1; 6552 else if (ch < 10000) 6553 incr = 2+4+1; 6554 else if (ch < 100000) 6555 incr = 2+5+1; 6556 else if (ch < 1000000) 6557 incr = 2+6+1; 6558 else { 6559 assert(ch <= MAX_UNICODE); 6560 incr = 2+7+1; 6561 } 6562 if (requiredsize > PY_SSIZE_T_MAX - incr) 6563 goto overflow; 6564 requiredsize += incr; 6565 } 6566 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6567 goto overflow; 6568 requiredsize += size - collend; 6569 if (requiredsize > ressize) { 6570 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6571 requiredsize = 2*ressize; 6572 if (_PyBytes_Resize(&res, requiredsize)) 6573 goto onError; 6574 str = PyBytes_AS_STRING(res) + respos; 6575 ressize = requiredsize; 6576 } 6577 /* generate replacement */ 6578 for (i = collstart; i < collend; ++i) { 6579 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6580 } 6581 pos = collend; 6582 
break; 6583 default: 6584 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6585 encoding, reason, unicode, &exc, 6586 collstart, collend, &newpos); 6587 if (repunicode == NULL || (PyUnicode_Check(repunicode) && 6588 PyUnicode_READY(repunicode) == -1)) 6589 goto onError; 6590 if (PyBytes_Check(repunicode)) { 6591 /* Directly copy bytes result to output. */ 6592 repsize = PyBytes_Size(repunicode); 6593 if (repsize > 1) { 6594 /* Make room for all additional bytes. */ 6595 respos = str - PyBytes_AS_STRING(res); 6596 if (ressize > PY_SSIZE_T_MAX - repsize - 1) { 6597 Py_DECREF(repunicode); 6598 goto overflow; 6599 } 6600 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6601 Py_DECREF(repunicode); 6602 goto onError; 6603 } 6604 str = PyBytes_AS_STRING(res) + respos; 6605 ressize += repsize-1; 6606 } 6607 memcpy(str, PyBytes_AsString(repunicode), repsize); 6608 str += repsize; 6609 pos = newpos; 6610 Py_DECREF(repunicode); 6611 break; 6612 } 6613 /* need more space? (at least enough for what we 6614 have+the replacement+the rest of the string, so 6615 we won't have to check space for encodable characters) */ 6616 respos = str - PyBytes_AS_STRING(res); 6617 repsize = PyUnicode_GET_LENGTH(repunicode); 6618 requiredsize = respos; 6619 if (requiredsize > PY_SSIZE_T_MAX - repsize) 6620 goto overflow; 6621 requiredsize += repsize; 6622 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6623 goto overflow; 6624 requiredsize += size - collend; 6625 if (requiredsize > ressize) { 6626 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6627 requiredsize = 2*ressize; 6628 if (_PyBytes_Resize(&res, requiredsize)) { 6629 Py_DECREF(repunicode); 6630 goto onError; 6631 } 6632 str = PyBytes_AS_STRING(res) + respos; 6633 ressize = requiredsize; 6634 } 6635 /* check if there is anything unencodable in the replacement 6636 and copy it to the output */ 6637 for (i = 0; repsize-->0; ++i, ++str) { 6638 c = PyUnicode_READ_CHAR(repunicode, i); 6639 if (c >= limit) 
{ 6640 raise_encode_exception(&exc, encoding, unicode, 6641 pos, pos+1, reason); 6642 Py_DECREF(repunicode); 6643 goto onError; 6644 } 6645 *str = (char)c; 6646 } 6647 pos = newpos; 6648 Py_DECREF(repunicode); 6649 } 6650 } 6651 } 6652 /* Resize if we allocated to much */ 6653 size = str - PyBytes_AS_STRING(res); 6654 if (size < ressize) { /* If this falls res will be NULL */ 6655 assert(size >= 0); 6656 if (_PyBytes_Resize(&res, size) < 0) 6657 goto onError; 6658 } 6659 6660 Py_XDECREF(errorHandler); 6661 Py_XDECREF(exc); 6662 return res; 6663 6664 overflow: 6665 PyErr_SetString(PyExc_OverflowError, 6666 "encoded result is too long for a Python string"); 6667 6668 onError: 6669 Py_XDECREF(res); 6670 Py_XDECREF(errorHandler); 6671 Py_XDECREF(exc); 6672 return NULL; 6673} 6674 6675/* Deprecated */ 6676PyObject * 6677PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6678 Py_ssize_t size, 6679 const char *errors) 6680{ 6681 PyObject *result; 6682 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6683 if (unicode == NULL) 6684 return NULL; 6685 result = unicode_encode_ucs1(unicode, errors, 256); 6686 Py_DECREF(unicode); 6687 return result; 6688} 6689 6690PyObject * 6691_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6692{ 6693 if (!PyUnicode_Check(unicode)) { 6694 PyErr_BadArgument(); 6695 return NULL; 6696 } 6697 if (PyUnicode_READY(unicode) == -1) 6698 return NULL; 6699 /* Fast path: if it is a one-byte string, construct 6700 bytes object directly. */ 6701 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6702 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6703 PyUnicode_GET_LENGTH(unicode)); 6704 /* Non-Latin-1 characters present. Defer to above function to 6705 raise the exception. 
*/ 6706 return unicode_encode_ucs1(unicode, errors, 256); 6707} 6708 6709PyObject* 6710PyUnicode_AsLatin1String(PyObject *unicode) 6711{ 6712 return _PyUnicode_AsLatin1String(unicode, NULL); 6713} 6714 6715/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6716 6717PyObject * 6718PyUnicode_DecodeASCII(const char *s, 6719 Py_ssize_t size, 6720 const char *errors) 6721{ 6722 const char *starts = s; 6723 _PyUnicodeWriter writer; 6724 int kind; 6725 void *data; 6726 Py_ssize_t startinpos; 6727 Py_ssize_t endinpos; 6728 Py_ssize_t outpos; 6729 const char *e; 6730 PyObject *errorHandler = NULL; 6731 PyObject *exc = NULL; 6732 6733 if (size == 0) 6734 _Py_RETURN_UNICODE_EMPTY(); 6735 6736 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6737 if (size == 1 && (unsigned char)s[0] < 128) 6738 return get_latin1_char((unsigned char)s[0]); 6739 6740 _PyUnicodeWriter_Init(&writer); 6741 writer.min_length = size; 6742 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) 6743 return NULL; 6744 6745 e = s + size; 6746 data = writer.data; 6747 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6748 writer.pos = outpos; 6749 if (writer.pos == size) 6750 return _PyUnicodeWriter_Finish(&writer); 6751 6752 s += writer.pos; 6753 kind = writer.kind; 6754 while (s < e) { 6755 unsigned char c = (unsigned char)*s; 6756 if (c < 128) { 6757 PyUnicode_WRITE(kind, data, writer.pos, c); 6758 writer.pos++; 6759 ++s; 6760 } 6761 else { 6762 startinpos = s-starts; 6763 endinpos = startinpos + 1; 6764 if (unicode_decode_call_errorhandler_writer( 6765 errors, &errorHandler, 6766 "ascii", "ordinal not in range(128)", 6767 &starts, &e, &startinpos, &endinpos, &exc, &s, 6768 &writer)) 6769 goto onError; 6770 kind = writer.kind; 6771 data = writer.data; 6772 } 6773 } 6774 Py_XDECREF(errorHandler); 6775 Py_XDECREF(exc); 6776 return _PyUnicodeWriter_Finish(&writer); 6777 6778 onError: 6779 _PyUnicodeWriter_Dealloc(&writer); 6780 Py_XDECREF(errorHandler); 
6781 Py_XDECREF(exc); 6782 return NULL; 6783} 6784 6785/* Deprecated */ 6786PyObject * 6787PyUnicode_EncodeASCII(const Py_UNICODE *p, 6788 Py_ssize_t size, 6789 const char *errors) 6790{ 6791 PyObject *result; 6792 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6793 if (unicode == NULL) 6794 return NULL; 6795 result = unicode_encode_ucs1(unicode, errors, 128); 6796 Py_DECREF(unicode); 6797 return result; 6798} 6799 6800PyObject * 6801_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6802{ 6803 if (!PyUnicode_Check(unicode)) { 6804 PyErr_BadArgument(); 6805 return NULL; 6806 } 6807 if (PyUnicode_READY(unicode) == -1) 6808 return NULL; 6809 /* Fast path: if it is an ASCII-only string, construct bytes object 6810 directly. Else defer to above function to raise the exception. */ 6811 if (PyUnicode_IS_ASCII(unicode)) 6812 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6813 PyUnicode_GET_LENGTH(unicode)); 6814 return unicode_encode_ucs1(unicode, errors, 128); 6815} 6816 6817PyObject * 6818PyUnicode_AsASCIIString(PyObject *unicode) 6819{ 6820 return _PyUnicode_AsASCIIString(unicode, NULL); 6821} 6822 6823#ifdef HAVE_MBCS 6824 6825/* --- MBCS codecs for Windows -------------------------------------------- */ 6826 6827#if SIZEOF_INT < SIZEOF_SIZE_T 6828#define NEED_RETRY 6829#endif 6830 6831#ifndef WC_ERR_INVALID_CHARS 6832# define WC_ERR_INVALID_CHARS 0x0080 6833#endif 6834 6835static char* 6836code_page_name(UINT code_page, PyObject **obj) 6837{ 6838 *obj = NULL; 6839 if (code_page == CP_ACP) 6840 return "mbcs"; 6841 if (code_page == CP_UTF7) 6842 return "CP_UTF7"; 6843 if (code_page == CP_UTF8) 6844 return "CP_UTF8"; 6845 6846 *obj = PyBytes_FromFormat("cp%u", code_page); 6847 if (*obj == NULL) 6848 return NULL; 6849 return PyBytes_AS_STRING(*obj); 6850} 6851 6852static DWORD 6853decode_code_page_flags(UINT code_page) 6854{ 6855 if (code_page == CP_UTF7) { 6856 /* The CP_UTF7 decoder only supports flags=0 */ 6857 return 0; 6858 } 6859 else 
6860 return MB_ERR_INVALID_CHARS; 6861} 6862 6863/* 6864 * Decode a byte string from a Windows code page into unicode object in strict 6865 * mode. 6866 * 6867 * Returns consumed size if succeed, returns -2 on decode error, or raise an 6868 * OSError and returns -1 on other error. 6869 */ 6870static int 6871decode_code_page_strict(UINT code_page, 6872 PyObject **v, 6873 const char *in, 6874 int insize) 6875{ 6876 const DWORD flags = decode_code_page_flags(code_page); 6877 wchar_t *out; 6878 DWORD outsize; 6879 6880 /* First get the size of the result */ 6881 assert(insize > 0); 6882 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6883 if (outsize <= 0) 6884 goto error; 6885 6886 if (*v == NULL) { 6887 /* Create unicode object */ 6888 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6889 *v = (PyObject*)_PyUnicode_New(outsize); 6890 if (*v == NULL) 6891 return -1; 6892 out = PyUnicode_AS_UNICODE(*v); 6893 } 6894 else { 6895 /* Extend unicode object */ 6896 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6897 if (unicode_resize(v, n + outsize) < 0) 6898 return -1; 6899 out = PyUnicode_AS_UNICODE(*v) + n; 6900 } 6901 6902 /* Do the conversion */ 6903 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6904 if (outsize <= 0) 6905 goto error; 6906 return insize; 6907 6908error: 6909 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6910 return -2; 6911 PyErr_SetFromWindowsErr(0); 6912 return -1; 6913} 6914 6915/* 6916 * Decode a byte string from a code page into unicode object with an error 6917 * handler. 6918 * 6919 * Returns consumed size if succeed, or raise an OSError or 6920 * UnicodeDecodeError exception and returns -1 on error. 
6921 */ 6922static int 6923decode_code_page_errors(UINT code_page, 6924 PyObject **v, 6925 const char *in, const int size, 6926 const char *errors, int final) 6927{ 6928 const char *startin = in; 6929 const char *endin = in + size; 6930 const DWORD flags = decode_code_page_flags(code_page); 6931 /* Ideally, we should get reason from FormatMessage. This is the Windows 6932 2000 English version of the message. */ 6933 const char *reason = "No mapping for the Unicode character exists " 6934 "in the target code page."; 6935 /* each step cannot decode more than 1 character, but a character can be 6936 represented as a surrogate pair */ 6937 wchar_t buffer[2], *startout, *out; 6938 int insize; 6939 Py_ssize_t outsize; 6940 PyObject *errorHandler = NULL; 6941 PyObject *exc = NULL; 6942 PyObject *encoding_obj = NULL; 6943 char *encoding; 6944 DWORD err; 6945 int ret = -1; 6946 6947 assert(size > 0); 6948 6949 encoding = code_page_name(code_page, &encoding_obj); 6950 if (encoding == NULL) 6951 return -1; 6952 6953 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { 6954 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6955 UnicodeDecodeError. 
*/ 6956 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6957 if (exc != NULL) { 6958 PyCodec_StrictErrors(exc); 6959 Py_CLEAR(exc); 6960 } 6961 goto error; 6962 } 6963 6964 if (*v == NULL) { 6965 /* Create unicode object */ 6966 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6967 PyErr_NoMemory(); 6968 goto error; 6969 } 6970 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6971 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6972 if (*v == NULL) 6973 goto error; 6974 startout = PyUnicode_AS_UNICODE(*v); 6975 } 6976 else { 6977 /* Extend unicode object */ 6978 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6979 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6980 PyErr_NoMemory(); 6981 goto error; 6982 } 6983 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6984 goto error; 6985 startout = PyUnicode_AS_UNICODE(*v) + n; 6986 } 6987 6988 /* Decode the byte string character per character */ 6989 out = startout; 6990 while (in < endin) 6991 { 6992 /* Decode a character */ 6993 insize = 1; 6994 do 6995 { 6996 outsize = MultiByteToWideChar(code_page, flags, 6997 in, insize, 6998 buffer, Py_ARRAY_LENGTH(buffer)); 6999 if (outsize > 0) 7000 break; 7001 err = GetLastError(); 7002 if (err != ERROR_NO_UNICODE_TRANSLATION 7003 && err != ERROR_INSUFFICIENT_BUFFER) 7004 { 7005 PyErr_SetFromWindowsErr(0); 7006 goto error; 7007 } 7008 insize++; 7009 } 7010 /* 4=maximum length of a UTF-8 sequence */ 7011 while (insize <= 4 && (in + insize) <= endin); 7012 7013 if (outsize <= 0) { 7014 Py_ssize_t startinpos, endinpos, outpos; 7015 7016 /* last character in partial decode? 
*/ 7017 if (in + insize >= endin && !final) 7018 break; 7019 7020 startinpos = in - startin; 7021 endinpos = startinpos + 1; 7022 outpos = out - PyUnicode_AS_UNICODE(*v); 7023 if (unicode_decode_call_errorhandler_wchar( 7024 errors, &errorHandler, 7025 encoding, reason, 7026 &startin, &endin, &startinpos, &endinpos, &exc, &in, 7027 v, &outpos)) 7028 { 7029 goto error; 7030 } 7031 out = PyUnicode_AS_UNICODE(*v) + outpos; 7032 } 7033 else { 7034 in += insize; 7035 memcpy(out, buffer, outsize * sizeof(wchar_t)); 7036 out += outsize; 7037 } 7038 } 7039 7040 /* write a NUL character at the end */ 7041 *out = 0; 7042 7043 /* Extend unicode object */ 7044 outsize = out - startout; 7045 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7046 if (unicode_resize(v, outsize) < 0) 7047 goto error; 7048 /* (in - startin) <= size and size is an int */ 7049 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int); 7050 7051error: 7052 Py_XDECREF(encoding_obj); 7053 Py_XDECREF(errorHandler); 7054 Py_XDECREF(exc); 7055 return ret; 7056} 7057 7058static PyObject * 7059decode_code_page_stateful(int code_page, 7060 const char *s, Py_ssize_t size, 7061 const char *errors, Py_ssize_t *consumed) 7062{ 7063 PyObject *v = NULL; 7064 int chunk_size, final, converted, done; 7065 7066 if (code_page < 0) { 7067 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7068 return NULL; 7069 } 7070 7071 if (consumed) 7072 *consumed = 0; 7073 7074 do 7075 { 7076#ifdef NEED_RETRY 7077 if (size > INT_MAX) { 7078 chunk_size = INT_MAX; 7079 final = 0; 7080 done = 0; 7081 } 7082 else 7083#endif 7084 { 7085 chunk_size = (int)size; 7086 final = (consumed == NULL); 7087 done = 1; 7088 } 7089 7090 if (chunk_size == 0 && done) { 7091 if (v != NULL) 7092 break; 7093 _Py_RETURN_UNICODE_EMPTY(); 7094 } 7095 7096 converted = decode_code_page_strict(code_page, &v, 7097 s, chunk_size); 7098 if (converted == -2) 7099 converted = decode_code_page_errors(code_page, &v, 7100 s, chunk_size, 7101 errors, final); 7102 
assert(converted != 0 || done); 7103 7104 if (converted < 0) { 7105 Py_XDECREF(v); 7106 return NULL; 7107 } 7108 7109 if (consumed) 7110 *consumed += converted; 7111 7112 s += converted; 7113 size -= converted; 7114 } while (!done); 7115 7116 return unicode_result(v); 7117} 7118 7119PyObject * 7120PyUnicode_DecodeCodePageStateful(int code_page, 7121 const char *s, 7122 Py_ssize_t size, 7123 const char *errors, 7124 Py_ssize_t *consumed) 7125{ 7126 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7127} 7128 7129PyObject * 7130PyUnicode_DecodeMBCSStateful(const char *s, 7131 Py_ssize_t size, 7132 const char *errors, 7133 Py_ssize_t *consumed) 7134{ 7135 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7136} 7137 7138PyObject * 7139PyUnicode_DecodeMBCS(const char *s, 7140 Py_ssize_t size, 7141 const char *errors) 7142{ 7143 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7144} 7145 7146static DWORD 7147encode_code_page_flags(UINT code_page, const char *errors) 7148{ 7149 if (code_page == CP_UTF8) { 7150 if (winver.dwMajorVersion >= 6) 7151 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 7152 and later */ 7153 return WC_ERR_INVALID_CHARS; 7154 else 7155 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 7156 return 0; 7157 } 7158 else if (code_page == CP_UTF7) { 7159 /* CP_UTF7 only supports flags=0 */ 7160 return 0; 7161 } 7162 else { 7163 if (errors != NULL && strcmp(errors, "replace") == 0) 7164 return 0; 7165 else 7166 return WC_NO_BEST_FIT_CHARS; 7167 } 7168} 7169 7170/* 7171 * Encode a Unicode string to a Windows code page into a byte string in strict 7172 * mode. 7173 * 7174 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7175 * an OSError and returns -1 on other error. 
7176 */ 7177static int 7178encode_code_page_strict(UINT code_page, PyObject **outbytes, 7179 PyObject *unicode, Py_ssize_t offset, int len, 7180 const char* errors) 7181{ 7182 BOOL usedDefaultChar = FALSE; 7183 BOOL *pusedDefaultChar = &usedDefaultChar; 7184 int outsize; 7185 PyObject *exc = NULL; 7186 wchar_t *p; 7187 Py_ssize_t size; 7188 const DWORD flags = encode_code_page_flags(code_page, NULL); 7189 char *out; 7190 /* Create a substring so that we can get the UTF-16 representation 7191 of just the slice under consideration. */ 7192 PyObject *substring; 7193 7194 assert(len > 0); 7195 7196 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7197 pusedDefaultChar = &usedDefaultChar; 7198 else 7199 pusedDefaultChar = NULL; 7200 7201 substring = PyUnicode_Substring(unicode, offset, offset+len); 7202 if (substring == NULL) 7203 return -1; 7204 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7205 if (p == NULL) { 7206 Py_DECREF(substring); 7207 return -1; 7208 } 7209 assert(size <= INT_MAX); 7210 7211 /* First get the size of the result */ 7212 outsize = WideCharToMultiByte(code_page, flags, 7213 p, (int)size, 7214 NULL, 0, 7215 NULL, pusedDefaultChar); 7216 if (outsize <= 0) 7217 goto error; 7218 /* If we used a default char, then we failed! 
*/ 7219 if (pusedDefaultChar && *pusedDefaultChar) { 7220 Py_DECREF(substring); 7221 return -2; 7222 } 7223 7224 if (*outbytes == NULL) { 7225 /* Create string object */ 7226 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7227 if (*outbytes == NULL) { 7228 Py_DECREF(substring); 7229 return -1; 7230 } 7231 out = PyBytes_AS_STRING(*outbytes); 7232 } 7233 else { 7234 /* Extend string object */ 7235 const Py_ssize_t n = PyBytes_Size(*outbytes); 7236 if (outsize > PY_SSIZE_T_MAX - n) { 7237 PyErr_NoMemory(); 7238 Py_DECREF(substring); 7239 return -1; 7240 } 7241 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7242 Py_DECREF(substring); 7243 return -1; 7244 } 7245 out = PyBytes_AS_STRING(*outbytes) + n; 7246 } 7247 7248 /* Do the conversion */ 7249 outsize = WideCharToMultiByte(code_page, flags, 7250 p, (int)size, 7251 out, outsize, 7252 NULL, pusedDefaultChar); 7253 Py_CLEAR(substring); 7254 if (outsize <= 0) 7255 goto error; 7256 if (pusedDefaultChar && *pusedDefaultChar) 7257 return -2; 7258 return 0; 7259 7260error: 7261 Py_XDECREF(substring); 7262 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7263 return -2; 7264 PyErr_SetFromWindowsErr(0); 7265 return -1; 7266} 7267 7268/* 7269 * Encode a Unicode string to a Windows code page into a byte string using a 7270 * error handler. 7271 * 7272 * Returns consumed characters if succeed, or raise an OSError and returns 7273 * -1 on other error. 7274 */ 7275static int 7276encode_code_page_errors(UINT code_page, PyObject **outbytes, 7277 PyObject *unicode, Py_ssize_t unicode_offset, 7278 Py_ssize_t insize, const char* errors) 7279{ 7280 const DWORD flags = encode_code_page_flags(code_page, errors); 7281 Py_ssize_t pos = unicode_offset; 7282 Py_ssize_t endin = unicode_offset + insize; 7283 /* Ideally, we should get reason from FormatMessage. This is the Windows 7284 2000 English version of the message. 
*/ 7285 const char *reason = "invalid character"; 7286 /* 4=maximum length of a UTF-8 sequence */ 7287 char buffer[4]; 7288 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7289 Py_ssize_t outsize; 7290 char *out; 7291 PyObject *errorHandler = NULL; 7292 PyObject *exc = NULL; 7293 PyObject *encoding_obj = NULL; 7294 char *encoding; 7295 Py_ssize_t newpos, newoutsize; 7296 PyObject *rep; 7297 int ret = -1; 7298 7299 assert(insize > 0); 7300 7301 encoding = code_page_name(code_page, &encoding_obj); 7302 if (encoding == NULL) 7303 return -1; 7304 7305 if (errors == NULL || strcmp(errors, "strict") == 0) { 7306 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7307 then we raise a UnicodeEncodeError. */ 7308 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7309 if (exc != NULL) { 7310 PyCodec_StrictErrors(exc); 7311 Py_DECREF(exc); 7312 } 7313 Py_XDECREF(encoding_obj); 7314 return -1; 7315 } 7316 7317 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7318 pusedDefaultChar = &usedDefaultChar; 7319 else 7320 pusedDefaultChar = NULL; 7321 7322 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7323 PyErr_NoMemory(); 7324 goto error; 7325 } 7326 outsize = insize * Py_ARRAY_LENGTH(buffer); 7327 7328 if (*outbytes == NULL) { 7329 /* Create string object */ 7330 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7331 if (*outbytes == NULL) 7332 goto error; 7333 out = PyBytes_AS_STRING(*outbytes); 7334 } 7335 else { 7336 /* Extend string object */ 7337 Py_ssize_t n = PyBytes_Size(*outbytes); 7338 if (n > PY_SSIZE_T_MAX - outsize) { 7339 PyErr_NoMemory(); 7340 goto error; 7341 } 7342 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7343 goto error; 7344 out = PyBytes_AS_STRING(*outbytes) + n; 7345 } 7346 7347 /* Encode the string character per character */ 7348 while (pos < endin) 7349 { 7350 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7351 wchar_t chars[2]; 7352 int charsize; 7353 if (ch < 0x10000) { 7354 chars[0] = (wchar_t)ch; 7355 charsize = 1; 
7356 } 7357 else { 7358 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7359 chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7360 charsize = 2; 7361 } 7362 7363 outsize = WideCharToMultiByte(code_page, flags, 7364 chars, charsize, 7365 buffer, Py_ARRAY_LENGTH(buffer), 7366 NULL, pusedDefaultChar); 7367 if (outsize > 0) { 7368 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7369 { 7370 pos++; 7371 memcpy(out, buffer, outsize); 7372 out += outsize; 7373 continue; 7374 } 7375 } 7376 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7377 PyErr_SetFromWindowsErr(0); 7378 goto error; 7379 } 7380 7381 rep = unicode_encode_call_errorhandler( 7382 errors, &errorHandler, encoding, reason, 7383 unicode, &exc, 7384 pos, pos + 1, &newpos); 7385 if (rep == NULL) 7386 goto error; 7387 pos = newpos; 7388 7389 if (PyBytes_Check(rep)) { 7390 outsize = PyBytes_GET_SIZE(rep); 7391 if (outsize != 1) { 7392 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7393 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7394 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7395 Py_DECREF(rep); 7396 goto error; 7397 } 7398 out = PyBytes_AS_STRING(*outbytes) + offset; 7399 } 7400 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7401 out += outsize; 7402 } 7403 else { 7404 Py_ssize_t i; 7405 enum PyUnicode_Kind kind; 7406 void *data; 7407 7408 if (PyUnicode_READY(rep) == -1) { 7409 Py_DECREF(rep); 7410 goto error; 7411 } 7412 7413 outsize = PyUnicode_GET_LENGTH(rep); 7414 if (outsize != 1) { 7415 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7416 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7417 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7418 Py_DECREF(rep); 7419 goto error; 7420 } 7421 out = PyBytes_AS_STRING(*outbytes) + offset; 7422 } 7423 kind = PyUnicode_KIND(rep); 7424 data = PyUnicode_DATA(rep); 7425 for (i=0; i < outsize; i++) { 7426 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7427 if (ch > 127) { 7428 raise_encode_exception(&exc, 7429 encoding, 
unicode, 7430 pos, pos + 1, 7431 "unable to encode error handler result to ASCII"); 7432 Py_DECREF(rep); 7433 goto error; 7434 } 7435 *out = (unsigned char)ch; 7436 out++; 7437 } 7438 } 7439 Py_DECREF(rep); 7440 } 7441 /* write a NUL byte */ 7442 *out = 0; 7443 outsize = out - PyBytes_AS_STRING(*outbytes); 7444 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7445 if (_PyBytes_Resize(outbytes, outsize) < 0) 7446 goto error; 7447 ret = 0; 7448 7449error: 7450 Py_XDECREF(encoding_obj); 7451 Py_XDECREF(errorHandler); 7452 Py_XDECREF(exc); 7453 return ret; 7454} 7455 7456static PyObject * 7457encode_code_page(int code_page, 7458 PyObject *unicode, 7459 const char *errors) 7460{ 7461 Py_ssize_t len; 7462 PyObject *outbytes = NULL; 7463 Py_ssize_t offset; 7464 int chunk_len, ret, done; 7465 7466 if (PyUnicode_READY(unicode) == -1) 7467 return NULL; 7468 len = PyUnicode_GET_LENGTH(unicode); 7469 7470 if (code_page < 0) { 7471 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7472 return NULL; 7473 } 7474 7475 if (len == 0) 7476 return PyBytes_FromStringAndSize(NULL, 0); 7477 7478 offset = 0; 7479 do 7480 { 7481#ifdef NEED_RETRY 7482 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7483 chunks. 
*/ 7484 if (len > INT_MAX/2) { 7485 chunk_len = INT_MAX/2; 7486 done = 0; 7487 } 7488 else 7489#endif 7490 { 7491 chunk_len = (int)len; 7492 done = 1; 7493 } 7494 7495 ret = encode_code_page_strict(code_page, &outbytes, 7496 unicode, offset, chunk_len, 7497 errors); 7498 if (ret == -2) 7499 ret = encode_code_page_errors(code_page, &outbytes, 7500 unicode, offset, 7501 chunk_len, errors); 7502 if (ret < 0) { 7503 Py_XDECREF(outbytes); 7504 return NULL; 7505 } 7506 7507 offset += chunk_len; 7508 len -= chunk_len; 7509 } while (!done); 7510 7511 return outbytes; 7512} 7513 7514PyObject * 7515PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7516 Py_ssize_t size, 7517 const char *errors) 7518{ 7519 PyObject *unicode, *res; 7520 unicode = PyUnicode_FromUnicode(p, size); 7521 if (unicode == NULL) 7522 return NULL; 7523 res = encode_code_page(CP_ACP, unicode, errors); 7524 Py_DECREF(unicode); 7525 return res; 7526} 7527 7528PyObject * 7529PyUnicode_EncodeCodePage(int code_page, 7530 PyObject *unicode, 7531 const char *errors) 7532{ 7533 return encode_code_page(code_page, unicode, errors); 7534} 7535 7536PyObject * 7537PyUnicode_AsMBCSString(PyObject *unicode) 7538{ 7539 if (!PyUnicode_Check(unicode)) { 7540 PyErr_BadArgument(); 7541 return NULL; 7542 } 7543 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7544} 7545 7546#undef NEED_RETRY 7547 7548#endif /* HAVE_MBCS */ 7549 7550/* --- Character Mapping Codec -------------------------------------------- */ 7551 7552static int 7553charmap_decode_string(const char *s, 7554 Py_ssize_t size, 7555 PyObject *mapping, 7556 const char *errors, 7557 _PyUnicodeWriter *writer) 7558{ 7559 const char *starts = s; 7560 const char *e; 7561 Py_ssize_t startinpos, endinpos; 7562 PyObject *errorHandler = NULL, *exc = NULL; 7563 Py_ssize_t maplen; 7564 enum PyUnicode_Kind mapkind; 7565 void *mapdata; 7566 Py_UCS4 x; 7567 unsigned char ch; 7568 7569 if (PyUnicode_READY(mapping) == -1) 7570 return -1; 7571 7572 maplen = 
/* Decode the size bytes at s through a str mapping table and append the
   result to writer.

   A byte b is translated to mapping[b]; a byte beyond the end of mapping,
   or one that maps to U+FFFE, is undefined and is passed to the errors
   handler.

   Return 0 on success, -1 with an exception set on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* raise the writer's maxchar to 0xff, then reload the
                   cached maxchar and data pointer from the writer */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL here: nothing to release */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a complete (>= 256 entries) UCS2 table.
               They write directly into the writer's buffer and jump to
               Error, in the general code below, for the first character
               they cannot store that way. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Also reached by goto from the UCS2 loops above, with x and s
           already set for the current byte. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}

*/ 7700 PyErr_Clear(); 7701 goto Undefined; 7702 } else 7703 goto onError; 7704 } 7705 7706 /* Apply mapping */ 7707 if (item == Py_None) 7708 goto Undefined; 7709 if (PyLong_Check(item)) { 7710 long value = PyLong_AS_LONG(item); 7711 if (value == 0xFFFE) 7712 goto Undefined; 7713 if (value < 0 || value > MAX_UNICODE) { 7714 PyErr_Format(PyExc_TypeError, 7715 "character mapping must be in range(0x%lx)", 7716 (unsigned long)MAX_UNICODE + 1); 7717 goto onError; 7718 } 7719 7720 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7721 goto onError; 7722 } 7723 else if (PyUnicode_Check(item)) { 7724 if (PyUnicode_READY(item) == -1) 7725 goto onError; 7726 if (PyUnicode_GET_LENGTH(item) == 1) { 7727 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 7728 if (value == 0xFFFE) 7729 goto Undefined; 7730 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7731 goto onError; 7732 } 7733 else { 7734 writer->overallocate = 1; 7735 if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 7736 goto onError; 7737 } 7738 } 7739 else { 7740 /* wrong return value */ 7741 PyErr_SetString(PyExc_TypeError, 7742 "character mapping must return integer, None or str"); 7743 goto onError; 7744 } 7745 Py_CLEAR(item); 7746 ++s; 7747 continue; 7748 7749Undefined: 7750 /* undefined mapping */ 7751 Py_CLEAR(item); 7752 startinpos = s-starts; 7753 endinpos = startinpos+1; 7754 if (unicode_decode_call_errorhandler_writer( 7755 errors, &errorHandler, 7756 "charmap", "character maps to <undefined>", 7757 &starts, &e, &startinpos, &endinpos, &exc, &s, 7758 writer)) { 7759 goto onError; 7760 } 7761 } 7762 Py_XDECREF(errorHandler); 7763 Py_XDECREF(exc); 7764 return 0; 7765 7766onError: 7767 Py_XDECREF(item); 7768 Py_XDECREF(errorHandler); 7769 Py_XDECREF(exc); 7770 return -1; 7771} 7772 7773PyObject * 7774PyUnicode_DecodeCharmap(const char *s, 7775 Py_ssize_t size, 7776 PyObject *mapping, 7777 const char *errors) 7778{ 7779 _PyUnicodeWriter writer; 7780 7781 /* Default to Latin-1 */ 7782 if (mapping == 
NULL) 7783 return PyUnicode_DecodeLatin1(s, size, errors); 7784 7785 if (size == 0) 7786 _Py_RETURN_UNICODE_EMPTY(); 7787 _PyUnicodeWriter_Init(&writer); 7788 writer.min_length = size; 7789 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 7790 goto onError; 7791 7792 if (PyUnicode_CheckExact(mapping)) { 7793 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 7794 goto onError; 7795 } 7796 else { 7797 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 7798 goto onError; 7799 } 7800 return _PyUnicodeWriter_Finish(&writer); 7801 7802 onError: 7803 _PyUnicodeWriter_Dealloc(&writer); 7804 return NULL; 7805} 7806 7807/* Charmap encoding: the lookup table */ 7808 7809struct encoding_map { 7810 PyObject_HEAD 7811 unsigned char level1[32]; 7812 int count2, count3; 7813 unsigned char level23[1]; 7814}; 7815 7816static PyObject* 7817encoding_map_size(PyObject *obj, PyObject* args) 7818{ 7819 struct encoding_map *map = (struct encoding_map*)obj; 7820 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7821 128*map->count3); 7822} 7823 7824static PyMethodDef encoding_map_methods[] = { 7825 {"size", encoding_map_size, METH_NOARGS, 7826 PyDoc_STR("Return the size (in bytes) of this object") }, 7827 { 0 } 7828}; 7829 7830static void 7831encoding_map_dealloc(PyObject* o) 7832{ 7833 PyObject_FREE(o); 7834} 7835 7836static PyTypeObject EncodingMapType = { 7837 PyVarObject_HEAD_INIT(NULL, 0) 7838 "EncodingMap", /*tp_name*/ 7839 sizeof(struct encoding_map), /*tp_basicsize*/ 7840 0, /*tp_itemsize*/ 7841 /* methods */ 7842 encoding_map_dealloc, /*tp_dealloc*/ 7843 0, /*tp_print*/ 7844 0, /*tp_getattr*/ 7845 0, /*tp_setattr*/ 7846 0, /*tp_reserved*/ 7847 0, /*tp_repr*/ 7848 0, /*tp_as_number*/ 7849 0, /*tp_as_sequence*/ 7850 0, /*tp_as_mapping*/ 7851 0, /*tp_hash*/ 7852 0, /*tp_call*/ 7853 0, /*tp_str*/ 7854 0, /*tp_getattro*/ 7855 0, /*tp_setattro*/ 7856 0, /*tp_as_buffer*/ 7857 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7858 0, /*tp_doc*/ 
/* Type object of struct encoding_map.  Apart from the name, the basic size
   and the default flags, only tp_dealloc and tp_methods are filled in. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};

*/ 7910 if (PyUnicode_READ(kind, data, 0) != 0) 7911 need_dict = 1; 7912 for (i = 1; i < length; i++) { 7913 int l1, l2; 7914 ch = PyUnicode_READ(kind, data, i); 7915 if (ch == 0 || ch > 0xFFFF) { 7916 need_dict = 1; 7917 break; 7918 } 7919 if (ch == 0xFFFE) 7920 /* unmapped character */ 7921 continue; 7922 l1 = ch >> 11; 7923 l2 = ch >> 7; 7924 if (level1[l1] == 0xFF) 7925 level1[l1] = count2++; 7926 if (level2[l2] == 0xFF) 7927 level2[l2] = count3++; 7928 } 7929 7930 if (count2 >= 0xFF || count3 >= 0xFF) 7931 need_dict = 1; 7932 7933 if (need_dict) { 7934 PyObject *result = PyDict_New(); 7935 PyObject *key, *value; 7936 if (!result) 7937 return NULL; 7938 for (i = 0; i < length; i++) { 7939 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7940 value = PyLong_FromLong(i); 7941 if (!key || !value) 7942 goto failed1; 7943 if (PyDict_SetItem(result, key, value) == -1) 7944 goto failed1; 7945 Py_DECREF(key); 7946 Py_DECREF(value); 7947 } 7948 return result; 7949 failed1: 7950 Py_XDECREF(key); 7951 Py_XDECREF(value); 7952 Py_DECREF(result); 7953 return NULL; 7954 } 7955 7956 /* Create a three-level trie */ 7957 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7958 16*count2 + 128*count3 - 1); 7959 if (!result) 7960 return PyErr_NoMemory(); 7961 PyObject_Init(result, &EncodingMapType); 7962 mresult = (struct encoding_map*)result; 7963 mresult->count2 = count2; 7964 mresult->count3 = count3; 7965 mlevel1 = mresult->level1; 7966 mlevel2 = mresult->level23; 7967 mlevel3 = mresult->level23 + 16*count2; 7968 memcpy(mlevel1, level1, 32); 7969 memset(mlevel2, 0xFF, 16*count2); 7970 memset(mlevel3, 0, 128*count3); 7971 count3 = 0; 7972 for (i = 1; i < length; i++) { 7973 int o1, o2, o3, i2, i3; 7974 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7975 if (ch == 0xFFFE) 7976 /* unmapped character */ 7977 continue; 7978 o1 = ch>>11; 7979 o2 = (ch>>7) & 0xF; 7980 i2 = 16*mlevel1[o1] + o2; 7981 if (mlevel2[i2] == 0xFF) 7982 mlevel2[i2] = count3++; 7983 o3 = ch & 0x7F; 7984 
i3 = 128*mlevel2[i2] + o3; 7985 mlevel3[i3] = i; 7986 } 7987 return result; 7988} 7989 7990static int 7991encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7992{ 7993 struct encoding_map *map = (struct encoding_map*)mapping; 7994 int l1 = c>>11; 7995 int l2 = (c>>7) & 0xF; 7996 int l3 = c & 0x7F; 7997 int i; 7998 7999 if (c > 0xFFFF) 8000 return -1; 8001 if (c == 0) 8002 return 0; 8003 /* level 1*/ 8004 i = map->level1[l1]; 8005 if (i == 0xFF) { 8006 return -1; 8007 } 8008 /* level 2*/ 8009 i = map->level23[16*i+l2]; 8010 if (i == 0xFF) { 8011 return -1; 8012 } 8013 /* level 3 */ 8014 i = map->level23[16*map->count2 + 128*i + l3]; 8015 if (i == 0) { 8016 return -1; 8017 } 8018 return i; 8019} 8020 8021/* Lookup the character ch in the mapping. If the character 8022 can't be found, Py_None is returned (or NULL, if another 8023 error occurred). */ 8024static PyObject * 8025charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 8026{ 8027 PyObject *w = PyLong_FromLong((long)c); 8028 PyObject *x; 8029 8030 if (w == NULL) 8031 return NULL; 8032 x = PyObject_GetItem(mapping, w); 8033 Py_DECREF(w); 8034 if (x == NULL) { 8035 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8036 /* No mapping found means: mapping is undefined. 
*/ 8037 PyErr_Clear(); 8038 x = Py_None; 8039 Py_INCREF(x); 8040 return x; 8041 } else 8042 return NULL; 8043 } 8044 else if (x == Py_None) 8045 return x; 8046 else if (PyLong_Check(x)) { 8047 long value = PyLong_AS_LONG(x); 8048 if (value < 0 || value > 255) { 8049 PyErr_SetString(PyExc_TypeError, 8050 "character mapping must be in range(256)"); 8051 Py_DECREF(x); 8052 return NULL; 8053 } 8054 return x; 8055 } 8056 else if (PyBytes_Check(x)) 8057 return x; 8058 else { 8059 /* wrong return value */ 8060 PyErr_Format(PyExc_TypeError, 8061 "character mapping must return integer, bytes or None, not %.400s", 8062 x->ob_type->tp_name); 8063 Py_DECREF(x); 8064 return NULL; 8065 } 8066} 8067 8068static int 8069charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8070{ 8071 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8072 /* exponentially overallocate to minimize reallocations */ 8073 if (requiredsize < 2*outsize) 8074 requiredsize = 2*outsize; 8075 if (_PyBytes_Resize(outobj, requiredsize)) 8076 return -1; 8077 return 0; 8078} 8079 8080typedef enum charmapencode_result { 8081 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8082} charmapencode_result; 8083/* lookup the character, put the result in the output string and adjust 8084 various state variables. Resize the output bytes object if not enough 8085 space is available. Return a new reference to the object that 8086 was put in the output buffer, or Py_None, if the mapping was undefined 8087 (in which case no character was written) or NULL, if a 8088 reallocation error occurred. 
The caller must decref the result */ 8089static charmapencode_result 8090charmapencode_output(Py_UCS4 c, PyObject *mapping, 8091 PyObject **outobj, Py_ssize_t *outpos) 8092{ 8093 PyObject *rep; 8094 char *outstart; 8095 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8096 8097 if (Py_TYPE(mapping) == &EncodingMapType) { 8098 int res = encoding_map_lookup(c, mapping); 8099 Py_ssize_t requiredsize = *outpos+1; 8100 if (res == -1) 8101 return enc_FAILED; 8102 if (outsize<requiredsize) 8103 if (charmapencode_resize(outobj, outpos, requiredsize)) 8104 return enc_EXCEPTION; 8105 outstart = PyBytes_AS_STRING(*outobj); 8106 outstart[(*outpos)++] = (char)res; 8107 return enc_SUCCESS; 8108 } 8109 8110 rep = charmapencode_lookup(c, mapping); 8111 if (rep==NULL) 8112 return enc_EXCEPTION; 8113 else if (rep==Py_None) { 8114 Py_DECREF(rep); 8115 return enc_FAILED; 8116 } else { 8117 if (PyLong_Check(rep)) { 8118 Py_ssize_t requiredsize = *outpos+1; 8119 if (outsize<requiredsize) 8120 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8121 Py_DECREF(rep); 8122 return enc_EXCEPTION; 8123 } 8124 outstart = PyBytes_AS_STRING(*outobj); 8125 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8126 } 8127 else { 8128 const char *repchars = PyBytes_AS_STRING(rep); 8129 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8130 Py_ssize_t requiredsize = *outpos+repsize; 8131 if (outsize<requiredsize) 8132 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8133 Py_DECREF(rep); 8134 return enc_EXCEPTION; 8135 } 8136 outstart = PyBytes_AS_STRING(*outobj); 8137 memcpy(outstart + *outpos, repchars, repsize); 8138 *outpos += repsize; 8139 } 8140 } 8141 Py_DECREF(rep); 8142 return enc_SUCCESS; 8143} 8144 8145/* handle an error in PyUnicode_EncodeCharmap 8146 Return 0 on success, -1 on error */ 8147static int 8148charmap_encoding_error( 8149 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8150 PyObject **exceptionObject, 8151 int *known_errorHandler, PyObject **errorHandler, const 
/* handle an error in PyUnicode_EncodeCharmap

   On entry *inpos is the index of a character of unicode that mapping
   cannot encode.  The run of consecutive unencodable characters starting
   there is collected and handled according to errors; any replacement is
   appended to *res at *respos.  On success *inpos is updated to the
   position where encoding must resume.

   *known_errorHandler caches the decoded handler name between calls
   (-1=not initialized, 0=other, 1=strict, 2=replace, 3=ignore,
   4=xmlcharrefreplace).

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            /* first encodable character: end of the run */
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*known_errorHandler==-1) {
        if ((errors==NULL) || (!strcmp(errors, "strict")))
            *known_errorHandler = 1;
        else if (!strcmp(errors, "replace"))
            *known_errorHandler = 2;
        else if (!strcmp(errors, "ignore"))
            *known_errorHandler = 3;
        else if (!strcmp(errors, "xmlcharrefreplace"))
            *known_errorHandler = 4;
        else
            *known_errorHandler = 0;
    }
    switch (*known_errorHandler) {
    case 1: /* strict */
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;
    case 2: /* replace */
        /* one '?' per unencodable character; '?' itself goes through the
           mapping and it is an error if the mapping cannot encode it */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case 3: /* ignore */
        *inpos = collendpos;
        break;
    case 4: /* xmlcharrefreplace */
        /* write each collected character as a decimal numeric character
           reference ("&#N;"); every character of the reference is itself
           encoded through the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;
    default:
        /* any other handler: call it; it returns the replacement (bytes
           or str) and the position where encoding resumes */
        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: a str result is encoded character by
           character through the mapping */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}

PyUnicode_GET_LENGTH(unicode); 8323 data = PyUnicode_DATA(unicode); 8324 kind = PyUnicode_KIND(unicode); 8325 8326 /* Default to Latin-1 */ 8327 if (mapping == NULL) 8328 return unicode_encode_ucs1(unicode, errors, 256); 8329 8330 /* allocate enough for a simple encoding without 8331 replacements, if we need more, we'll resize */ 8332 res = PyBytes_FromStringAndSize(NULL, size); 8333 if (res == NULL) 8334 goto onError; 8335 if (size == 0) 8336 return res; 8337 8338 while (inpos<size) { 8339 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8340 /* try to encode it */ 8341 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8342 if (x==enc_EXCEPTION) /* error */ 8343 goto onError; 8344 if (x==enc_FAILED) { /* unencodable character */ 8345 if (charmap_encoding_error(unicode, &inpos, mapping, 8346 &exc, 8347 &known_errorHandler, &errorHandler, errors, 8348 &res, &respos)) { 8349 goto onError; 8350 } 8351 } 8352 else 8353 /* done with this character => adjust input position */ 8354 ++inpos; 8355 } 8356 8357 /* Resize if we allocated to much */ 8358 if (respos<PyBytes_GET_SIZE(res)) 8359 if (_PyBytes_Resize(&res, respos) < 0) 8360 goto onError; 8361 8362 Py_XDECREF(exc); 8363 Py_XDECREF(errorHandler); 8364 return res; 8365 8366 onError: 8367 Py_XDECREF(res); 8368 Py_XDECREF(exc); 8369 Py_XDECREF(errorHandler); 8370 return NULL; 8371} 8372 8373/* Deprecated */ 8374PyObject * 8375PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8376 Py_ssize_t size, 8377 PyObject *mapping, 8378 const char *errors) 8379{ 8380 PyObject *result; 8381 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8382 if (unicode == NULL) 8383 return NULL; 8384 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8385 Py_DECREF(unicode); 8386 return result; 8387} 8388 8389PyObject * 8390PyUnicode_AsCharmapString(PyObject *unicode, 8391 PyObject *mapping) 8392{ 8393 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8394 PyErr_BadArgument(); 8395 return NULL; 8396 } 8397 return 
_PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8398} 8399 8400/* create or adjust a UnicodeTranslateError */ 8401static void 8402make_translate_exception(PyObject **exceptionObject, 8403 PyObject *unicode, 8404 Py_ssize_t startpos, Py_ssize_t endpos, 8405 const char *reason) 8406{ 8407 if (*exceptionObject == NULL) { 8408 *exceptionObject = _PyUnicodeTranslateError_Create( 8409 unicode, startpos, endpos, reason); 8410 } 8411 else { 8412 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8413 goto onError; 8414 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8415 goto onError; 8416 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8417 goto onError; 8418 return; 8419 onError: 8420 Py_CLEAR(*exceptionObject); 8421 } 8422} 8423 8424/* error handling callback helper: 8425 build arguments, call the callback and check the arguments, 8426 put the result into newpos and return the replacement string, which 8427 has to be freed by the caller */ 8428static PyObject * 8429unicode_translate_call_errorhandler(const char *errors, 8430 PyObject **errorHandler, 8431 const char *reason, 8432 PyObject *unicode, PyObject **exceptionObject, 8433 Py_ssize_t startpos, Py_ssize_t endpos, 8434 Py_ssize_t *newpos) 8435{ 8436 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8437 8438 Py_ssize_t i_newpos; 8439 PyObject *restuple; 8440 PyObject *resunicode; 8441 8442 if (*errorHandler == NULL) { 8443 *errorHandler = PyCodec_LookupError(errors); 8444 if (*errorHandler == NULL) 8445 return NULL; 8446 } 8447 8448 make_translate_exception(exceptionObject, 8449 unicode, startpos, endpos, reason); 8450 if (*exceptionObject == NULL) 8451 return NULL; 8452 8453 restuple = PyObject_CallFunctionObjArgs( 8454 *errorHandler, *exceptionObject, NULL); 8455 if (restuple == NULL) 8456 return NULL; 8457 if (!PyTuple_Check(restuple)) { 8458 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8459 Py_DECREF(restuple); 8460 return 
NULL; 8461 } 8462 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8463 &resunicode, &i_newpos)) { 8464 Py_DECREF(restuple); 8465 return NULL; 8466 } 8467 if (i_newpos<0) 8468 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8469 else 8470 *newpos = i_newpos; 8471 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8472 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8473 Py_DECREF(restuple); 8474 return NULL; 8475 } 8476 Py_INCREF(resunicode); 8477 Py_DECREF(restuple); 8478 return resunicode; 8479} 8480 8481/* Lookup the character ch in the mapping and put the result in result, 8482 which must be decrefed by the caller. 8483 Return 0 on success, -1 on error */ 8484static int 8485charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8486{ 8487 PyObject *w = PyLong_FromLong((long)c); 8488 PyObject *x; 8489 8490 if (w == NULL) 8491 return -1; 8492 x = PyObject_GetItem(mapping, w); 8493 Py_DECREF(w); 8494 if (x == NULL) { 8495 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8496 /* No mapping found means: use 1:1 mapping. */ 8497 PyErr_Clear(); 8498 *result = NULL; 8499 return 0; 8500 } else 8501 return -1; 8502 } 8503 else if (x == Py_None) { 8504 *result = x; 8505 return 0; 8506 } 8507 else if (PyLong_Check(x)) { 8508 long value = PyLong_AS_LONG(x); 8509 if (value < 0 || value > MAX_UNICODE) { 8510 PyErr_Format(PyExc_ValueError, 8511 "character mapping must be in range(0x%x)", 8512 MAX_UNICODE+1); 8513 Py_DECREF(x); 8514 return -1; 8515 } 8516 *result = x; 8517 return 0; 8518 } 8519 else if (PyUnicode_Check(x)) { 8520 *result = x; 8521 return 0; 8522 } 8523 else { 8524 /* wrong return value */ 8525 PyErr_SetString(PyExc_TypeError, 8526 "character mapping must return integer, None or str"); 8527 Py_DECREF(x); 8528 return -1; 8529 } 8530} 8531 8532/* lookup the character, write the result into the writer. 
8533 Return 1 if the result was written into the writer, return 0 if the mapping 8534 was undefined, raise an exception return -1 on error. */ 8535static int 8536charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, 8537 _PyUnicodeWriter *writer) 8538{ 8539 PyObject *item; 8540 8541 if (charmaptranslate_lookup(ch, mapping, &item)) 8542 return -1; 8543 8544 if (item == NULL) { 8545 /* not found => default to 1:1 mapping */ 8546 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8547 return -1; 8548 } 8549 return 1; 8550 } 8551 8552 if (item == Py_None) { 8553 Py_DECREF(item); 8554 return 0; 8555 } 8556 8557 if (PyLong_Check(item)) { 8558 long ch = (Py_UCS4)PyLong_AS_LONG(item); 8559 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8560 used it */ 8561 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8562 Py_DECREF(item); 8563 return -1; 8564 } 8565 Py_DECREF(item); 8566 return 1; 8567 } 8568 8569 if (!PyUnicode_Check(item)) { 8570 Py_DECREF(item); 8571 return -1; 8572 } 8573 8574 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { 8575 Py_DECREF(item); 8576 return -1; 8577 } 8578 8579 Py_DECREF(item); 8580 return 1; 8581} 8582 8583static int 8584unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, 8585 Py_UCS1 *translate) 8586{ 8587 PyObject *item = NULL; 8588 int ret = 0; 8589 8590 if (charmaptranslate_lookup(ch, mapping, &item)) { 8591 return -1; 8592 } 8593 8594 if (item == Py_None) { 8595 /* deletion */ 8596 translate[ch] = 0xfe; 8597 } 8598 else if (item == NULL) { 8599 /* not found => default to 1:1 mapping */ 8600 translate[ch] = ch; 8601 return 1; 8602 } 8603 else if (PyLong_Check(item)) { 8604 long replace = PyLong_AS_LONG(item); 8605 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8606 used it */ 8607 if (127 < replace) { 8608 /* invalid character or character outside ASCII: 8609 skip the fast translate */ 8610 goto exit; 8611 } 8612 translate[ch] = (Py_UCS1)replace; 8613 } 8614 else if 
(PyUnicode_Check(item)) { 8615 Py_UCS4 replace; 8616 8617 if (PyUnicode_READY(item) == -1) { 8618 Py_DECREF(item); 8619 return -1; 8620 } 8621 if (PyUnicode_GET_LENGTH(item) != 1) 8622 goto exit; 8623 8624 replace = PyUnicode_READ_CHAR(item, 0); 8625 if (replace > 127) 8626 goto exit; 8627 translate[ch] = (Py_UCS1)replace; 8628 } 8629 else { 8630 /* not None, NULL, long or unicode */ 8631 goto exit; 8632 } 8633 ret = 1; 8634 8635 exit: 8636 Py_DECREF(item); 8637 return ret; 8638} 8639 8640/* Fast path for ascii => ascii translation. Return 1 if the whole string 8641 was translated into writer, return 0 if the input string was partially 8642 translated into writer, raise an exception and return -1 on error. */ 8643static int 8644unicode_fast_translate(PyObject *input, PyObject *mapping, 8645 _PyUnicodeWriter *writer, int ignore) 8646{ 8647 Py_UCS1 ascii_table[128], ch, ch2; 8648 Py_ssize_t len; 8649 Py_UCS1 *in, *end, *out; 8650 int res = 0; 8651 8652 if (PyUnicode_READY(input) == -1) 8653 return -1; 8654 if (!PyUnicode_IS_ASCII(input)) 8655 return 0; 8656 len = PyUnicode_GET_LENGTH(input); 8657 8658 memset(ascii_table, 0xff, 128); 8659 8660 in = PyUnicode_1BYTE_DATA(input); 8661 end = in + len; 8662 8663 assert(PyUnicode_IS_ASCII(writer->buffer)); 8664 assert(PyUnicode_GET_LENGTH(writer->buffer) == len); 8665 out = PyUnicode_1BYTE_DATA(writer->buffer); 8666 8667 for (; in < end; in++) { 8668 ch = *in; 8669 ch2 = ascii_table[ch]; 8670 if (ch2 == 0xff) { 8671 int translate = unicode_fast_translate_lookup(mapping, ch, 8672 ascii_table); 8673 if (translate < 0) 8674 return -1; 8675 if (translate == 0) 8676 goto exit; 8677 ch2 = ascii_table[ch]; 8678 } 8679 if (ch2 == 0xfe) { 8680 if (ignore) 8681 continue; 8682 goto exit; 8683 } 8684 assert(ch2 < 128); 8685 *out = ch2; 8686 out++; 8687 } 8688 res = 1; 8689 8690exit: 8691 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer); 8692 return res; 8693} 8694 8695PyObject * 8696_PyUnicode_TranslateCharmap(PyObject *input, 
8697 PyObject *mapping, 8698 const char *errors) 8699{ 8700 /* input object */ 8701 char *data; 8702 Py_ssize_t size, i; 8703 int kind; 8704 /* output buffer */ 8705 _PyUnicodeWriter writer; 8706 /* error handler */ 8707 char *reason = "character maps to <undefined>"; 8708 PyObject *errorHandler = NULL; 8709 PyObject *exc = NULL; 8710 int ignore; 8711 int res; 8712 8713 if (mapping == NULL) { 8714 PyErr_BadArgument(); 8715 return NULL; 8716 } 8717 8718 if (PyUnicode_READY(input) == -1) 8719 return NULL; 8720 data = (char*)PyUnicode_DATA(input); 8721 kind = PyUnicode_KIND(input); 8722 size = PyUnicode_GET_LENGTH(input); 8723 8724 if (size == 0) { 8725 Py_INCREF(input); 8726 return input; 8727 } 8728 8729 /* allocate enough for a simple 1:1 translation without 8730 replacements, if we need more, we'll resize */ 8731 _PyUnicodeWriter_Init(&writer); 8732 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 8733 goto onError; 8734 8735 ignore = (errors != NULL && strcmp(errors, "ignore") == 0); 8736 8737 res = unicode_fast_translate(input, mapping, &writer, ignore); 8738 if (res < 0) { 8739 _PyUnicodeWriter_Dealloc(&writer); 8740 return NULL; 8741 } 8742 if (res == 1) 8743 return _PyUnicodeWriter_Finish(&writer); 8744 8745 i = writer.pos; 8746 while (i<size) { 8747 /* try to encode it */ 8748 int translate; 8749 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8750 Py_ssize_t newpos; 8751 /* startpos for collecting untranslatable chars */ 8752 Py_ssize_t collstart; 8753 Py_ssize_t collend; 8754 Py_UCS4 ch; 8755 8756 ch = PyUnicode_READ(kind, data, i); 8757 translate = charmaptranslate_output(ch, mapping, &writer); 8758 if (translate < 0) 8759 goto onError; 8760 8761 if (translate != 0) { 8762 /* it worked => adjust input pointer */ 8763 ++i; 8764 continue; 8765 } 8766 8767 /* untranslatable character */ 8768 collstart = i; 8769 collend = i+1; 8770 8771 /* find all untranslatable characters */ 8772 while (collend < size) { 8773 PyObject *x; 8774 ch 
= PyUnicode_READ(kind, data, collend); 8775 if (charmaptranslate_lookup(ch, mapping, &x)) 8776 goto onError; 8777 Py_XDECREF(x); 8778 if (x != Py_None) 8779 break; 8780 ++collend; 8781 } 8782 8783 if (ignore) { 8784 i = collend; 8785 } 8786 else { 8787 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8788 reason, input, &exc, 8789 collstart, collend, &newpos); 8790 if (repunicode == NULL) 8791 goto onError; 8792 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { 8793 Py_DECREF(repunicode); 8794 goto onError; 8795 } 8796 Py_DECREF(repunicode); 8797 i = newpos; 8798 } 8799 } 8800 Py_XDECREF(exc); 8801 Py_XDECREF(errorHandler); 8802 return _PyUnicodeWriter_Finish(&writer); 8803 8804 onError: 8805 _PyUnicodeWriter_Dealloc(&writer); 8806 Py_XDECREF(exc); 8807 Py_XDECREF(errorHandler); 8808 return NULL; 8809} 8810 8811/* Deprecated. Use PyUnicode_Translate instead. */ 8812PyObject * 8813PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8814 Py_ssize_t size, 8815 PyObject *mapping, 8816 const char *errors) 8817{ 8818 PyObject *result; 8819 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8820 if (!unicode) 8821 return NULL; 8822 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8823 Py_DECREF(unicode); 8824 return result; 8825} 8826 8827PyObject * 8828PyUnicode_Translate(PyObject *str, 8829 PyObject *mapping, 8830 const char *errors) 8831{ 8832 PyObject *result; 8833 8834 str = PyUnicode_FromObject(str); 8835 if (str == NULL) 8836 return NULL; 8837 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8838 Py_DECREF(str); 8839 return result; 8840} 8841 8842static Py_UCS4 8843fix_decimal_and_space_to_ascii(PyObject *self) 8844{ 8845 /* No need to call PyUnicode_READY(self) because this function is only 8846 called as a callback from fixup() which does it already. 
*/
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    void *data = PyUnicode_DATA(self);
    Py_UCS4 maxchar = 127, ch, fixed;
    int modified = 0;
    Py_ssize_t i;

    /* Rewrite `self` in place: non-ASCII whitespace becomes ' ' and
       non-ASCII decimal digits become '0'..'9'.  maxchar tracks the largest
       code point left in the string (never below 127). */
    for (i = 0; i < len; ++i) {
        ch = PyUnicode_READ(kind, data, i);
        fixed = 0;
        if (ch > 127) {
            if (Py_UNICODE_ISSPACE(ch))
                fixed = ' ';
            else {
                const int decimal = Py_UNICODE_TODECIMAL(ch);
                if (decimal >= 0)
                    fixed = '0' + decimal;
            }
            if (fixed != 0) {
                modified = 1;
                maxchar = Py_MAX(maxchar, fixed);
                PyUnicode_WRITE(kind, data, i, fixed);
            }
            else
                maxchar = Py_MAX(maxchar, ch);
        }
    }

    /* 0 tells fixup() that nothing was changed */
    return (modified) ? maxchar : 0;
}

/* Return a string equal to `unicode` with non-ASCII whitespace replaced by
   ' ' and non-ASCII decimal digits replaced by their ASCII digit.
   A string whose maximum character is <= 127 is returned as-is (new
   reference).  Returns NULL with an exception set on error. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
        /* If the string is already ASCII, just return the same string */
        Py_INCREF(unicode);
        return unicode;
    }
    return fixup(unicode, fix_decimal_and_space_to_ascii);
}

/* Build a new string from the Py_UNICODE buffer `s` of `length` items in
   which every non-ASCII decimal digit is replaced by its ASCII digit.
   Two passes: the first computes the maximum character of the result, the
   second writes the characters.  Returns NULL if the allocation fails. */
PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
                                  Py_ssize_t length)
{
    PyObject *decimal;
    Py_ssize_t i;
    Py_UCS4 maxchar;
    enum PyUnicode_Kind kind;
    void *data;

    maxchar = 127;
    for (i = 0; i < length; i++) {
        Py_UCS4 ch = s[i];
        if (ch > 127) {
            /* NOTE: this int deliberately-or-not shadows the outer
               PyObject *decimal inside this scope */
            int decimal = Py_UNICODE_TODECIMAL(ch);
            if (decimal >= 0)
                ch = '0' + decimal;
            maxchar = Py_MAX(maxchar, ch);
        }
    }

    /* Copy to a new string */
    decimal = PyUnicode_New(length, maxchar);
    if (decimal == NULL)
        return decimal;
    kind = PyUnicode_KIND(decimal);
    data = PyUnicode_DATA(decimal);
    /* Iterate over code points */
    for (i = 0; i < length; i++) {
        Py_UCS4 ch = s[i];
        if (ch > 127) {
            int decimal = Py_UNICODE_TODECIMAL(ch);
            if (decimal >= 0)
                ch = '0' + decimal;
        }
        PyUnicode_WRITE(kind, data, i, ch);
    }
    return unicode_result(decimal);
}
/* --- Decimal Encoder ---------------------------------------------------- */

/* Encode the Py_UNICODE buffer `s` of `length` items into `output` as a
   0-terminated byte string: whitespace becomes ' ', decimal digits become
   '0'..'9', other code points in 1..255 are copied as a single byte.
   Any other character raises an encode exception through
   raise_encode_exception() for the "decimal" codec.
   Writes at most length+1 bytes (one per input item plus the terminator);
   the size of `output` is not checked here.
   Returns 0 on success, -1 with an exception set on error. */
int
PyUnicode_EncodeDecimal(Py_UNICODE *s,
                        Py_ssize_t length,
                        char *output,
                        const char *errors)
{
    PyObject *unicode;
    Py_ssize_t i;
    enum PyUnicode_Kind kind;
    void *data;

    if (output == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    unicode = PyUnicode_FromUnicode(s, length);
    if (unicode == NULL)
        return -1;

    if (PyUnicode_READY(unicode) == -1) {
        Py_DECREF(unicode);
        return -1;
    }
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);

    for (i=0; i < length; ) {
        PyObject *exc;
        Py_UCS4 ch;
        int decimal;
        Py_ssize_t startpos;

        ch = PyUnicode_READ(kind, data, i);

        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
            i++;
            continue;
        }
        decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            *output++ = '0' + decimal;
            i++;
            continue;
        }
        if (0 < ch && ch < 256) {
            *output++ = (char)ch;
            i++;
            continue;
        }

        /* `errors` is not consulted: the first unencodable character
           always raises */
        startpos = i;
        exc = NULL;
        raise_encode_exception(&exc, "decimal", unicode,
                               startpos, startpos+1,
                               "invalid decimal Unicode string");
        Py_XDECREF(exc);
        Py_DECREF(unicode);
        return -1;
    }
    /* 0-terminate the output string */
    *output++ = '\0';
    Py_DECREF(unicode);
    return 0;
}

/* --- Helpers ------------------------------------------------------------ */

/* Search s2 in s1[start:end]; forward if direction > 0, backward otherwise.
   Both strings are brought to the wider of their two kinds before the
   kind-specific stringlib routine is called.
   Returns the routine's result, or -2 if a conversion buffer could not be
   obtained. */
static Py_ssize_t
any_find_slice(int direction, PyObject* s1, PyObject* s2,
               Py_ssize_t start,
               Py_ssize_t end)
{
    int kind1, kind2, kind;
    void *buf1, *buf2;
    Py_ssize_t len1, len2,
result;

    kind1 = PyUnicode_KIND(s1);
    kind2 = PyUnicode_KIND(s2);
    kind = kind1 > kind2 ? kind1 : kind2;
    buf1 = PyUnicode_DATA(s1);
    buf2 = PyUnicode_DATA(s2);
    /* widen whichever string is narrower; the widened copy is a temporary
       buffer that is freed before returning */
    if (kind1 != kind)
        buf1 = _PyUnicode_AsKind(s1, kind);
    if (!buf1)
        return -2;
    if (kind2 != kind)
        buf2 = _PyUnicode_AsKind(s2, kind);
    if (!buf2) {
        if (kind1 != kind) PyMem_Free(buf1);
        return -2;
    }
    len1 = PyUnicode_GET_LENGTH(s1);
    len2 = PyUnicode_GET_LENGTH(s2);

    if (direction > 0) {
        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
            else
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
            break;
        case PyUnicode_2BYTE_KIND:
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
            break;
        case PyUnicode_4BYTE_KIND:
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
            break;
        default:
            assert(0); result = -2;
        }
    }
    else {
        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
            else
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
            break;
        case PyUnicode_2BYTE_KIND:
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
            break;
        case PyUnicode_4BYTE_KIND:
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
            break;
        default:
            assert(0); result = -2;
        }
    }

    if (kind1 != kind)
        PyMem_Free(buf1);
    if (kind2 != kind)
        PyMem_Free(buf2);

    return result;
}

/* Dispatch to the kind-specific *_InsertThousandsGrouping routine.

   `unicode` may be NULL; in that case a NULL output buffer and the 1-byte
   (non-ASCII) routine are used and *maxchar is set: 127, raised to the
   maximum character of `thousands_sep` when the returned length differs
   from n_digits.  *maxchar is only written when `unicode` is NULL.
   Otherwise output starts at character `index` of `unicode`.
   Returns the value computed by the stringlib routine, or -1 if a kind
   conversion buffer could not be obtained. */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    PyObject *unicode, Py_ssize_t index,
    Py_ssize_t n_buffer,
    void *digits, Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping, PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    unsigned int kind, thousands_sep_kind;
    char *data, *thousands_sep_data;
    Py_ssize_t thousands_sep_len;
    Py_ssize_t len;

    if (unicode != NULL) {
        kind = PyUnicode_KIND(unicode);
        /* the kind value is used as the per-character byte width here */
        data = (char *) PyUnicode_DATA(unicode) + index * kind;
    }
    else {
        kind = PyUnicode_1BYTE_KIND;
        data = NULL;
    }
    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    thousands_sep_data = PyUnicode_DATA(thousands_sep);
    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind) {
            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
            if (!thousands_sep_data)
                return -1;
        }
        else {
            /* NOTE(review): here `data` becomes a widened temporary copy of
               the whole string (the `index` offset is not re-applied), yet
               the switch below still dispatches on the original `kind` and
               the copy is freed afterwards.  This looks inconsistent --
               confirm whether this branch is reachable and what it should
               do. */
            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
            if (!data)
                return -1;
        }
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
            len = asciilib_InsertThousandsGrouping(
                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        else
            len = ucs1lib_InsertThousandsGrouping(
                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_2BYTE_KIND:
        len = ucs2lib_InsertThousandsGrouping(
            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        len = ucs4lib_InsertThousandsGrouping(
            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
        break;
    default:
        assert(0);
        return -1;
    }
    /* release whichever temporary conversion buffer was created above */
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind)
            PyMem_Free(thousands_sep_data);
        else
            PyMem_Free(data);
    }
    if (unicode == NULL) {
        *maxchar = 127;
        if (len != n_digits) {
            *maxchar = Py_MAX(*maxchar,
                              PyUnicode_MAX_CHAR_VALUE(thousands_sep));
        }
    }
    return len;
}


/* helper macro to fixup start/end slice values:
   `end` is clamped to `len`; negative `start`/`end` are taken relative to
   `len` and clamped to 0.  `start` is NOT clamped to `len`, and the result
   may have start > end.  The macro expands to bare statements and evaluates
   its arguments several times, so pass plain lvalues only. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }

/* Count occurrences of `substr` in str[start:end] using the kind-specific
   stringlib count routine.
   Both arguments are converted with PyUnicode_FromObject.  A substring of a
   wider kind than the string cannot occur in it, so 0 is returned early.
   Returns -1 with an exception set on error. */
Py_ssize_t
PyUnicode_Count(PyObject *str,
                PyObject *substr,
                Py_ssize_t start,
                Py_ssize_t end)
{
    Py_ssize_t result;
    PyObject* str_obj;
    PyObject* sub_obj;
    int kind1, kind2, kind;
    void *buf1 = NULL, *buf2 = NULL;
    Py_ssize_t len1, len2;

    str_obj = PyUnicode_FromObject(str);
    if (!str_obj)
        return -1;
    sub_obj = PyUnicode_FromObject(substr);
    if (!sub_obj) {
        Py_DECREF(str_obj);
        return -1;
    }
    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
        Py_DECREF(sub_obj);
        Py_DECREF(str_obj);
        return -1;
    }

    kind1 = PyUnicode_KIND(str_obj);
    kind2 = PyUnicode_KIND(sub_obj);
    kind = kind1;
    buf1 = PyUnicode_DATA(str_obj);
    buf2 = PyUnicode_DATA(sub_obj);
    if (kind2 != kind) {
        if (kind2 > kind) {
            Py_DECREF(sub_obj);
            Py_DECREF(str_obj);
            return 0;
        }
        /* widen the substring to the string's kind (temporary buffer) */
        buf2 = _PyUnicode_AsKind(sub_obj, kind);
    }
    if (!buf2)
        goto onError;
    len1 = PyUnicode_GET_LENGTH(str_obj);
    len2 = PyUnicode_GET_LENGTH(sub_obj);

    ADJUST_INDICES(start, end, len1);
    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
            result = asciilib_count(
((Py_UCS1*)buf1) + start, end - start, 9227 buf2, len2, PY_SSIZE_T_MAX 9228 ); 9229 else 9230 result = ucs1lib_count( 9231 ((Py_UCS1*)buf1) + start, end - start, 9232 buf2, len2, PY_SSIZE_T_MAX 9233 ); 9234 break; 9235 case PyUnicode_2BYTE_KIND: 9236 result = ucs2lib_count( 9237 ((Py_UCS2*)buf1) + start, end - start, 9238 buf2, len2, PY_SSIZE_T_MAX 9239 ); 9240 break; 9241 case PyUnicode_4BYTE_KIND: 9242 result = ucs4lib_count( 9243 ((Py_UCS4*)buf1) + start, end - start, 9244 buf2, len2, PY_SSIZE_T_MAX 9245 ); 9246 break; 9247 default: 9248 assert(0); result = 0; 9249 } 9250 9251 Py_DECREF(sub_obj); 9252 Py_DECREF(str_obj); 9253 9254 if (kind2 != kind) 9255 PyMem_Free(buf2); 9256 9257 return result; 9258 onError: 9259 Py_DECREF(sub_obj); 9260 Py_DECREF(str_obj); 9261 if (kind2 != kind && buf2) 9262 PyMem_Free(buf2); 9263 return -1; 9264} 9265 9266Py_ssize_t 9267PyUnicode_Find(PyObject *str, 9268 PyObject *sub, 9269 Py_ssize_t start, 9270 Py_ssize_t end, 9271 int direction) 9272{ 9273 Py_ssize_t result; 9274 9275 str = PyUnicode_FromObject(str); 9276 if (!str) 9277 return -2; 9278 sub = PyUnicode_FromObject(sub); 9279 if (!sub) { 9280 Py_DECREF(str); 9281 return -2; 9282 } 9283 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 9284 Py_DECREF(sub); 9285 Py_DECREF(str); 9286 return -2; 9287 } 9288 9289 result = any_find_slice(direction, 9290 str, sub, start, end 9291 ); 9292 9293 Py_DECREF(str); 9294 Py_DECREF(sub); 9295 9296 return result; 9297} 9298 9299Py_ssize_t 9300PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9301 Py_ssize_t start, Py_ssize_t end, 9302 int direction) 9303{ 9304 int kind; 9305 Py_ssize_t result; 9306 if (PyUnicode_READY(str) == -1) 9307 return -2; 9308 if (start < 0 || end < 0) { 9309 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9310 return -2; 9311 } 9312 if (end > PyUnicode_GET_LENGTH(str)) 9313 end = PyUnicode_GET_LENGTH(str); 9314 kind = PyUnicode_KIND(str); 9315 result = findchar(PyUnicode_1BYTE_DATA(str) + 
kind*start, 9316 kind, end-start, ch, direction); 9317 if (result == -1) 9318 return -1; 9319 else 9320 return start + result; 9321} 9322 9323static int 9324tailmatch(PyObject *self, 9325 PyObject *substring, 9326 Py_ssize_t start, 9327 Py_ssize_t end, 9328 int direction) 9329{ 9330 int kind_self; 9331 int kind_sub; 9332 void *data_self; 9333 void *data_sub; 9334 Py_ssize_t offset; 9335 Py_ssize_t i; 9336 Py_ssize_t end_sub; 9337 9338 if (PyUnicode_READY(self) == -1 || 9339 PyUnicode_READY(substring) == -1) 9340 return -1; 9341 9342 if (PyUnicode_GET_LENGTH(substring) == 0) 9343 return 1; 9344 9345 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9346 end -= PyUnicode_GET_LENGTH(substring); 9347 if (end < start) 9348 return 0; 9349 9350 kind_self = PyUnicode_KIND(self); 9351 data_self = PyUnicode_DATA(self); 9352 kind_sub = PyUnicode_KIND(substring); 9353 data_sub = PyUnicode_DATA(substring); 9354 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9355 9356 if (direction > 0) 9357 offset = end; 9358 else 9359 offset = start; 9360 9361 if (PyUnicode_READ(kind_self, data_self, offset) == 9362 PyUnicode_READ(kind_sub, data_sub, 0) && 9363 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9364 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9365 /* If both are of the same kind, memcmp is sufficient */ 9366 if (kind_self == kind_sub) { 9367 return ! memcmp((char *)data_self + 9368 (offset * PyUnicode_KIND(substring)), 9369 data_sub, 9370 PyUnicode_GET_LENGTH(substring) * 9371 PyUnicode_KIND(substring)); 9372 } 9373 /* otherwise we have to compare each character by first accesing it */ 9374 else { 9375 /* We do not need to compare 0 and len(substring)-1 because 9376 the if statement above ensured already that they are equal 9377 when we end up here. 
*/
            for (i = 1; i < end_sub; ++i) {
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
                    PyUnicode_READ(kind_sub, data_sub, i))
                    return 0;
            }
            return 1;
        }
    }

    return 0;
}

/* Public wrapper around tailmatch(): both arguments are converted with
   PyUnicode_FromObject first.  Returns the value of tailmatch(), or -1 with
   an exception set if a conversion fails. */
Py_ssize_t
PyUnicode_Tailmatch(PyObject *str,
                    PyObject *substr,
                    Py_ssize_t start,
                    Py_ssize_t end,
                    int direction)
{
    Py_ssize_t result;

    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return -1;
    substr = PyUnicode_FromObject(substr);
    if (substr == NULL) {
        Py_DECREF(str);
        return -1;
    }

    result = tailmatch(str, substr,
                       start, end, direction);
    Py_DECREF(str);
    Py_DECREF(substr);
    return result;
}

/* Apply fixfct filter to the Unicode object self and return a
   reference to the modified object.

   fixfct modifies the string it is given in place and returns the new
   maximum character, or 0 if it changed nothing.  It is first run on a copy
   of `self`; when nothing changed and `self` is an exact str, `self` itself
   is returned (new reference).  Returns NULL with an exception set if an
   allocation fails. */

static PyObject *
fixup(PyObject *self,
      Py_UCS4 (*fixfct)(PyObject *s))
{
    PyObject *u;
    Py_UCS4 maxchar_old, maxchar_new = 0;
    PyObject *v;

    u = _PyUnicode_Copy(self);
    if (u == NULL)
        return NULL;
    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);

    /* fix functions return the new maximum character in a string,
       if the kind of the resulting unicode object does not change,
       everything is fine.  Otherwise we need to change the string kind
       and re-run the fix function. */
    maxchar_new = fixfct(u);

    if (maxchar_new == 0) {
        /* no changes */;
        if (PyUnicode_CheckExact(self)) {
            Py_DECREF(u);
            Py_INCREF(self);
            return self;
        }
        else
            return u;
    }

    maxchar_new = align_maxchar(maxchar_new);

    if (maxchar_new == maxchar_old)
        return u;

    /* In case the maximum character changed, we need to
       convert the string to the new category. */
    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
    if (v == NULL) {
        Py_DECREF(u);
        return NULL;
    }
    if (maxchar_new > maxchar_old) {
        /* If the maxchar increased so that the kind changed, not all
           characters are representable anymore and we need to fix the
           string again. This only happens in very few cases. */
        _PyUnicode_FastCopyCharacters(v, 0,
                                      self, 0, PyUnicode_GET_LENGTH(self));
        maxchar_old = fixfct(v);
        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
    }
    else {
        /* the string got narrower: copy the already fixed characters */
        _PyUnicode_FastCopyCharacters(v, 0,
                                      u, 0, PyUnicode_GET_LENGTH(self));
    }
    Py_DECREF(u);
    assert(_PyUnicode_CheckConsistency(v, 1));
    return v;
}

/* Return a new string of the same length as `self`, created with maximum
   character 127, holding the lower-cased (lower != 0) or upper-cased copy
   of self's data as produced by _Py_bytes_lower()/_Py_bytes_upper().
   presumably callers pass ASCII-only strings -- this function does not
   check it.  Returns NULL if the allocation fails. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    char *resdata, *data = PyUnicode_DATA(self);
    PyObject *res;

    res = PyUnicode_New(len, 127);
    if (res == NULL)
        return NULL;
    resdata = PyUnicode_DATA(res);
    if (lower)
        _Py_bytes_lower(resdata, data, len);
    else
        _Py_bytes_upper(resdata, data, len);
    return res;
}

/* Return the lowercase form of the capital sigma (U+03A3) found at index i
   of the given character buffer. */
static Py_UCS4
handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
{
    Py_ssize_t j;
    int final_sigma;
    Py_UCS4 c;
    /* U+03A3 is in the Final_Sigma context when, it is found like this:

     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})

    where ! is a negation and \p{xxx} is a character with property xxx.
*/
    /* look backwards for the nearest character that is not case-ignorable */
    for (j = i - 1; j >= 0; j--) {
        c = PyUnicode_READ(kind, data, j);
        if (!_PyUnicode_IsCaseIgnorable(c))
            break;
    }
    /* when j < 0 no character was found and `c` is not consulted */
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
    if (final_sigma) {
        /* look forwards: the sigma is final only if no cased character
           follows (ignoring case-ignorable ones) */
        for (j = i + 1; j < length; j++) {
            c = PyUnicode_READ(kind, data, j);
            if (!_PyUnicode_IsCaseIgnorable(c))
                break;
        }
        final_sigma = j == length || !_PyUnicode_IsCased(c);
    }
    /* 0x3C2 is returned in final position, 0x3C3 otherwise */
    return (final_sigma) ? 0x3C2 : 0x3C3;
}

/* Lowercase character c found at index i of the buffer.
   The result is stored in mapped[]; the number of code points written is
   returned (1 for the capital sigma special case, otherwise whatever
   _PyUnicode_ToLowerFull() returns). */
static int
lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
           Py_UCS4 c, Py_UCS4 *mapped)
{
    /* Obscure special case. */
    if (c == 0x3A3) {
        mapped[0] = handle_capital_sigma(kind, data, length, i);
        return 1;
    }
    return _PyUnicode_ToLowerFull(c, mapped);
}

/* The do_* functions below share one contract: they read `length`
   characters from (kind, data), append the converted code points to res[],
   raise *maxchar to the largest code point written and return the number
   of code points written.  Each input character is mapped through a local
   3-element buffer, which is why case_operation() sizes res[] at
   3 * length. */

/* Uppercase the first character, lowercase the rest.
   Reads index 0 unconditionally, so length must be at least 1. */
static Py_ssize_t
do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{
    Py_ssize_t i, k = 0;
    int n_res, j;
    Py_UCS4 c, mapped[3];

    c = PyUnicode_READ(kind, data, 0);
    n_res = _PyUnicode_ToUpperFull(c, mapped);
    for (j = 0; j < n_res; j++) {
        *maxchar = Py_MAX(*maxchar, mapped[j]);
        res[k++] = mapped[j];
    }
    for (i = 1; i < length; i++) {
        c = PyUnicode_READ(kind, data, i);
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
        for (j = 0; j < n_res; j++) {
            *maxchar = Py_MAX(*maxchar, mapped[j]);
            res[k++] = mapped[j];
        }
    }
    return k;
}

/* Uppercase characters are lowercased, lowercase characters are
   uppercased, everything else is copied unchanged. */
static Py_ssize_t
do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
    Py_ssize_t i, k = 0;

    for (i = 0; i < length; i++) {
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
        int n_res, j;
        if (Py_UNICODE_ISUPPER(c)) {
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
        }
        else if (Py_UNICODE_ISLOWER(c)) {
            n_res = _PyUnicode_ToUpperFull(c, mapped);
        }
        else {
            n_res = 1;
            mapped[0] = c;
        }
        for (j = 0; j < n_res; j++) {
            *maxchar = Py_MAX(*maxchar, mapped[j]);
            res[k++] = mapped[j];
        }
    }
    return k;
}

/* Lowercase (lower != 0) or uppercase (lower == 0) every character. */
static Py_ssize_t
do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
                  Py_UCS4 *maxchar, int lower)
{
    Py_ssize_t i, k = 0;

    for (i = 0; i < length; i++) {
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
        int n_res, j;
        if (lower)
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
        else
            n_res = _PyUnicode_ToUpperFull(c, mapped);
        for (j = 0; j < n_res; j++) {
            *maxchar = Py_MAX(*maxchar, mapped[j]);
            res[k++] = mapped[j];
        }
    }
    return k;
}

/* Uppercase every character; has the signature case_operation() expects. */
static Py_ssize_t
do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
}

/* Lowercase every character; has the signature case_operation() expects. */
static Py_ssize_t
do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
}

/* Map every character through _PyUnicode_ToFoldedFull(). */
static Py_ssize_t
do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{
    Py_ssize_t i, k = 0;

    for (i = 0; i < length; i++) {
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
        Py_UCS4 mapped[3];
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
        for (j = 0; j < n_res; j++) {
            *maxchar = Py_MAX(*maxchar, mapped[j]);
            res[k++] = mapped[j];
        }
    }
    return k;
}

/* Titlecase a character when the previous character was not cased,
   lowercase it otherwise. */
static Py_ssize_t
do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{
    Py_ssize_t i, k = 0;
    int previous_is_cased;

    previous_is_cased = 0;
    for (i = 0; i < length; i++) {
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
        Py_UCS4 mapped[3];
        int n_res, j;

        if (previous_is_cased)
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
        else
            n_res =
_PyUnicode_ToTitleFull(c, mapped); 9652 9653 for (j = 0; j < n_res; j++) { 9654 *maxchar = Py_MAX(*maxchar, mapped[j]); 9655 res[k++] = mapped[j]; 9656 } 9657 9658 previous_is_cased = _PyUnicode_IsCased(c); 9659 } 9660 return k; 9661} 9662 9663static PyObject * 9664case_operation(PyObject *self, 9665 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9666{ 9667 PyObject *res = NULL; 9668 Py_ssize_t length, newlength = 0; 9669 int kind, outkind; 9670 void *data, *outdata; 9671 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9672 9673 assert(PyUnicode_IS_READY(self)); 9674 9675 kind = PyUnicode_KIND(self); 9676 data = PyUnicode_DATA(self); 9677 length = PyUnicode_GET_LENGTH(self); 9678 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9679 if (tmp == NULL) 9680 return PyErr_NoMemory(); 9681 newlength = perform(kind, data, length, tmp, &maxchar); 9682 res = PyUnicode_New(newlength, maxchar); 9683 if (res == NULL) 9684 goto leave; 9685 tmpend = tmp + newlength; 9686 outdata = PyUnicode_DATA(res); 9687 outkind = PyUnicode_KIND(res); 9688 switch (outkind) { 9689 case PyUnicode_1BYTE_KIND: 9690 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9691 break; 9692 case PyUnicode_2BYTE_KIND: 9693 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9694 break; 9695 case PyUnicode_4BYTE_KIND: 9696 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9697 break; 9698 default: 9699 assert(0); 9700 break; 9701 } 9702 leave: 9703 PyMem_FREE(tmp); 9704 return res; 9705} 9706 9707PyObject * 9708PyUnicode_Join(PyObject *separator, PyObject *seq) 9709{ 9710 PyObject *sep = NULL; 9711 Py_ssize_t seplen; 9712 PyObject *res = NULL; /* the result */ 9713 PyObject *fseq; /* PySequence_Fast(seq) */ 9714 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9715 PyObject **items; 9716 PyObject *item; 9717 Py_ssize_t sz, i, res_offset; 9718 Py_UCS4 maxchar; 9719 Py_UCS4 item_maxchar; 9720 int use_memcpy; 9721 unsigned char *res_data = NULL, 
*sep_data = NULL; 9722 PyObject *last_obj; 9723 unsigned int kind = 0; 9724 9725 fseq = PySequence_Fast(seq, "can only join an iterable"); 9726 if (fseq == NULL) { 9727 return NULL; 9728 } 9729 9730 /* NOTE: the following code can't call back into Python code, 9731 * so we are sure that fseq won't be mutated. 9732 */ 9733 9734 seqlen = PySequence_Fast_GET_SIZE(fseq); 9735 /* If empty sequence, return u"". */ 9736 if (seqlen == 0) { 9737 Py_DECREF(fseq); 9738 _Py_RETURN_UNICODE_EMPTY(); 9739 } 9740 9741 /* If singleton sequence with an exact Unicode, return that. */ 9742 last_obj = NULL; 9743 items = PySequence_Fast_ITEMS(fseq); 9744 if (seqlen == 1) { 9745 if (PyUnicode_CheckExact(items[0])) { 9746 res = items[0]; 9747 Py_INCREF(res); 9748 Py_DECREF(fseq); 9749 return res; 9750 } 9751 seplen = 0; 9752 maxchar = 0; 9753 } 9754 else { 9755 /* Set up sep and seplen */ 9756 if (separator == NULL) { 9757 /* fall back to a blank space separator */ 9758 sep = PyUnicode_FromOrdinal(' '); 9759 if (!sep) 9760 goto onError; 9761 seplen = 1; 9762 maxchar = 32; 9763 } 9764 else { 9765 if (!PyUnicode_Check(separator)) { 9766 PyErr_Format(PyExc_TypeError, 9767 "separator: expected str instance," 9768 " %.80s found", 9769 Py_TYPE(separator)->tp_name); 9770 goto onError; 9771 } 9772 if (PyUnicode_READY(separator)) 9773 goto onError; 9774 sep = separator; 9775 seplen = PyUnicode_GET_LENGTH(separator); 9776 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9777 /* inc refcount to keep this code path symmetric with the 9778 above case of a blank separator */ 9779 Py_INCREF(sep); 9780 } 9781 last_obj = sep; 9782 } 9783 9784 /* There are at least two things to join, or else we have a subclass 9785 * of str in the sequence. 9786 * Do a pre-pass to figure out the total amount of space we'll 9787 * need (sz), and see whether all argument are strings. 
9788 */ 9789 sz = 0; 9790#ifdef Py_DEBUG 9791 use_memcpy = 0; 9792#else 9793 use_memcpy = 1; 9794#endif 9795 for (i = 0; i < seqlen; i++) { 9796 const Py_ssize_t old_sz = sz; 9797 item = items[i]; 9798 if (!PyUnicode_Check(item)) { 9799 PyErr_Format(PyExc_TypeError, 9800 "sequence item %zd: expected str instance," 9801 " %.80s found", 9802 i, Py_TYPE(item)->tp_name); 9803 goto onError; 9804 } 9805 if (PyUnicode_READY(item) == -1) 9806 goto onError; 9807 sz += PyUnicode_GET_LENGTH(item); 9808 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9809 maxchar = Py_MAX(maxchar, item_maxchar); 9810 if (i != 0) 9811 sz += seplen; 9812 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9813 PyErr_SetString(PyExc_OverflowError, 9814 "join() result is too long for a Python string"); 9815 goto onError; 9816 } 9817 if (use_memcpy && last_obj != NULL) { 9818 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9819 use_memcpy = 0; 9820 } 9821 last_obj = item; 9822 } 9823 9824 res = PyUnicode_New(sz, maxchar); 9825 if (res == NULL) 9826 goto onError; 9827 9828 /* Catenate everything. */ 9829#ifdef Py_DEBUG 9830 use_memcpy = 0; 9831#else 9832 if (use_memcpy) { 9833 res_data = PyUnicode_1BYTE_DATA(res); 9834 kind = PyUnicode_KIND(res); 9835 if (seplen != 0) 9836 sep_data = PyUnicode_1BYTE_DATA(sep); 9837 } 9838#endif 9839 if (use_memcpy) { 9840 for (i = 0; i < seqlen; ++i) { 9841 Py_ssize_t itemlen; 9842 item = items[i]; 9843 9844 /* Copy item, and maybe the separator. 
*/ 9845 if (i && seplen != 0) { 9846 Py_MEMCPY(res_data, 9847 sep_data, 9848 kind * seplen); 9849 res_data += kind * seplen; 9850 } 9851 9852 itemlen = PyUnicode_GET_LENGTH(item); 9853 if (itemlen != 0) { 9854 Py_MEMCPY(res_data, 9855 PyUnicode_DATA(item), 9856 kind * itemlen); 9857 res_data += kind * itemlen; 9858 } 9859 } 9860 assert(res_data == PyUnicode_1BYTE_DATA(res) 9861 + kind * PyUnicode_GET_LENGTH(res)); 9862 } 9863 else { 9864 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9865 Py_ssize_t itemlen; 9866 item = items[i]; 9867 9868 /* Copy item, and maybe the separator. */ 9869 if (i && seplen != 0) { 9870 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9871 res_offset += seplen; 9872 } 9873 9874 itemlen = PyUnicode_GET_LENGTH(item); 9875 if (itemlen != 0) { 9876 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9877 res_offset += itemlen; 9878 } 9879 } 9880 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9881 } 9882 9883 Py_DECREF(fseq); 9884 Py_XDECREF(sep); 9885 assert(_PyUnicode_CheckConsistency(res, 1)); 9886 return res; 9887 9888 onError: 9889 Py_DECREF(fseq); 9890 Py_XDECREF(sep); 9891 Py_XDECREF(res); 9892 return NULL; 9893} 9894 9895#define FILL(kind, data, value, start, length) \ 9896 do { \ 9897 Py_ssize_t i_ = 0; \ 9898 assert(kind != PyUnicode_WCHAR_KIND); \ 9899 switch ((kind)) { \ 9900 case PyUnicode_1BYTE_KIND: { \ 9901 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9902 memset(to_, (unsigned char)value, (length)); \ 9903 break; \ 9904 } \ 9905 case PyUnicode_2BYTE_KIND: { \ 9906 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9907 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9908 break; \ 9909 } \ 9910 case PyUnicode_4BYTE_KIND: { \ 9911 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9912 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9913 break; \ 9914 default: assert(0); \ 9915 } \ 9916 } \ 9917 } while (0) 9918 9919void 9920_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t 
start, Py_ssize_t length, 9921 Py_UCS4 fill_char) 9922{ 9923 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9924 const void *data = PyUnicode_DATA(unicode); 9925 assert(PyUnicode_IS_READY(unicode)); 9926 assert(unicode_modifiable(unicode)); 9927 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9928 assert(start >= 0); 9929 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9930 FILL(kind, data, fill_char, start, length); 9931} 9932 9933Py_ssize_t 9934PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9935 Py_UCS4 fill_char) 9936{ 9937 Py_ssize_t maxlen; 9938 9939 if (!PyUnicode_Check(unicode)) { 9940 PyErr_BadInternalCall(); 9941 return -1; 9942 } 9943 if (PyUnicode_READY(unicode) == -1) 9944 return -1; 9945 if (unicode_check_modifiable(unicode)) 9946 return -1; 9947 9948 if (start < 0) { 9949 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9950 return -1; 9951 } 9952 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9953 PyErr_SetString(PyExc_ValueError, 9954 "fill character is bigger than " 9955 "the string maximum character"); 9956 return -1; 9957 } 9958 9959 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9960 length = Py_MIN(maxlen, length); 9961 if (length <= 0) 9962 return 0; 9963 9964 _PyUnicode_FastFill(unicode, start, length, fill_char); 9965 return length; 9966} 9967 9968static PyObject * 9969pad(PyObject *self, 9970 Py_ssize_t left, 9971 Py_ssize_t right, 9972 Py_UCS4 fill) 9973{ 9974 PyObject *u; 9975 Py_UCS4 maxchar; 9976 int kind; 9977 void *data; 9978 9979 if (left < 0) 9980 left = 0; 9981 if (right < 0) 9982 right = 0; 9983 9984 if (left == 0 && right == 0) 9985 return unicode_result_unchanged(self); 9986 9987 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9988 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9989 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9990 return NULL; 9991 } 9992 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9993 maxchar = 
Py_MAX(maxchar, fill); 9994 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9995 if (!u) 9996 return NULL; 9997 9998 kind = PyUnicode_KIND(u); 9999 data = PyUnicode_DATA(u); 10000 if (left) 10001 FILL(kind, data, fill, 0, left); 10002 if (right) 10003 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 10004 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 10005 assert(_PyUnicode_CheckConsistency(u, 1)); 10006 return u; 10007} 10008 10009PyObject * 10010PyUnicode_Splitlines(PyObject *string, int keepends) 10011{ 10012 PyObject *list; 10013 10014 string = PyUnicode_FromObject(string); 10015 if (string == NULL) 10016 return NULL; 10017 if (PyUnicode_READY(string) == -1) { 10018 Py_DECREF(string); 10019 return NULL; 10020 } 10021 10022 switch (PyUnicode_KIND(string)) { 10023 case PyUnicode_1BYTE_KIND: 10024 if (PyUnicode_IS_ASCII(string)) 10025 list = asciilib_splitlines( 10026 string, PyUnicode_1BYTE_DATA(string), 10027 PyUnicode_GET_LENGTH(string), keepends); 10028 else 10029 list = ucs1lib_splitlines( 10030 string, PyUnicode_1BYTE_DATA(string), 10031 PyUnicode_GET_LENGTH(string), keepends); 10032 break; 10033 case PyUnicode_2BYTE_KIND: 10034 list = ucs2lib_splitlines( 10035 string, PyUnicode_2BYTE_DATA(string), 10036 PyUnicode_GET_LENGTH(string), keepends); 10037 break; 10038 case PyUnicode_4BYTE_KIND: 10039 list = ucs4lib_splitlines( 10040 string, PyUnicode_4BYTE_DATA(string), 10041 PyUnicode_GET_LENGTH(string), keepends); 10042 break; 10043 default: 10044 assert(0); 10045 list = 0; 10046 } 10047 Py_DECREF(string); 10048 return list; 10049} 10050 10051static PyObject * 10052split(PyObject *self, 10053 PyObject *substring, 10054 Py_ssize_t maxcount) 10055{ 10056 int kind1, kind2, kind; 10057 void *buf1, *buf2; 10058 Py_ssize_t len1, len2; 10059 PyObject* out; 10060 10061 if (maxcount < 0) 10062 maxcount = PY_SSIZE_T_MAX; 10063 10064 if (PyUnicode_READY(self) == -1) 10065 return NULL; 10066 10067 if (substring 
== NULL) 10068 switch (PyUnicode_KIND(self)) { 10069 case PyUnicode_1BYTE_KIND: 10070 if (PyUnicode_IS_ASCII(self)) 10071 return asciilib_split_whitespace( 10072 self, PyUnicode_1BYTE_DATA(self), 10073 PyUnicode_GET_LENGTH(self), maxcount 10074 ); 10075 else 10076 return ucs1lib_split_whitespace( 10077 self, PyUnicode_1BYTE_DATA(self), 10078 PyUnicode_GET_LENGTH(self), maxcount 10079 ); 10080 case PyUnicode_2BYTE_KIND: 10081 return ucs2lib_split_whitespace( 10082 self, PyUnicode_2BYTE_DATA(self), 10083 PyUnicode_GET_LENGTH(self), maxcount 10084 ); 10085 case PyUnicode_4BYTE_KIND: 10086 return ucs4lib_split_whitespace( 10087 self, PyUnicode_4BYTE_DATA(self), 10088 PyUnicode_GET_LENGTH(self), maxcount 10089 ); 10090 default: 10091 assert(0); 10092 return NULL; 10093 } 10094 10095 if (PyUnicode_READY(substring) == -1) 10096 return NULL; 10097 10098 kind1 = PyUnicode_KIND(self); 10099 kind2 = PyUnicode_KIND(substring); 10100 kind = kind1 > kind2 ? kind1 : kind2; 10101 buf1 = PyUnicode_DATA(self); 10102 buf2 = PyUnicode_DATA(substring); 10103 if (kind1 != kind) 10104 buf1 = _PyUnicode_AsKind(self, kind); 10105 if (!buf1) 10106 return NULL; 10107 if (kind2 != kind) 10108 buf2 = _PyUnicode_AsKind(substring, kind); 10109 if (!buf2) { 10110 if (kind1 != kind) PyMem_Free(buf1); 10111 return NULL; 10112 } 10113 len1 = PyUnicode_GET_LENGTH(self); 10114 len2 = PyUnicode_GET_LENGTH(substring); 10115 10116 switch (kind) { 10117 case PyUnicode_1BYTE_KIND: 10118 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10119 out = asciilib_split( 10120 self, buf1, len1, buf2, len2, maxcount); 10121 else 10122 out = ucs1lib_split( 10123 self, buf1, len1, buf2, len2, maxcount); 10124 break; 10125 case PyUnicode_2BYTE_KIND: 10126 out = ucs2lib_split( 10127 self, buf1, len1, buf2, len2, maxcount); 10128 break; 10129 case PyUnicode_4BYTE_KIND: 10130 out = ucs4lib_split( 10131 self, buf1, len1, buf2, len2, maxcount); 10132 break; 10133 default: 10134 out = NULL; 10135 } 10136 if 
(kind1 != kind) 10137 PyMem_Free(buf1); 10138 if (kind2 != kind) 10139 PyMem_Free(buf2); 10140 return out; 10141} 10142 10143static PyObject * 10144rsplit(PyObject *self, 10145 PyObject *substring, 10146 Py_ssize_t maxcount) 10147{ 10148 int kind1, kind2, kind; 10149 void *buf1, *buf2; 10150 Py_ssize_t len1, len2; 10151 PyObject* out; 10152 10153 if (maxcount < 0) 10154 maxcount = PY_SSIZE_T_MAX; 10155 10156 if (PyUnicode_READY(self) == -1) 10157 return NULL; 10158 10159 if (substring == NULL) 10160 switch (PyUnicode_KIND(self)) { 10161 case PyUnicode_1BYTE_KIND: 10162 if (PyUnicode_IS_ASCII(self)) 10163 return asciilib_rsplit_whitespace( 10164 self, PyUnicode_1BYTE_DATA(self), 10165 PyUnicode_GET_LENGTH(self), maxcount 10166 ); 10167 else 10168 return ucs1lib_rsplit_whitespace( 10169 self, PyUnicode_1BYTE_DATA(self), 10170 PyUnicode_GET_LENGTH(self), maxcount 10171 ); 10172 case PyUnicode_2BYTE_KIND: 10173 return ucs2lib_rsplit_whitespace( 10174 self, PyUnicode_2BYTE_DATA(self), 10175 PyUnicode_GET_LENGTH(self), maxcount 10176 ); 10177 case PyUnicode_4BYTE_KIND: 10178 return ucs4lib_rsplit_whitespace( 10179 self, PyUnicode_4BYTE_DATA(self), 10180 PyUnicode_GET_LENGTH(self), maxcount 10181 ); 10182 default: 10183 assert(0); 10184 return NULL; 10185 } 10186 10187 if (PyUnicode_READY(substring) == -1) 10188 return NULL; 10189 10190 kind1 = PyUnicode_KIND(self); 10191 kind2 = PyUnicode_KIND(substring); 10192 kind = kind1 > kind2 ? 
kind1 : kind2; 10193 buf1 = PyUnicode_DATA(self); 10194 buf2 = PyUnicode_DATA(substring); 10195 if (kind1 != kind) 10196 buf1 = _PyUnicode_AsKind(self, kind); 10197 if (!buf1) 10198 return NULL; 10199 if (kind2 != kind) 10200 buf2 = _PyUnicode_AsKind(substring, kind); 10201 if (!buf2) { 10202 if (kind1 != kind) PyMem_Free(buf1); 10203 return NULL; 10204 } 10205 len1 = PyUnicode_GET_LENGTH(self); 10206 len2 = PyUnicode_GET_LENGTH(substring); 10207 10208 switch (kind) { 10209 case PyUnicode_1BYTE_KIND: 10210 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10211 out = asciilib_rsplit( 10212 self, buf1, len1, buf2, len2, maxcount); 10213 else 10214 out = ucs1lib_rsplit( 10215 self, buf1, len1, buf2, len2, maxcount); 10216 break; 10217 case PyUnicode_2BYTE_KIND: 10218 out = ucs2lib_rsplit( 10219 self, buf1, len1, buf2, len2, maxcount); 10220 break; 10221 case PyUnicode_4BYTE_KIND: 10222 out = ucs4lib_rsplit( 10223 self, buf1, len1, buf2, len2, maxcount); 10224 break; 10225 default: 10226 out = NULL; 10227 } 10228 if (kind1 != kind) 10229 PyMem_Free(buf1); 10230 if (kind2 != kind) 10231 PyMem_Free(buf2); 10232 return out; 10233} 10234 10235static Py_ssize_t 10236anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10237 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10238{ 10239 switch (kind) { 10240 case PyUnicode_1BYTE_KIND: 10241 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10242 return asciilib_find(buf1, len1, buf2, len2, offset); 10243 else 10244 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10245 case PyUnicode_2BYTE_KIND: 10246 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10247 case PyUnicode_4BYTE_KIND: 10248 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10249 } 10250 assert(0); 10251 return -1; 10252} 10253 10254static Py_ssize_t 10255anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10256 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10257{ 
10258 switch (kind) { 10259 case PyUnicode_1BYTE_KIND: 10260 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10261 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10262 else 10263 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10264 case PyUnicode_2BYTE_KIND: 10265 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10266 case PyUnicode_4BYTE_KIND: 10267 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10268 } 10269 assert(0); 10270 return 0; 10271} 10272 10273static void 10274replace_1char_inplace(PyObject *u, Py_ssize_t pos, 10275 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) 10276{ 10277 int kind = PyUnicode_KIND(u); 10278 void *data = PyUnicode_DATA(u); 10279 Py_ssize_t len = PyUnicode_GET_LENGTH(u); 10280 if (kind == PyUnicode_1BYTE_KIND) { 10281 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos, 10282 (Py_UCS1 *)data + len, 10283 u1, u2, maxcount); 10284 } 10285 else if (kind == PyUnicode_2BYTE_KIND) { 10286 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos, 10287 (Py_UCS2 *)data + len, 10288 u1, u2, maxcount); 10289 } 10290 else { 10291 assert(kind == PyUnicode_4BYTE_KIND); 10292 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos, 10293 (Py_UCS4 *)data + len, 10294 u1, u2, maxcount); 10295 } 10296} 10297 10298static PyObject * 10299replace(PyObject *self, PyObject *str1, 10300 PyObject *str2, Py_ssize_t maxcount) 10301{ 10302 PyObject *u; 10303 char *sbuf = PyUnicode_DATA(self); 10304 char *buf1 = PyUnicode_DATA(str1); 10305 char *buf2 = PyUnicode_DATA(str2); 10306 int srelease = 0, release1 = 0, release2 = 0; 10307 int skind = PyUnicode_KIND(self); 10308 int kind1 = PyUnicode_KIND(str1); 10309 int kind2 = PyUnicode_KIND(str2); 10310 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10311 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10312 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10313 int mayshrink; 10314 Py_UCS4 maxchar, maxchar_str1, maxchar_str2; 10315 10316 if (maxcount < 0) 10317 maxcount = PY_SSIZE_T_MAX; 
10318 else if (maxcount == 0 || slen == 0) 10319 goto nothing; 10320 10321 if (str1 == str2) 10322 goto nothing; 10323 10324 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10325 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1); 10326 if (maxchar < maxchar_str1) 10327 /* substring too wide to be present */ 10328 goto nothing; 10329 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10330 /* Replacing str1 with str2 may cause a maxchar reduction in the 10331 result string. */ 10332 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1); 10333 maxchar = Py_MAX(maxchar, maxchar_str2); 10334 10335 if (len1 == len2) { 10336 /* same length */ 10337 if (len1 == 0) 10338 goto nothing; 10339 if (len1 == 1) { 10340 /* replace characters */ 10341 Py_UCS4 u1, u2; 10342 Py_ssize_t pos; 10343 10344 u1 = PyUnicode_READ(kind1, buf1, 0); 10345 pos = findchar(sbuf, skind, slen, u1, 1); 10346 if (pos < 0) 10347 goto nothing; 10348 u2 = PyUnicode_READ(kind2, buf2, 0); 10349 u = PyUnicode_New(slen, maxchar); 10350 if (!u) 10351 goto error; 10352 10353 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10354 replace_1char_inplace(u, pos, u1, u2, maxcount); 10355 } 10356 else { 10357 int rkind = skind; 10358 char *res; 10359 Py_ssize_t i; 10360 10361 if (kind1 < rkind) { 10362 /* widen substring */ 10363 buf1 = _PyUnicode_AsKind(str1, rkind); 10364 if (!buf1) goto error; 10365 release1 = 1; 10366 } 10367 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10368 if (i < 0) 10369 goto nothing; 10370 if (rkind > kind2) { 10371 /* widen replacement */ 10372 buf2 = _PyUnicode_AsKind(str2, rkind); 10373 if (!buf2) goto error; 10374 release2 = 1; 10375 } 10376 else if (rkind < kind2) { 10377 /* widen self and buf1 */ 10378 rkind = kind2; 10379 if (release1) PyMem_Free(buf1); 10380 release1 = 0; 10381 sbuf = _PyUnicode_AsKind(self, rkind); 10382 if (!sbuf) goto error; 10383 srelease = 1; 10384 buf1 = _PyUnicode_AsKind(str1, rkind); 10385 if (!buf1) goto error; 10386 release1 = 1; 
10387 } 10388 u = PyUnicode_New(slen, maxchar); 10389 if (!u) 10390 goto error; 10391 assert(PyUnicode_KIND(u) == rkind); 10392 res = PyUnicode_DATA(u); 10393 10394 memcpy(res, sbuf, rkind * slen); 10395 /* change everything in-place, starting with this one */ 10396 memcpy(res + rkind * i, 10397 buf2, 10398 rkind * len2); 10399 i += len1; 10400 10401 while ( --maxcount > 0) { 10402 i = anylib_find(rkind, self, 10403 sbuf+rkind*i, slen-i, 10404 str1, buf1, len1, i); 10405 if (i == -1) 10406 break; 10407 memcpy(res + rkind * i, 10408 buf2, 10409 rkind * len2); 10410 i += len1; 10411 } 10412 } 10413 } 10414 else { 10415 Py_ssize_t n, i, j, ires; 10416 Py_ssize_t new_size; 10417 int rkind = skind; 10418 char *res; 10419 10420 if (kind1 < rkind) { 10421 /* widen substring */ 10422 buf1 = _PyUnicode_AsKind(str1, rkind); 10423 if (!buf1) goto error; 10424 release1 = 1; 10425 } 10426 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10427 if (n == 0) 10428 goto nothing; 10429 if (kind2 < rkind) { 10430 /* widen replacement */ 10431 buf2 = _PyUnicode_AsKind(str2, rkind); 10432 if (!buf2) goto error; 10433 release2 = 1; 10434 } 10435 else if (kind2 > rkind) { 10436 /* widen self and buf1 */ 10437 rkind = kind2; 10438 sbuf = _PyUnicode_AsKind(self, rkind); 10439 if (!sbuf) goto error; 10440 srelease = 1; 10441 if (release1) PyMem_Free(buf1); 10442 release1 = 0; 10443 buf1 = _PyUnicode_AsKind(str1, rkind); 10444 if (!buf1) goto error; 10445 release1 = 1; 10446 } 10447 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10448 PyUnicode_GET_LENGTH(str1))); */ 10449 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10450 PyErr_SetString(PyExc_OverflowError, 10451 "replace string is too long"); 10452 goto error; 10453 } 10454 new_size = slen + n * (len2 - len1); 10455 if (new_size == 0) { 10456 _Py_INCREF_UNICODE_EMPTY(); 10457 if (!unicode_empty) 10458 goto error; 10459 u = unicode_empty; 10460 goto done; 10461 } 10462 
if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10463 PyErr_SetString(PyExc_OverflowError, 10464 "replace string is too long"); 10465 goto error; 10466 } 10467 u = PyUnicode_New(new_size, maxchar); 10468 if (!u) 10469 goto error; 10470 assert(PyUnicode_KIND(u) == rkind); 10471 res = PyUnicode_DATA(u); 10472 ires = i = 0; 10473 if (len1 > 0) { 10474 while (n-- > 0) { 10475 /* look for next match */ 10476 j = anylib_find(rkind, self, 10477 sbuf + rkind * i, slen-i, 10478 str1, buf1, len1, i); 10479 if (j == -1) 10480 break; 10481 else if (j > i) { 10482 /* copy unchanged part [i:j] */ 10483 memcpy(res + rkind * ires, 10484 sbuf + rkind * i, 10485 rkind * (j-i)); 10486 ires += j - i; 10487 } 10488 /* copy substitution string */ 10489 if (len2 > 0) { 10490 memcpy(res + rkind * ires, 10491 buf2, 10492 rkind * len2); 10493 ires += len2; 10494 } 10495 i = j + len1; 10496 } 10497 if (i < slen) 10498 /* copy tail [i:] */ 10499 memcpy(res + rkind * ires, 10500 sbuf + rkind * i, 10501 rkind * (slen-i)); 10502 } 10503 else { 10504 /* interleave */ 10505 while (n > 0) { 10506 memcpy(res + rkind * ires, 10507 buf2, 10508 rkind * len2); 10509 ires += len2; 10510 if (--n <= 0) 10511 break; 10512 memcpy(res + rkind * ires, 10513 sbuf + rkind * i, 10514 rkind); 10515 ires++; 10516 i++; 10517 } 10518 memcpy(res + rkind * ires, 10519 sbuf + rkind * i, 10520 rkind * (slen-i)); 10521 } 10522 } 10523 10524 if (mayshrink) { 10525 unicode_adjust_maxchar(&u); 10526 if (u == NULL) 10527 goto error; 10528 } 10529 10530 done: 10531 if (srelease) 10532 PyMem_FREE(sbuf); 10533 if (release1) 10534 PyMem_FREE(buf1); 10535 if (release2) 10536 PyMem_FREE(buf2); 10537 assert(_PyUnicode_CheckConsistency(u, 1)); 10538 return u; 10539 10540 nothing: 10541 /* nothing to replace; return original string (when possible) */ 10542 if (srelease) 10543 PyMem_FREE(sbuf); 10544 if (release1) 10545 PyMem_FREE(buf1); 10546 if (release2) 10547 PyMem_FREE(buf2); 10548 return unicode_result_unchanged(self); 10549 
10550 error: 10551 if (srelease && sbuf) 10552 PyMem_FREE(sbuf); 10553 if (release1 && buf1) 10554 PyMem_FREE(buf1); 10555 if (release2 && buf2) 10556 PyMem_FREE(buf2); 10557 return NULL; 10558} 10559 10560/* --- Unicode Object Methods --------------------------------------------- */ 10561 10562PyDoc_STRVAR(title__doc__, 10563 "S.title() -> str\n\ 10564\n\ 10565Return a titlecased version of S, i.e. words start with title case\n\ 10566characters, all remaining cased characters have lower case."); 10567 10568static PyObject* 10569unicode_title(PyObject *self) 10570{ 10571 if (PyUnicode_READY(self) == -1) 10572 return NULL; 10573 return case_operation(self, do_title); 10574} 10575 10576PyDoc_STRVAR(capitalize__doc__, 10577 "S.capitalize() -> str\n\ 10578\n\ 10579Return a capitalized version of S, i.e. make the first character\n\ 10580have upper case and the rest lower case."); 10581 10582static PyObject* 10583unicode_capitalize(PyObject *self) 10584{ 10585 if (PyUnicode_READY(self) == -1) 10586 return NULL; 10587 if (PyUnicode_GET_LENGTH(self) == 0) 10588 return unicode_result_unchanged(self); 10589 return case_operation(self, do_capitalize); 10590} 10591 10592PyDoc_STRVAR(casefold__doc__, 10593 "S.casefold() -> str\n\ 10594\n\ 10595Return a version of S suitable for caseless comparisons."); 10596 10597static PyObject * 10598unicode_casefold(PyObject *self) 10599{ 10600 if (PyUnicode_READY(self) == -1) 10601 return NULL; 10602 if (PyUnicode_IS_ASCII(self)) 10603 return ascii_upper_or_lower(self, 1); 10604 return case_operation(self, do_casefold); 10605} 10606 10607 10608/* Argument converter. 
Coerces to a single unicode character */ 10609 10610static int 10611convert_uc(PyObject *obj, void *addr) 10612{ 10613 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10614 PyObject *uniobj; 10615 10616 uniobj = PyUnicode_FromObject(obj); 10617 if (uniobj == NULL) { 10618 PyErr_SetString(PyExc_TypeError, 10619 "The fill character cannot be converted to Unicode"); 10620 return 0; 10621 } 10622 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10623 PyErr_SetString(PyExc_TypeError, 10624 "The fill character must be exactly one character long"); 10625 Py_DECREF(uniobj); 10626 return 0; 10627 } 10628 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10629 Py_DECREF(uniobj); 10630 return 1; 10631} 10632 10633PyDoc_STRVAR(center__doc__, 10634 "S.center(width[, fillchar]) -> str\n\ 10635\n\ 10636Return S centered in a string of length width. Padding is\n\ 10637done using the specified fill character (default is a space)"); 10638 10639static PyObject * 10640unicode_center(PyObject *self, PyObject *args) 10641{ 10642 Py_ssize_t marg, left; 10643 Py_ssize_t width; 10644 Py_UCS4 fillchar = ' '; 10645 10646 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10647 return NULL; 10648 10649 if (PyUnicode_READY(self) == -1) 10650 return NULL; 10651 10652 if (PyUnicode_GET_LENGTH(self) >= width) 10653 return unicode_result_unchanged(self); 10654 10655 marg = width - PyUnicode_GET_LENGTH(self); 10656 left = marg / 2 + (marg & width & 1); 10657 10658 return pad(self, left, marg - left, fillchar); 10659} 10660 10661/* This function assumes that str1 and str2 are readied by the caller. */ 10662 10663static int 10664unicode_compare(PyObject *str1, PyObject *str2) 10665{ 10666#define COMPARE(TYPE1, TYPE2) \ 10667 do { \ 10668 TYPE1* p1 = (TYPE1 *)data1; \ 10669 TYPE2* p2 = (TYPE2 *)data2; \ 10670 TYPE1* end = p1 + len; \ 10671 Py_UCS4 c1, c2; \ 10672 for (; p1 != end; p1++, p2++) { \ 10673 c1 = *p1; \ 10674 c2 = *p2; \ 10675 if (c1 != c2) \ 10676 return (c1 < c2) ? 
-1 : 1; \ 10677 } \ 10678 } \ 10679 while (0) 10680 10681 int kind1, kind2; 10682 void *data1, *data2; 10683 Py_ssize_t len1, len2, len; 10684 10685 kind1 = PyUnicode_KIND(str1); 10686 kind2 = PyUnicode_KIND(str2); 10687 data1 = PyUnicode_DATA(str1); 10688 data2 = PyUnicode_DATA(str2); 10689 len1 = PyUnicode_GET_LENGTH(str1); 10690 len2 = PyUnicode_GET_LENGTH(str2); 10691 len = Py_MIN(len1, len2); 10692 10693 switch(kind1) { 10694 case PyUnicode_1BYTE_KIND: 10695 { 10696 switch(kind2) { 10697 case PyUnicode_1BYTE_KIND: 10698 { 10699 int cmp = memcmp(data1, data2, len); 10700 /* normalize result of memcmp() into the range [-1; 1] */ 10701 if (cmp < 0) 10702 return -1; 10703 if (cmp > 0) 10704 return 1; 10705 break; 10706 } 10707 case PyUnicode_2BYTE_KIND: 10708 COMPARE(Py_UCS1, Py_UCS2); 10709 break; 10710 case PyUnicode_4BYTE_KIND: 10711 COMPARE(Py_UCS1, Py_UCS4); 10712 break; 10713 default: 10714 assert(0); 10715 } 10716 break; 10717 } 10718 case PyUnicode_2BYTE_KIND: 10719 { 10720 switch(kind2) { 10721 case PyUnicode_1BYTE_KIND: 10722 COMPARE(Py_UCS2, Py_UCS1); 10723 break; 10724 case PyUnicode_2BYTE_KIND: 10725 { 10726 COMPARE(Py_UCS2, Py_UCS2); 10727 break; 10728 } 10729 case PyUnicode_4BYTE_KIND: 10730 COMPARE(Py_UCS2, Py_UCS4); 10731 break; 10732 default: 10733 assert(0); 10734 } 10735 break; 10736 } 10737 case PyUnicode_4BYTE_KIND: 10738 { 10739 switch(kind2) { 10740 case PyUnicode_1BYTE_KIND: 10741 COMPARE(Py_UCS4, Py_UCS1); 10742 break; 10743 case PyUnicode_2BYTE_KIND: 10744 COMPARE(Py_UCS4, Py_UCS2); 10745 break; 10746 case PyUnicode_4BYTE_KIND: 10747 { 10748#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4 10749 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len); 10750 /* normalize result of wmemcmp() into the range [-1; 1] */ 10751 if (cmp < 0) 10752 return -1; 10753 if (cmp > 0) 10754 return 1; 10755#else 10756 COMPARE(Py_UCS4, Py_UCS4); 10757#endif 10758 break; 10759 } 10760 default: 10761 assert(0); 10762 } 10763 break; 10764 } 10765 default: 
10766 assert(0); 10767 } 10768 10769 if (len1 == len2) 10770 return 0; 10771 if (len1 < len2) 10772 return -1; 10773 else 10774 return 1; 10775 10776#undef COMPARE 10777} 10778 10779Py_LOCAL(int) 10780unicode_compare_eq(PyObject *str1, PyObject *str2) 10781{ 10782 int kind; 10783 void *data1, *data2; 10784 Py_ssize_t len; 10785 int cmp; 10786 10787 len = PyUnicode_GET_LENGTH(str1); 10788 if (PyUnicode_GET_LENGTH(str2) != len) 10789 return 0; 10790 kind = PyUnicode_KIND(str1); 10791 if (PyUnicode_KIND(str2) != kind) 10792 return 0; 10793 data1 = PyUnicode_DATA(str1); 10794 data2 = PyUnicode_DATA(str2); 10795 10796 cmp = memcmp(data1, data2, len * kind); 10797 return (cmp == 0); 10798} 10799 10800 10801int 10802PyUnicode_Compare(PyObject *left, PyObject *right) 10803{ 10804 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10805 if (PyUnicode_READY(left) == -1 || 10806 PyUnicode_READY(right) == -1) 10807 return -1; 10808 10809 /* a string is equal to itself */ 10810 if (left == right) 10811 return 0; 10812 10813 return unicode_compare(left, right); 10814 } 10815 PyErr_Format(PyExc_TypeError, 10816 "Can't compare %.100s and %.100s", 10817 left->ob_type->tp_name, 10818 right->ob_type->tp_name); 10819 return -1; 10820} 10821 10822int 10823_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right) 10824{ 10825 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */ 10826 if (right_str == NULL) 10827 return -1; 10828 return PyUnicode_Compare(left, right_str); 10829} 10830 10831int 10832PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10833{ 10834 Py_ssize_t i; 10835 int kind; 10836 Py_UCS4 chr; 10837 10838 assert(_PyUnicode_CHECK(uni)); 10839 if (PyUnicode_READY(uni) == -1) 10840 return -1; 10841 kind = PyUnicode_KIND(uni); 10842 if (kind == PyUnicode_1BYTE_KIND) { 10843 const void *data = PyUnicode_1BYTE_DATA(uni); 10844 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni); 10845 size_t len, len2 = strlen(str); 10846 int cmp; 10847 10848 len = 
Py_MIN(len1, len2); 10849 cmp = memcmp(data, str, len); 10850 if (cmp != 0) { 10851 if (cmp < 0) 10852 return -1; 10853 else 10854 return 1; 10855 } 10856 if (len1 > len2) 10857 return 1; /* uni is longer */ 10858 if (len2 > len1) 10859 return -1; /* str is longer */ 10860 return 0; 10861 } 10862 else { 10863 void *data = PyUnicode_DATA(uni); 10864 /* Compare Unicode string and source character set string */ 10865 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10866 if (chr != (unsigned char)str[i]) 10867 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10868 /* This check keeps Python strings that end in '\0' from comparing equal 10869 to C strings identical up to that point. */ 10870 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10871 return 1; /* uni is longer */ 10872 if (str[i]) 10873 return -1; /* str is longer */ 10874 return 0; 10875 } 10876} 10877 10878 10879#define TEST_COND(cond) \ 10880 ((cond) ? Py_True : Py_False) 10881 10882PyObject * 10883PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10884{ 10885 int result; 10886 PyObject *v; 10887 10888 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 10889 Py_RETURN_NOTIMPLEMENTED; 10890 10891 if (PyUnicode_READY(left) == -1 || 10892 PyUnicode_READY(right) == -1) 10893 return NULL; 10894 10895 if (left == right) { 10896 switch (op) { 10897 case Py_EQ: 10898 case Py_LE: 10899 case Py_GE: 10900 /* a string is equal to itself */ 10901 v = Py_True; 10902 break; 10903 case Py_NE: 10904 case Py_LT: 10905 case Py_GT: 10906 v = Py_False; 10907 break; 10908 default: 10909 PyErr_BadArgument(); 10910 return NULL; 10911 } 10912 } 10913 else if (op == Py_EQ || op == Py_NE) { 10914 result = unicode_compare_eq(left, right); 10915 result ^= (op == Py_NE); 10916 v = TEST_COND(result); 10917 } 10918 else { 10919 result = unicode_compare(left, right); 10920 10921 /* Convert the return value to a Boolean */ 10922 switch (op) { 10923 case Py_LE: 10924 v = TEST_COND(result <= 0); 10925 break; 
10926 case Py_GE: 10927 v = TEST_COND(result >= 0); 10928 break; 10929 case Py_LT: 10930 v = TEST_COND(result == -1); 10931 break; 10932 case Py_GT: 10933 v = TEST_COND(result == 1); 10934 break; 10935 default: 10936 PyErr_BadArgument(); 10937 return NULL; 10938 } 10939 } 10940 Py_INCREF(v); 10941 return v; 10942} 10943 10944int 10945PyUnicode_Contains(PyObject *container, PyObject *element) 10946{ 10947 PyObject *str, *sub; 10948 int kind1, kind2; 10949 void *buf1, *buf2; 10950 Py_ssize_t len1, len2; 10951 int result; 10952 10953 /* Coerce the two arguments */ 10954 sub = PyUnicode_FromObject(element); 10955 if (!sub) { 10956 PyErr_Format(PyExc_TypeError, 10957 "'in <string>' requires string as left operand, not %s", 10958 element->ob_type->tp_name); 10959 return -1; 10960 } 10961 10962 str = PyUnicode_FromObject(container); 10963 if (!str) { 10964 Py_DECREF(sub); 10965 return -1; 10966 } 10967 10968 kind1 = PyUnicode_KIND(str); 10969 kind2 = PyUnicode_KIND(sub); 10970 buf1 = PyUnicode_DATA(str); 10971 buf2 = PyUnicode_DATA(sub); 10972 if (kind2 != kind1) { 10973 if (kind2 > kind1) { 10974 Py_DECREF(sub); 10975 Py_DECREF(str); 10976 return 0; 10977 } 10978 buf2 = _PyUnicode_AsKind(sub, kind1); 10979 } 10980 if (!buf2) { 10981 Py_DECREF(sub); 10982 Py_DECREF(str); 10983 return -1; 10984 } 10985 len1 = PyUnicode_GET_LENGTH(str); 10986 len2 = PyUnicode_GET_LENGTH(sub); 10987 10988 switch (kind1) { 10989 case PyUnicode_1BYTE_KIND: 10990 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10991 break; 10992 case PyUnicode_2BYTE_KIND: 10993 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10994 break; 10995 case PyUnicode_4BYTE_KIND: 10996 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10997 break; 10998 default: 10999 result = -1; 11000 assert(0); 11001 } 11002 11003 Py_DECREF(str); 11004 Py_DECREF(sub); 11005 11006 if (kind2 != kind1) 11007 PyMem_Free(buf2); 11008 11009 return result; 11010} 11011 11012/* Concat to string or Unicode object giving a 
new Unicode object. */ 11013 11014PyObject * 11015PyUnicode_Concat(PyObject *left, PyObject *right) 11016{ 11017 PyObject *u = NULL, *v = NULL, *w; 11018 Py_UCS4 maxchar, maxchar2; 11019 Py_ssize_t u_len, v_len, new_len; 11020 11021 /* Coerce the two arguments */ 11022 u = PyUnicode_FromObject(left); 11023 if (u == NULL) 11024 goto onError; 11025 v = PyUnicode_FromObject(right); 11026 if (v == NULL) 11027 goto onError; 11028 11029 /* Shortcuts */ 11030 if (v == unicode_empty) { 11031 Py_DECREF(v); 11032 return u; 11033 } 11034 if (u == unicode_empty) { 11035 Py_DECREF(u); 11036 return v; 11037 } 11038 11039 u_len = PyUnicode_GET_LENGTH(u); 11040 v_len = PyUnicode_GET_LENGTH(v); 11041 if (u_len > PY_SSIZE_T_MAX - v_len) { 11042 PyErr_SetString(PyExc_OverflowError, 11043 "strings are too large to concat"); 11044 goto onError; 11045 } 11046 new_len = u_len + v_len; 11047 11048 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 11049 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 11050 maxchar = Py_MAX(maxchar, maxchar2); 11051 11052 /* Concat the two Unicode strings */ 11053 w = PyUnicode_New(new_len, maxchar); 11054 if (w == NULL) 11055 goto onError; 11056 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 11057 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 11058 Py_DECREF(u); 11059 Py_DECREF(v); 11060 assert(_PyUnicode_CheckConsistency(w, 1)); 11061 return w; 11062 11063 onError: 11064 Py_XDECREF(u); 11065 Py_XDECREF(v); 11066 return NULL; 11067} 11068 11069void 11070PyUnicode_Append(PyObject **p_left, PyObject *right) 11071{ 11072 PyObject *left, *res; 11073 Py_UCS4 maxchar, maxchar2; 11074 Py_ssize_t left_len, right_len, new_len; 11075 11076 if (p_left == NULL) { 11077 if (!PyErr_Occurred()) 11078 PyErr_BadInternalCall(); 11079 return; 11080 } 11081 left = *p_left; 11082 if (right == NULL || left == NULL 11083 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 11084 if (!PyErr_Occurred()) 11085 PyErr_BadInternalCall(); 11086 goto error; 11087 } 11088 11089 if 
(PyUnicode_READY(left) == -1) 11090 goto error; 11091 if (PyUnicode_READY(right) == -1) 11092 goto error; 11093 11094 /* Shortcuts */ 11095 if (left == unicode_empty) { 11096 Py_DECREF(left); 11097 Py_INCREF(right); 11098 *p_left = right; 11099 return; 11100 } 11101 if (right == unicode_empty) 11102 return; 11103 11104 left_len = PyUnicode_GET_LENGTH(left); 11105 right_len = PyUnicode_GET_LENGTH(right); 11106 if (left_len > PY_SSIZE_T_MAX - right_len) { 11107 PyErr_SetString(PyExc_OverflowError, 11108 "strings are too large to concat"); 11109 goto error; 11110 } 11111 new_len = left_len + right_len; 11112 11113 if (unicode_modifiable(left) 11114 && PyUnicode_CheckExact(right) 11115 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 11116 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 11117 to change the structure size, but characters are stored just after 11118 the structure, and so it requires to move all characters which is 11119 not so different than duplicating the string. 
*/ 11120 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 11121 { 11122 /* append inplace */ 11123 if (unicode_resize(p_left, new_len) != 0) 11124 goto error; 11125 11126 /* copy 'right' into the newly allocated area of 'left' */ 11127 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 11128 } 11129 else { 11130 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11131 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11132 maxchar = Py_MAX(maxchar, maxchar2); 11133 11134 /* Concat the two Unicode strings */ 11135 res = PyUnicode_New(new_len, maxchar); 11136 if (res == NULL) 11137 goto error; 11138 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 11139 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 11140 Py_DECREF(left); 11141 *p_left = res; 11142 } 11143 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 11144 return; 11145 11146error: 11147 Py_CLEAR(*p_left); 11148} 11149 11150void 11151PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 11152{ 11153 PyUnicode_Append(pleft, right); 11154 Py_XDECREF(right); 11155} 11156 11157PyDoc_STRVAR(count__doc__, 11158 "S.count(sub[, start[, end]]) -> int\n\ 11159\n\ 11160Return the number of non-overlapping occurrences of substring sub in\n\ 11161string S[start:end]. 
Optional arguments start and end are\n\ 11162interpreted as in slice notation."); 11163 11164static PyObject * 11165unicode_count(PyObject *self, PyObject *args) 11166{ 11167 PyObject *substring; 11168 Py_ssize_t start = 0; 11169 Py_ssize_t end = PY_SSIZE_T_MAX; 11170 PyObject *result; 11171 int kind1, kind2, kind; 11172 void *buf1, *buf2; 11173 Py_ssize_t len1, len2, iresult; 11174 11175 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 11176 &start, &end)) 11177 return NULL; 11178 11179 kind1 = PyUnicode_KIND(self); 11180 kind2 = PyUnicode_KIND(substring); 11181 if (kind2 > kind1) { 11182 Py_DECREF(substring); 11183 return PyLong_FromLong(0); 11184 } 11185 kind = kind1; 11186 buf1 = PyUnicode_DATA(self); 11187 buf2 = PyUnicode_DATA(substring); 11188 if (kind2 != kind) 11189 buf2 = _PyUnicode_AsKind(substring, kind); 11190 if (!buf2) { 11191 Py_DECREF(substring); 11192 return NULL; 11193 } 11194 len1 = PyUnicode_GET_LENGTH(self); 11195 len2 = PyUnicode_GET_LENGTH(substring); 11196 11197 ADJUST_INDICES(start, end, len1); 11198 switch (kind) { 11199 case PyUnicode_1BYTE_KIND: 11200 iresult = ucs1lib_count( 11201 ((Py_UCS1*)buf1) + start, end - start, 11202 buf2, len2, PY_SSIZE_T_MAX 11203 ); 11204 break; 11205 case PyUnicode_2BYTE_KIND: 11206 iresult = ucs2lib_count( 11207 ((Py_UCS2*)buf1) + start, end - start, 11208 buf2, len2, PY_SSIZE_T_MAX 11209 ); 11210 break; 11211 case PyUnicode_4BYTE_KIND: 11212 iresult = ucs4lib_count( 11213 ((Py_UCS4*)buf1) + start, end - start, 11214 buf2, len2, PY_SSIZE_T_MAX 11215 ); 11216 break; 11217 default: 11218 assert(0); iresult = 0; 11219 } 11220 11221 result = PyLong_FromSsize_t(iresult); 11222 11223 if (kind2 != kind) 11224 PyMem_Free(buf2); 11225 11226 Py_DECREF(substring); 11227 11228 return result; 11229} 11230 11231PyDoc_STRVAR(encode__doc__, 11232 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 11233\n\ 11234Encode S using the codec registered for encoding. Default encoding\n\ 11235is 'utf-8'. 
errors may be given to set a different error\n\ 11236handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 11237a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 11238'xmlcharrefreplace' as well as any other name registered with\n\ 11239codecs.register_error that can handle UnicodeEncodeErrors."); 11240 11241static PyObject * 11242unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 11243{ 11244 static char *kwlist[] = {"encoding", "errors", 0}; 11245 char *encoding = NULL; 11246 char *errors = NULL; 11247 11248 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 11249 kwlist, &encoding, &errors)) 11250 return NULL; 11251 return PyUnicode_AsEncodedString(self, encoding, errors); 11252} 11253 11254PyDoc_STRVAR(expandtabs__doc__, 11255 "S.expandtabs(tabsize=8) -> str\n\ 11256\n\ 11257Return a copy of S where all tab characters are expanded using spaces.\n\ 11258If tabsize is not given, a tab size of 8 characters is assumed."); 11259 11260static PyObject* 11261unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds) 11262{ 11263 Py_ssize_t i, j, line_pos, src_len, incr; 11264 Py_UCS4 ch; 11265 PyObject *u; 11266 void *src_data, *dest_data; 11267 static char *kwlist[] = {"tabsize", 0}; 11268 int tabsize = 8; 11269 int kind; 11270 int found; 11271 11272 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs", 11273 kwlist, &tabsize)) 11274 return NULL; 11275 11276 if (PyUnicode_READY(self) == -1) 11277 return NULL; 11278 11279 /* First pass: determine size of output string */ 11280 src_len = PyUnicode_GET_LENGTH(self); 11281 i = j = line_pos = 0; 11282 kind = PyUnicode_KIND(self); 11283 src_data = PyUnicode_DATA(self); 11284 found = 0; 11285 for (; i < src_len; i++) { 11286 ch = PyUnicode_READ(kind, src_data, i); 11287 if (ch == '\t') { 11288 found = 1; 11289 if (tabsize > 0) { 11290 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11291 if (j > PY_SSIZE_T_MAX - incr) 11292 
goto overflow; 11293 line_pos += incr; 11294 j += incr; 11295 } 11296 } 11297 else { 11298 if (j > PY_SSIZE_T_MAX - 1) 11299 goto overflow; 11300 line_pos++; 11301 j++; 11302 if (ch == '\n' || ch == '\r') 11303 line_pos = 0; 11304 } 11305 } 11306 if (!found) 11307 return unicode_result_unchanged(self); 11308 11309 /* Second pass: create output string and fill it */ 11310 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11311 if (!u) 11312 return NULL; 11313 dest_data = PyUnicode_DATA(u); 11314 11315 i = j = line_pos = 0; 11316 11317 for (; i < src_len; i++) { 11318 ch = PyUnicode_READ(kind, src_data, i); 11319 if (ch == '\t') { 11320 if (tabsize > 0) { 11321 incr = tabsize - (line_pos % tabsize); 11322 line_pos += incr; 11323 FILL(kind, dest_data, ' ', j, incr); 11324 j += incr; 11325 } 11326 } 11327 else { 11328 line_pos++; 11329 PyUnicode_WRITE(kind, dest_data, j, ch); 11330 j++; 11331 if (ch == '\n' || ch == '\r') 11332 line_pos = 0; 11333 } 11334 } 11335 assert (j == PyUnicode_GET_LENGTH(u)); 11336 return unicode_result(u); 11337 11338 overflow: 11339 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11340 return NULL; 11341} 11342 11343PyDoc_STRVAR(find__doc__, 11344 "S.find(sub[, start[, end]]) -> int\n\ 11345\n\ 11346Return the lowest index in S where substring sub is found,\n\ 11347such that sub is contained within S[start:end]. 
Optional\n\ 11348arguments start and end are interpreted as in slice notation.\n\ 11349\n\ 11350Return -1 on failure."); 11351 11352static PyObject * 11353unicode_find(PyObject *self, PyObject *args) 11354{ 11355 PyObject *substring; 11356 Py_ssize_t start; 11357 Py_ssize_t end; 11358 Py_ssize_t result; 11359 11360 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 11361 &start, &end)) 11362 return NULL; 11363 11364 if (PyUnicode_READY(self) == -1) { 11365 Py_DECREF(substring); 11366 return NULL; 11367 } 11368 if (PyUnicode_READY(substring) == -1) { 11369 Py_DECREF(substring); 11370 return NULL; 11371 } 11372 11373 result = any_find_slice(1, self, substring, start, end); 11374 11375 Py_DECREF(substring); 11376 11377 if (result == -2) 11378 return NULL; 11379 11380 return PyLong_FromSsize_t(result); 11381} 11382 11383static PyObject * 11384unicode_getitem(PyObject *self, Py_ssize_t index) 11385{ 11386 void *data; 11387 enum PyUnicode_Kind kind; 11388 Py_UCS4 ch; 11389 11390 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 11391 PyErr_BadArgument(); 11392 return NULL; 11393 } 11394 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11395 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11396 return NULL; 11397 } 11398 kind = PyUnicode_KIND(self); 11399 data = PyUnicode_DATA(self); 11400 ch = PyUnicode_READ(kind, data, index); 11401 return unicode_char(ch); 11402} 11403 11404/* Believe it or not, this produces the same value for ASCII strings 11405 as bytes_hash(). */ 11406static Py_hash_t 11407unicode_hash(PyObject *self) 11408{ 11409 Py_ssize_t len; 11410 Py_uhash_t x; /* Unsigned for defined overflow behavior. 
*/ 11411 11412#ifdef Py_DEBUG 11413 assert(_Py_HashSecret_Initialized); 11414#endif 11415 if (_PyUnicode_HASH(self) != -1) 11416 return _PyUnicode_HASH(self); 11417 if (PyUnicode_READY(self) == -1) 11418 return -1; 11419 len = PyUnicode_GET_LENGTH(self); 11420 /* 11421 We make the hash of the empty string be 0, rather than using 11422 (prefix ^ suffix), since this slightly obfuscates the hash secret 11423 */ 11424 if (len == 0) { 11425 _PyUnicode_HASH(self) = 0; 11426 return 0; 11427 } 11428 x = _Py_HashBytes(PyUnicode_DATA(self), 11429 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); 11430 _PyUnicode_HASH(self) = x; 11431 return x; 11432} 11433 11434PyDoc_STRVAR(index__doc__, 11435 "S.index(sub[, start[, end]]) -> int\n\ 11436\n\ 11437Like S.find() but raise ValueError when the substring is not found."); 11438 11439static PyObject * 11440unicode_index(PyObject *self, PyObject *args) 11441{ 11442 Py_ssize_t result; 11443 PyObject *substring; 11444 Py_ssize_t start; 11445 Py_ssize_t end; 11446 11447 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11448 &start, &end)) 11449 return NULL; 11450 11451 if (PyUnicode_READY(self) == -1) { 11452 Py_DECREF(substring); 11453 return NULL; 11454 } 11455 if (PyUnicode_READY(substring) == -1) { 11456 Py_DECREF(substring); 11457 return NULL; 11458 } 11459 11460 result = any_find_slice(1, self, substring, start, end); 11461 11462 Py_DECREF(substring); 11463 11464 if (result == -2) 11465 return NULL; 11466 11467 if (result < 0) { 11468 PyErr_SetString(PyExc_ValueError, "substring not found"); 11469 return NULL; 11470 } 11471 11472 return PyLong_FromSsize_t(result); 11473} 11474 11475PyDoc_STRVAR(islower__doc__, 11476 "S.islower() -> bool\n\ 11477\n\ 11478Return True if all cased characters in S are lowercase and there is\n\ 11479at least one cased character in S, False otherwise."); 11480 11481static PyObject* 11482unicode_islower(PyObject *self) 11483{ 11484 Py_ssize_t i, length; 11485 int kind; 11486 void 
*data; 11487 int cased; 11488 11489 if (PyUnicode_READY(self) == -1) 11490 return NULL; 11491 length = PyUnicode_GET_LENGTH(self); 11492 kind = PyUnicode_KIND(self); 11493 data = PyUnicode_DATA(self); 11494 11495 /* Shortcut for single character strings */ 11496 if (length == 1) 11497 return PyBool_FromLong( 11498 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11499 11500 /* Special case for empty strings */ 11501 if (length == 0) 11502 return PyBool_FromLong(0); 11503 11504 cased = 0; 11505 for (i = 0; i < length; i++) { 11506 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11507 11508 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11509 return PyBool_FromLong(0); 11510 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11511 cased = 1; 11512 } 11513 return PyBool_FromLong(cased); 11514} 11515 11516PyDoc_STRVAR(isupper__doc__, 11517 "S.isupper() -> bool\n\ 11518\n\ 11519Return True if all cased characters in S are uppercase and there is\n\ 11520at least one cased character in S, False otherwise."); 11521 11522static PyObject* 11523unicode_isupper(PyObject *self) 11524{ 11525 Py_ssize_t i, length; 11526 int kind; 11527 void *data; 11528 int cased; 11529 11530 if (PyUnicode_READY(self) == -1) 11531 return NULL; 11532 length = PyUnicode_GET_LENGTH(self); 11533 kind = PyUnicode_KIND(self); 11534 data = PyUnicode_DATA(self); 11535 11536 /* Shortcut for single character strings */ 11537 if (length == 1) 11538 return PyBool_FromLong( 11539 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11540 11541 /* Special case for empty strings */ 11542 if (length == 0) 11543 return PyBool_FromLong(0); 11544 11545 cased = 0; 11546 for (i = 0; i < length; i++) { 11547 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11548 11549 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11550 return PyBool_FromLong(0); 11551 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11552 cased = 1; 11553 } 11554 return PyBool_FromLong(cased); 11555} 11556 11557PyDoc_STRVAR(istitle__doc__, 
11558 "S.istitle() -> bool\n\ 11559\n\ 11560Return True if S is a titlecased string and there is at least one\n\ 11561character in S, i.e. upper- and titlecase characters may only\n\ 11562follow uncased characters and lowercase characters only cased ones.\n\ 11563Return False otherwise."); 11564 11565static PyObject* 11566unicode_istitle(PyObject *self) 11567{ 11568 Py_ssize_t i, length; 11569 int kind; 11570 void *data; 11571 int cased, previous_is_cased; 11572 11573 if (PyUnicode_READY(self) == -1) 11574 return NULL; 11575 length = PyUnicode_GET_LENGTH(self); 11576 kind = PyUnicode_KIND(self); 11577 data = PyUnicode_DATA(self); 11578 11579 /* Shortcut for single character strings */ 11580 if (length == 1) { 11581 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11582 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11583 (Py_UNICODE_ISUPPER(ch) != 0)); 11584 } 11585 11586 /* Special case for empty strings */ 11587 if (length == 0) 11588 return PyBool_FromLong(0); 11589 11590 cased = 0; 11591 previous_is_cased = 0; 11592 for (i = 0; i < length; i++) { 11593 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11594 11595 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11596 if (previous_is_cased) 11597 return PyBool_FromLong(0); 11598 previous_is_cased = 1; 11599 cased = 1; 11600 } 11601 else if (Py_UNICODE_ISLOWER(ch)) { 11602 if (!previous_is_cased) 11603 return PyBool_FromLong(0); 11604 previous_is_cased = 1; 11605 cased = 1; 11606 } 11607 else 11608 previous_is_cased = 0; 11609 } 11610 return PyBool_FromLong(cased); 11611} 11612 11613PyDoc_STRVAR(isspace__doc__, 11614 "S.isspace() -> bool\n\ 11615\n\ 11616Return True if all characters in S are whitespace\n\ 11617and there is at least one character in S, False otherwise."); 11618 11619static PyObject* 11620unicode_isspace(PyObject *self) 11621{ 11622 Py_ssize_t i, length; 11623 int kind; 11624 void *data; 11625 11626 if (PyUnicode_READY(self) == -1) 11627 return NULL; 11628 length = 
PyUnicode_GET_LENGTH(self); 11629 kind = PyUnicode_KIND(self); 11630 data = PyUnicode_DATA(self); 11631 11632 /* Shortcut for single character strings */ 11633 if (length == 1) 11634 return PyBool_FromLong( 11635 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11636 11637 /* Special case for empty strings */ 11638 if (length == 0) 11639 return PyBool_FromLong(0); 11640 11641 for (i = 0; i < length; i++) { 11642 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11643 if (!Py_UNICODE_ISSPACE(ch)) 11644 return PyBool_FromLong(0); 11645 } 11646 return PyBool_FromLong(1); 11647} 11648 11649PyDoc_STRVAR(isalpha__doc__, 11650 "S.isalpha() -> bool\n\ 11651\n\ 11652Return True if all characters in S are alphabetic\n\ 11653and there is at least one character in S, False otherwise."); 11654 11655static PyObject* 11656unicode_isalpha(PyObject *self) 11657{ 11658 Py_ssize_t i, length; 11659 int kind; 11660 void *data; 11661 11662 if (PyUnicode_READY(self) == -1) 11663 return NULL; 11664 length = PyUnicode_GET_LENGTH(self); 11665 kind = PyUnicode_KIND(self); 11666 data = PyUnicode_DATA(self); 11667 11668 /* Shortcut for single character strings */ 11669 if (length == 1) 11670 return PyBool_FromLong( 11671 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11672 11673 /* Special case for empty strings */ 11674 if (length == 0) 11675 return PyBool_FromLong(0); 11676 11677 for (i = 0; i < length; i++) { 11678 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11679 return PyBool_FromLong(0); 11680 } 11681 return PyBool_FromLong(1); 11682} 11683 11684PyDoc_STRVAR(isalnum__doc__, 11685 "S.isalnum() -> bool\n\ 11686\n\ 11687Return True if all characters in S are alphanumeric\n\ 11688and there is at least one character in S, False otherwise."); 11689 11690static PyObject* 11691unicode_isalnum(PyObject *self) 11692{ 11693 int kind; 11694 void *data; 11695 Py_ssize_t len, i; 11696 11697 if (PyUnicode_READY(self) == -1) 11698 return NULL; 11699 11700 kind = PyUnicode_KIND(self); 
11701 data = PyUnicode_DATA(self); 11702 len = PyUnicode_GET_LENGTH(self); 11703 11704 /* Shortcut for single character strings */ 11705 if (len == 1) { 11706 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11707 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11708 } 11709 11710 /* Special case for empty strings */ 11711 if (len == 0) 11712 return PyBool_FromLong(0); 11713 11714 for (i = 0; i < len; i++) { 11715 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11716 if (!Py_UNICODE_ISALNUM(ch)) 11717 return PyBool_FromLong(0); 11718 } 11719 return PyBool_FromLong(1); 11720} 11721 11722PyDoc_STRVAR(isdecimal__doc__, 11723 "S.isdecimal() -> bool\n\ 11724\n\ 11725Return True if there are only decimal characters in S,\n\ 11726False otherwise."); 11727 11728static PyObject* 11729unicode_isdecimal(PyObject *self) 11730{ 11731 Py_ssize_t i, length; 11732 int kind; 11733 void *data; 11734 11735 if (PyUnicode_READY(self) == -1) 11736 return NULL; 11737 length = PyUnicode_GET_LENGTH(self); 11738 kind = PyUnicode_KIND(self); 11739 data = PyUnicode_DATA(self); 11740 11741 /* Shortcut for single character strings */ 11742 if (length == 1) 11743 return PyBool_FromLong( 11744 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11745 11746 /* Special case for empty strings */ 11747 if (length == 0) 11748 return PyBool_FromLong(0); 11749 11750 for (i = 0; i < length; i++) { 11751 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11752 return PyBool_FromLong(0); 11753 } 11754 return PyBool_FromLong(1); 11755} 11756 11757PyDoc_STRVAR(isdigit__doc__, 11758 "S.isdigit() -> bool\n\ 11759\n\ 11760Return True if all characters in S are digits\n\ 11761and there is at least one character in S, False otherwise."); 11762 11763static PyObject* 11764unicode_isdigit(PyObject *self) 11765{ 11766 Py_ssize_t i, length; 11767 int kind; 11768 void *data; 11769 11770 if (PyUnicode_READY(self) == -1) 11771 return NULL; 11772 length = PyUnicode_GET_LENGTH(self); 11773 kind = 
PyUnicode_KIND(self); 11774 data = PyUnicode_DATA(self); 11775 11776 /* Shortcut for single character strings */ 11777 if (length == 1) { 11778 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11779 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11780 } 11781 11782 /* Special case for empty strings */ 11783 if (length == 0) 11784 return PyBool_FromLong(0); 11785 11786 for (i = 0; i < length; i++) { 11787 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11788 return PyBool_FromLong(0); 11789 } 11790 return PyBool_FromLong(1); 11791} 11792 11793PyDoc_STRVAR(isnumeric__doc__, 11794 "S.isnumeric() -> bool\n\ 11795\n\ 11796Return True if there are only numeric characters in S,\n\ 11797False otherwise."); 11798 11799static PyObject* 11800unicode_isnumeric(PyObject *self) 11801{ 11802 Py_ssize_t i, length; 11803 int kind; 11804 void *data; 11805 11806 if (PyUnicode_READY(self) == -1) 11807 return NULL; 11808 length = PyUnicode_GET_LENGTH(self); 11809 kind = PyUnicode_KIND(self); 11810 data = PyUnicode_DATA(self); 11811 11812 /* Shortcut for single character strings */ 11813 if (length == 1) 11814 return PyBool_FromLong( 11815 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11816 11817 /* Special case for empty strings */ 11818 if (length == 0) 11819 return PyBool_FromLong(0); 11820 11821 for (i = 0; i < length; i++) { 11822 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11823 return PyBool_FromLong(0); 11824 } 11825 return PyBool_FromLong(1); 11826} 11827 11828int 11829PyUnicode_IsIdentifier(PyObject *self) 11830{ 11831 int kind; 11832 void *data; 11833 Py_ssize_t i; 11834 Py_UCS4 first; 11835 11836 if (PyUnicode_READY(self) == -1) { 11837 Py_FatalError("identifier not ready"); 11838 return 0; 11839 } 11840 11841 /* Special case for empty strings */ 11842 if (PyUnicode_GET_LENGTH(self) == 0) 11843 return 0; 11844 kind = PyUnicode_KIND(self); 11845 data = PyUnicode_DATA(self); 11846 11847 /* PEP 3131 says that the first character must be in 11848 
XID_Start and subsequent characters in XID_Continue, 11849 and for the ASCII range, the 2.x rules apply (i.e 11850 start with letters and underscore, continue with 11851 letters, digits, underscore). However, given the current 11852 definition of XID_Start and XID_Continue, it is sufficient 11853 to check just for these, except that _ must be allowed 11854 as starting an identifier. */ 11855 first = PyUnicode_READ(kind, data, 0); 11856 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11857 return 0; 11858 11859 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11860 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11861 return 0; 11862 return 1; 11863} 11864 11865PyDoc_STRVAR(isidentifier__doc__, 11866 "S.isidentifier() -> bool\n\ 11867\n\ 11868Return True if S is a valid identifier according\n\ 11869to the language definition.\n\ 11870\n\ 11871Use keyword.iskeyword() to test for reserved identifiers\n\ 11872such as \"def\" and \"class\".\n"); 11873 11874static PyObject* 11875unicode_isidentifier(PyObject *self) 11876{ 11877 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11878} 11879 11880PyDoc_STRVAR(isprintable__doc__, 11881 "S.isprintable() -> bool\n\ 11882\n\ 11883Return True if all characters in S are considered\n\ 11884printable in repr() or S is empty, False otherwise."); 11885 11886static PyObject* 11887unicode_isprintable(PyObject *self) 11888{ 11889 Py_ssize_t i, length; 11890 int kind; 11891 void *data; 11892 11893 if (PyUnicode_READY(self) == -1) 11894 return NULL; 11895 length = PyUnicode_GET_LENGTH(self); 11896 kind = PyUnicode_KIND(self); 11897 data = PyUnicode_DATA(self); 11898 11899 /* Shortcut for single character strings */ 11900 if (length == 1) 11901 return PyBool_FromLong( 11902 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11903 11904 for (i = 0; i < length; i++) { 11905 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11906 Py_RETURN_FALSE; 11907 } 11908 } 11909 Py_RETURN_TRUE; 
11910} 11911 11912PyDoc_STRVAR(join__doc__, 11913 "S.join(iterable) -> str\n\ 11914\n\ 11915Return a string which is the concatenation of the strings in the\n\ 11916iterable. The separator between elements is S."); 11917 11918static PyObject* 11919unicode_join(PyObject *self, PyObject *data) 11920{ 11921 return PyUnicode_Join(self, data); 11922} 11923 11924static Py_ssize_t 11925unicode_length(PyObject *self) 11926{ 11927 if (PyUnicode_READY(self) == -1) 11928 return -1; 11929 return PyUnicode_GET_LENGTH(self); 11930} 11931 11932PyDoc_STRVAR(ljust__doc__, 11933 "S.ljust(width[, fillchar]) -> str\n\ 11934\n\ 11935Return S left-justified in a Unicode string of length width. Padding is\n\ 11936done using the specified fill character (default is a space)."); 11937 11938static PyObject * 11939unicode_ljust(PyObject *self, PyObject *args) 11940{ 11941 Py_ssize_t width; 11942 Py_UCS4 fillchar = ' '; 11943 11944 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11945 return NULL; 11946 11947 if (PyUnicode_READY(self) == -1) 11948 return NULL; 11949 11950 if (PyUnicode_GET_LENGTH(self) >= width) 11951 return unicode_result_unchanged(self); 11952 11953 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11954} 11955 11956PyDoc_STRVAR(lower__doc__, 11957 "S.lower() -> str\n\ 11958\n\ 11959Return a copy of the string S converted to lowercase."); 11960 11961static PyObject* 11962unicode_lower(PyObject *self) 11963{ 11964 if (PyUnicode_READY(self) == -1) 11965 return NULL; 11966 if (PyUnicode_IS_ASCII(self)) 11967 return ascii_upper_or_lower(self, 1); 11968 return case_operation(self, do_lower); 11969} 11970 11971#define LEFTSTRIP 0 11972#define RIGHTSTRIP 1 11973#define BOTHSTRIP 2 11974 11975/* Arrays indexed by above */ 11976static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11977 11978#define STRIPNAME(i) (stripformat[i]+3) 11979 11980/* externally visible for str.strip(unicode) */ 11981PyObject * 
11982_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11983{ 11984 void *data; 11985 int kind; 11986 Py_ssize_t i, j, len; 11987 BLOOM_MASK sepmask; 11988 Py_ssize_t seplen; 11989 11990 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11991 return NULL; 11992 11993 kind = PyUnicode_KIND(self); 11994 data = PyUnicode_DATA(self); 11995 len = PyUnicode_GET_LENGTH(self); 11996 seplen = PyUnicode_GET_LENGTH(sepobj); 11997 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11998 PyUnicode_DATA(sepobj), 11999 seplen); 12000 12001 i = 0; 12002 if (striptype != RIGHTSTRIP) { 12003 while (i < len) { 12004 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12005 if (!BLOOM(sepmask, ch)) 12006 break; 12007 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12008 break; 12009 i++; 12010 } 12011 } 12012 12013 j = len; 12014 if (striptype != LEFTSTRIP) { 12015 j--; 12016 while (j >= i) { 12017 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12018 if (!BLOOM(sepmask, ch)) 12019 break; 12020 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12021 break; 12022 j--; 12023 } 12024 12025 j++; 12026 } 12027 12028 return PyUnicode_Substring(self, i, j); 12029} 12030 12031PyObject* 12032PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 12033{ 12034 unsigned char *data; 12035 int kind; 12036 Py_ssize_t length; 12037 12038 if (PyUnicode_READY(self) == -1) 12039 return NULL; 12040 12041 length = PyUnicode_GET_LENGTH(self); 12042 end = Py_MIN(end, length); 12043 12044 if (start == 0 && end == length) 12045 return unicode_result_unchanged(self); 12046 12047 if (start < 0 || end < 0) { 12048 PyErr_SetString(PyExc_IndexError, "string index out of range"); 12049 return NULL; 12050 } 12051 if (start >= length || end < start) 12052 _Py_RETURN_UNICODE_EMPTY(); 12053 12054 length = end - start; 12055 if (PyUnicode_IS_ASCII(self)) { 12056 data = PyUnicode_1BYTE_DATA(self); 12057 return _PyUnicode_FromASCII((char*)(data + start), length); 12058 } 12059 
else { 12060 kind = PyUnicode_KIND(self); 12061 data = PyUnicode_1BYTE_DATA(self); 12062 return PyUnicode_FromKindAndData(kind, 12063 data + kind * start, 12064 length); 12065 } 12066} 12067 12068static PyObject * 12069do_strip(PyObject *self, int striptype) 12070{ 12071 Py_ssize_t len, i, j; 12072 12073 if (PyUnicode_READY(self) == -1) 12074 return NULL; 12075 12076 len = PyUnicode_GET_LENGTH(self); 12077 12078 if (PyUnicode_IS_ASCII(self)) { 12079 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 12080 12081 i = 0; 12082 if (striptype != RIGHTSTRIP) { 12083 while (i < len) { 12084 Py_UCS1 ch = data[i]; 12085 if (!_Py_ascii_whitespace[ch]) 12086 break; 12087 i++; 12088 } 12089 } 12090 12091 j = len; 12092 if (striptype != LEFTSTRIP) { 12093 j--; 12094 while (j >= i) { 12095 Py_UCS1 ch = data[j]; 12096 if (!_Py_ascii_whitespace[ch]) 12097 break; 12098 j--; 12099 } 12100 j++; 12101 } 12102 } 12103 else { 12104 int kind = PyUnicode_KIND(self); 12105 void *data = PyUnicode_DATA(self); 12106 12107 i = 0; 12108 if (striptype != RIGHTSTRIP) { 12109 while (i < len) { 12110 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12111 if (!Py_UNICODE_ISSPACE(ch)) 12112 break; 12113 i++; 12114 } 12115 } 12116 12117 j = len; 12118 if (striptype != LEFTSTRIP) { 12119 j--; 12120 while (j >= i) { 12121 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12122 if (!Py_UNICODE_ISSPACE(ch)) 12123 break; 12124 j--; 12125 } 12126 j++; 12127 } 12128 } 12129 12130 return PyUnicode_Substring(self, i, j); 12131} 12132 12133 12134static PyObject * 12135do_argstrip(PyObject *self, int striptype, PyObject *args) 12136{ 12137 PyObject *sep = NULL; 12138 12139 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep)) 12140 return NULL; 12141 12142 if (sep != NULL && sep != Py_None) { 12143 if (PyUnicode_Check(sep)) 12144 return _PyUnicode_XStrip(self, striptype, sep); 12145 else { 12146 PyErr_Format(PyExc_TypeError, 12147 "%s arg must be None or str", 12148 STRIPNAME(striptype)); 12149 return NULL; 12150 } 12151 } 
12152 12153 return do_strip(self, striptype); 12154} 12155 12156 12157PyDoc_STRVAR(strip__doc__, 12158 "S.strip([chars]) -> str\n\ 12159\n\ 12160Return a copy of the string S with leading and trailing\n\ 12161whitespace removed.\n\ 12162If chars is given and not None, remove characters in chars instead."); 12163 12164static PyObject * 12165unicode_strip(PyObject *self, PyObject *args) 12166{ 12167 if (PyTuple_GET_SIZE(args) == 0) 12168 return do_strip(self, BOTHSTRIP); /* Common case */ 12169 else 12170 return do_argstrip(self, BOTHSTRIP, args); 12171} 12172 12173 12174PyDoc_STRVAR(lstrip__doc__, 12175 "S.lstrip([chars]) -> str\n\ 12176\n\ 12177Return a copy of the string S with leading whitespace removed.\n\ 12178If chars is given and not None, remove characters in chars instead."); 12179 12180static PyObject * 12181unicode_lstrip(PyObject *self, PyObject *args) 12182{ 12183 if (PyTuple_GET_SIZE(args) == 0) 12184 return do_strip(self, LEFTSTRIP); /* Common case */ 12185 else 12186 return do_argstrip(self, LEFTSTRIP, args); 12187} 12188 12189 12190PyDoc_STRVAR(rstrip__doc__, 12191 "S.rstrip([chars]) -> str\n\ 12192\n\ 12193Return a copy of the string S with trailing whitespace removed.\n\ 12194If chars is given and not None, remove characters in chars instead."); 12195 12196static PyObject * 12197unicode_rstrip(PyObject *self, PyObject *args) 12198{ 12199 if (PyTuple_GET_SIZE(args) == 0) 12200 return do_strip(self, RIGHTSTRIP); /* Common case */ 12201 else 12202 return do_argstrip(self, RIGHTSTRIP, args); 12203} 12204 12205 12206static PyObject* 12207unicode_repeat(PyObject *str, Py_ssize_t len) 12208{ 12209 PyObject *u; 12210 Py_ssize_t nchars, n; 12211 12212 if (len < 1) 12213 _Py_RETURN_UNICODE_EMPTY(); 12214 12215 /* no repeat, return original string */ 12216 if (len == 1) 12217 return unicode_result_unchanged(str); 12218 12219 if (PyUnicode_READY(str) == -1) 12220 return NULL; 12221 12222 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 12223 
PyErr_SetString(PyExc_OverflowError, 12224 "repeated string is too long"); 12225 return NULL; 12226 } 12227 nchars = len * PyUnicode_GET_LENGTH(str); 12228 12229 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 12230 if (!u) 12231 return NULL; 12232 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 12233 12234 if (PyUnicode_GET_LENGTH(str) == 1) { 12235 const int kind = PyUnicode_KIND(str); 12236 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 12237 if (kind == PyUnicode_1BYTE_KIND) { 12238 void *to = PyUnicode_DATA(u); 12239 memset(to, (unsigned char)fill_char, len); 12240 } 12241 else if (kind == PyUnicode_2BYTE_KIND) { 12242 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 12243 for (n = 0; n < len; ++n) 12244 ucs2[n] = fill_char; 12245 } else { 12246 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 12247 assert(kind == PyUnicode_4BYTE_KIND); 12248 for (n = 0; n < len; ++n) 12249 ucs4[n] = fill_char; 12250 } 12251 } 12252 else { 12253 /* number of characters copied this far */ 12254 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 12255 const Py_ssize_t char_size = PyUnicode_KIND(str); 12256 char *to = (char *) PyUnicode_DATA(u); 12257 Py_MEMCPY(to, PyUnicode_DATA(str), 12258 PyUnicode_GET_LENGTH(str) * char_size); 12259 while (done < nchars) { 12260 n = (done <= nchars-done) ? 
done : nchars-done; 12261 Py_MEMCPY(to + (done * char_size), to, n * char_size); 12262 done += n; 12263 } 12264 } 12265 12266 assert(_PyUnicode_CheckConsistency(u, 1)); 12267 return u; 12268} 12269 12270PyObject * 12271PyUnicode_Replace(PyObject *obj, 12272 PyObject *subobj, 12273 PyObject *replobj, 12274 Py_ssize_t maxcount) 12275{ 12276 PyObject *self; 12277 PyObject *str1; 12278 PyObject *str2; 12279 PyObject *result; 12280 12281 self = PyUnicode_FromObject(obj); 12282 if (self == NULL) 12283 return NULL; 12284 str1 = PyUnicode_FromObject(subobj); 12285 if (str1 == NULL) { 12286 Py_DECREF(self); 12287 return NULL; 12288 } 12289 str2 = PyUnicode_FromObject(replobj); 12290 if (str2 == NULL) { 12291 Py_DECREF(self); 12292 Py_DECREF(str1); 12293 return NULL; 12294 } 12295 if (PyUnicode_READY(self) == -1 || 12296 PyUnicode_READY(str1) == -1 || 12297 PyUnicode_READY(str2) == -1) 12298 result = NULL; 12299 else 12300 result = replace(self, str1, str2, maxcount); 12301 Py_DECREF(self); 12302 Py_DECREF(str1); 12303 Py_DECREF(str2); 12304 return result; 12305} 12306 12307PyDoc_STRVAR(replace__doc__, 12308 "S.replace(old, new[, count]) -> str\n\ 12309\n\ 12310Return a copy of S with all occurrences of substring\n\ 12311old replaced by new. 
If the optional argument count is\n\ 12312given, only the first count occurrences are replaced."); 12313 12314static PyObject* 12315unicode_replace(PyObject *self, PyObject *args) 12316{ 12317 PyObject *str1; 12318 PyObject *str2; 12319 Py_ssize_t maxcount = -1; 12320 PyObject *result; 12321 12322 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 12323 return NULL; 12324 if (PyUnicode_READY(self) == -1) 12325 return NULL; 12326 str1 = PyUnicode_FromObject(str1); 12327 if (str1 == NULL) 12328 return NULL; 12329 str2 = PyUnicode_FromObject(str2); 12330 if (str2 == NULL) { 12331 Py_DECREF(str1); 12332 return NULL; 12333 } 12334 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) 12335 result = NULL; 12336 else 12337 result = replace(self, str1, str2, maxcount); 12338 12339 Py_DECREF(str1); 12340 Py_DECREF(str2); 12341 return result; 12342} 12343 12344static PyObject * 12345unicode_repr(PyObject *unicode) 12346{ 12347 PyObject *repr; 12348 Py_ssize_t isize; 12349 Py_ssize_t osize, squote, dquote, i, o; 12350 Py_UCS4 max, quote; 12351 int ikind, okind, unchanged; 12352 void *idata, *odata; 12353 12354 if (PyUnicode_READY(unicode) == -1) 12355 return NULL; 12356 12357 isize = PyUnicode_GET_LENGTH(unicode); 12358 idata = PyUnicode_DATA(unicode); 12359 12360 /* Compute length of output, quote characters, and 12361 maximum character */ 12362 osize = 0; 12363 max = 127; 12364 squote = dquote = 0; 12365 ikind = PyUnicode_KIND(unicode); 12366 for (i = 0; i < isize; i++) { 12367 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12368 Py_ssize_t incr = 1; 12369 switch (ch) { 12370 case '\'': squote++; break; 12371 case '"': dquote++; break; 12372 case '\\': case '\t': case '\r': case '\n': 12373 incr = 2; 12374 break; 12375 default: 12376 /* Fast-path ASCII */ 12377 if (ch < ' ' || ch == 0x7f) 12378 incr = 4; /* \xHH */ 12379 else if (ch < 0x7f) 12380 ; 12381 else if (Py_UNICODE_ISPRINTABLE(ch)) 12382 max = ch > max ? 
ch : max; 12383 else if (ch < 0x100) 12384 incr = 4; /* \xHH */ 12385 else if (ch < 0x10000) 12386 incr = 6; /* \uHHHH */ 12387 else 12388 incr = 10; /* \uHHHHHHHH */ 12389 } 12390 if (osize > PY_SSIZE_T_MAX - incr) { 12391 PyErr_SetString(PyExc_OverflowError, 12392 "string is too long to generate repr"); 12393 return NULL; 12394 } 12395 osize += incr; 12396 } 12397 12398 quote = '\''; 12399 unchanged = (osize == isize); 12400 if (squote) { 12401 unchanged = 0; 12402 if (dquote) 12403 /* Both squote and dquote present. Use squote, 12404 and escape them */ 12405 osize += squote; 12406 else 12407 quote = '"'; 12408 } 12409 osize += 2; /* quotes */ 12410 12411 repr = PyUnicode_New(osize, max); 12412 if (repr == NULL) 12413 return NULL; 12414 okind = PyUnicode_KIND(repr); 12415 odata = PyUnicode_DATA(repr); 12416 12417 PyUnicode_WRITE(okind, odata, 0, quote); 12418 PyUnicode_WRITE(okind, odata, osize-1, quote); 12419 if (unchanged) { 12420 _PyUnicode_FastCopyCharacters(repr, 1, 12421 unicode, 0, 12422 isize); 12423 } 12424 else { 12425 for (i = 0, o = 1; i < isize; i++) { 12426 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12427 12428 /* Escape quotes and backslashes */ 12429 if ((ch == quote) || (ch == '\\')) { 12430 PyUnicode_WRITE(okind, odata, o++, '\\'); 12431 PyUnicode_WRITE(okind, odata, o++, ch); 12432 continue; 12433 } 12434 12435 /* Map special whitespace to '\t', \n', '\r' */ 12436 if (ch == '\t') { 12437 PyUnicode_WRITE(okind, odata, o++, '\\'); 12438 PyUnicode_WRITE(okind, odata, o++, 't'); 12439 } 12440 else if (ch == '\n') { 12441 PyUnicode_WRITE(okind, odata, o++, '\\'); 12442 PyUnicode_WRITE(okind, odata, o++, 'n'); 12443 } 12444 else if (ch == '\r') { 12445 PyUnicode_WRITE(okind, odata, o++, '\\'); 12446 PyUnicode_WRITE(okind, odata, o++, 'r'); 12447 } 12448 12449 /* Map non-printable US ASCII to '\xhh' */ 12450 else if (ch < ' ' || ch == 0x7F) { 12451 PyUnicode_WRITE(okind, odata, o++, '\\'); 12452 PyUnicode_WRITE(okind, odata, o++, 'x'); 12453 
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12455 } 12456 12457 /* Copy ASCII characters as-is */ 12458 else if (ch < 0x7F) { 12459 PyUnicode_WRITE(okind, odata, o++, ch); 12460 } 12461 12462 /* Non-ASCII characters */ 12463 else { 12464 /* Map Unicode whitespace and control characters 12465 (categories Z* and C* except ASCII space) 12466 */ 12467 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12468 PyUnicode_WRITE(okind, odata, o++, '\\'); 12469 /* Map 8-bit characters to '\xhh' */ 12470 if (ch <= 0xff) { 12471 PyUnicode_WRITE(okind, odata, o++, 'x'); 12472 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12473 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12474 } 12475 /* Map 16-bit characters to '\uxxxx' */ 12476 else if (ch <= 0xffff) { 12477 PyUnicode_WRITE(okind, odata, o++, 'u'); 12478 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12479 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12480 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12481 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12482 } 12483 /* Map 21-bit characters to '\U00xxxxxx' */ 12484 else { 12485 PyUnicode_WRITE(okind, odata, o++, 'U'); 12486 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12487 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12488 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12489 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12490 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12491 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12492 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12493 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12494 } 12495 } 12496 /* Copy characters as-is */ 12497 else { 12498 PyUnicode_WRITE(okind, 
odata, o++, ch); 12499 } 12500 } 12501 } 12502 } 12503 /* Closing quote already added at the beginning */ 12504 assert(_PyUnicode_CheckConsistency(repr, 1)); 12505 return repr; 12506} 12507 12508PyDoc_STRVAR(rfind__doc__, 12509 "S.rfind(sub[, start[, end]]) -> int\n\ 12510\n\ 12511Return the highest index in S where substring sub is found,\n\ 12512such that sub is contained within S[start:end]. Optional\n\ 12513arguments start and end are interpreted as in slice notation.\n\ 12514\n\ 12515Return -1 on failure."); 12516 12517static PyObject * 12518unicode_rfind(PyObject *self, PyObject *args) 12519{ 12520 PyObject *substring; 12521 Py_ssize_t start; 12522 Py_ssize_t end; 12523 Py_ssize_t result; 12524 12525 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12526 &start, &end)) 12527 return NULL; 12528 12529 if (PyUnicode_READY(self) == -1) { 12530 Py_DECREF(substring); 12531 return NULL; 12532 } 12533 if (PyUnicode_READY(substring) == -1) { 12534 Py_DECREF(substring); 12535 return NULL; 12536 } 12537 12538 result = any_find_slice(-1, self, substring, start, end); 12539 12540 Py_DECREF(substring); 12541 12542 if (result == -2) 12543 return NULL; 12544 12545 return PyLong_FromSsize_t(result); 12546} 12547 12548PyDoc_STRVAR(rindex__doc__, 12549 "S.rindex(sub[, start[, end]]) -> int\n\ 12550\n\ 12551Like S.rfind() but raise ValueError when the substring is not found."); 12552 12553static PyObject * 12554unicode_rindex(PyObject *self, PyObject *args) 12555{ 12556 PyObject *substring; 12557 Py_ssize_t start; 12558 Py_ssize_t end; 12559 Py_ssize_t result; 12560 12561 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12562 &start, &end)) 12563 return NULL; 12564 12565 if (PyUnicode_READY(self) == -1) { 12566 Py_DECREF(substring); 12567 return NULL; 12568 } 12569 if (PyUnicode_READY(substring) == -1) { 12570 Py_DECREF(substring); 12571 return NULL; 12572 } 12573 12574 result = any_find_slice(-1, self, substring, start, end); 12575 12576 
Py_DECREF(substring); 12577 12578 if (result == -2) 12579 return NULL; 12580 12581 if (result < 0) { 12582 PyErr_SetString(PyExc_ValueError, "substring not found"); 12583 return NULL; 12584 } 12585 12586 return PyLong_FromSsize_t(result); 12587} 12588 12589PyDoc_STRVAR(rjust__doc__, 12590 "S.rjust(width[, fillchar]) -> str\n\ 12591\n\ 12592Return S right-justified in a string of length width. Padding is\n\ 12593done using the specified fill character (default is a space)."); 12594 12595static PyObject * 12596unicode_rjust(PyObject *self, PyObject *args) 12597{ 12598 Py_ssize_t width; 12599 Py_UCS4 fillchar = ' '; 12600 12601 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12602 return NULL; 12603 12604 if (PyUnicode_READY(self) == -1) 12605 return NULL; 12606 12607 if (PyUnicode_GET_LENGTH(self) >= width) 12608 return unicode_result_unchanged(self); 12609 12610 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12611} 12612 12613PyObject * 12614PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12615{ 12616 PyObject *result; 12617 12618 s = PyUnicode_FromObject(s); 12619 if (s == NULL) 12620 return NULL; 12621 if (sep != NULL) { 12622 sep = PyUnicode_FromObject(sep); 12623 if (sep == NULL) { 12624 Py_DECREF(s); 12625 return NULL; 12626 } 12627 } 12628 12629 result = split(s, sep, maxsplit); 12630 12631 Py_DECREF(s); 12632 Py_XDECREF(sep); 12633 return result; 12634} 12635 12636PyDoc_STRVAR(split__doc__, 12637 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12638\n\ 12639Return a list of the words in S, using sep as the\n\ 12640delimiter string. If maxsplit is given, at most maxsplit\n\ 12641splits are done. 
If sep is not specified or is None, any\n\ 12642whitespace string is a separator and empty strings are\n\ 12643removed from the result."); 12644 12645static PyObject* 12646unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12647{ 12648 static char *kwlist[] = {"sep", "maxsplit", 0}; 12649 PyObject *substring = Py_None; 12650 Py_ssize_t maxcount = -1; 12651 12652 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12653 kwlist, &substring, &maxcount)) 12654 return NULL; 12655 12656 if (substring == Py_None) 12657 return split(self, NULL, maxcount); 12658 else if (PyUnicode_Check(substring)) 12659 return split(self, substring, maxcount); 12660 else 12661 return PyUnicode_Split(self, substring, maxcount); 12662} 12663 12664PyObject * 12665PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12666{ 12667 PyObject* str_obj; 12668 PyObject* sep_obj; 12669 PyObject* out; 12670 int kind1, kind2, kind; 12671 void *buf1 = NULL, *buf2 = NULL; 12672 Py_ssize_t len1, len2; 12673 12674 str_obj = PyUnicode_FromObject(str_in); 12675 if (!str_obj) 12676 return NULL; 12677 sep_obj = PyUnicode_FromObject(sep_in); 12678 if (!sep_obj) { 12679 Py_DECREF(str_obj); 12680 return NULL; 12681 } 12682 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12683 Py_DECREF(sep_obj); 12684 Py_DECREF(str_obj); 12685 return NULL; 12686 } 12687 12688 kind1 = PyUnicode_KIND(str_obj); 12689 kind2 = PyUnicode_KIND(sep_obj); 12690 kind = Py_MAX(kind1, kind2); 12691 buf1 = PyUnicode_DATA(str_obj); 12692 if (kind1 != kind) 12693 buf1 = _PyUnicode_AsKind(str_obj, kind); 12694 if (!buf1) 12695 goto onError; 12696 buf2 = PyUnicode_DATA(sep_obj); 12697 if (kind2 != kind) 12698 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12699 if (!buf2) 12700 goto onError; 12701 len1 = PyUnicode_GET_LENGTH(str_obj); 12702 len2 = PyUnicode_GET_LENGTH(sep_obj); 12703 12704 switch (PyUnicode_KIND(str_obj)) { 12705 case PyUnicode_1BYTE_KIND: 12706 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12707 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12708 else 12709 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12710 break; 12711 case PyUnicode_2BYTE_KIND: 12712 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12713 break; 12714 case PyUnicode_4BYTE_KIND: 12715 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12716 break; 12717 default: 12718 assert(0); 12719 out = 0; 12720 } 12721 12722 Py_DECREF(sep_obj); 12723 Py_DECREF(str_obj); 12724 if (kind1 != kind) 12725 PyMem_Free(buf1); 12726 if (kind2 != kind) 12727 PyMem_Free(buf2); 12728 12729 return out; 12730 onError: 12731 Py_DECREF(sep_obj); 12732 Py_DECREF(str_obj); 12733 if (kind1 != kind && buf1) 12734 PyMem_Free(buf1); 12735 if (kind2 != kind && buf2) 12736 PyMem_Free(buf2); 12737 return NULL; 12738} 12739 12740 12741PyObject * 12742PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12743{ 12744 PyObject* str_obj; 12745 PyObject* sep_obj; 12746 PyObject* out; 12747 int kind1, kind2, kind; 12748 void *buf1 = NULL, *buf2 = NULL; 12749 Py_ssize_t len1, len2; 12750 12751 str_obj = PyUnicode_FromObject(str_in); 12752 if (!str_obj) 12753 return NULL; 12754 sep_obj = PyUnicode_FromObject(sep_in); 12755 if (!sep_obj) { 12756 Py_DECREF(str_obj); 12757 return NULL; 12758 } 12759 12760 kind1 = PyUnicode_KIND(str_in); 12761 kind2 = PyUnicode_KIND(sep_obj); 12762 kind = Py_MAX(kind1, kind2); 12763 buf1 = PyUnicode_DATA(str_in); 12764 if (kind1 != kind) 12765 buf1 = _PyUnicode_AsKind(str_in, kind); 12766 if (!buf1) 12767 goto onError; 12768 buf2 = PyUnicode_DATA(sep_obj); 12769 if (kind2 != kind) 12770 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12771 if (!buf2) 12772 goto onError; 12773 len1 = PyUnicode_GET_LENGTH(str_obj); 12774 len2 = PyUnicode_GET_LENGTH(sep_obj); 12775 12776 switch (PyUnicode_KIND(str_in)) { 12777 case PyUnicode_1BYTE_KIND: 12778 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12779 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12780 else 12781 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12782 break; 12783 case PyUnicode_2BYTE_KIND: 12784 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12785 break; 12786 case PyUnicode_4BYTE_KIND: 12787 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12788 break; 12789 default: 12790 assert(0); 12791 out = 0; 12792 } 12793 12794 Py_DECREF(sep_obj); 12795 Py_DECREF(str_obj); 12796 if (kind1 != kind) 12797 PyMem_Free(buf1); 12798 if (kind2 != kind) 12799 PyMem_Free(buf2); 12800 12801 return out; 12802 onError: 12803 Py_DECREF(sep_obj); 12804 Py_DECREF(str_obj); 12805 if (kind1 != kind && buf1) 12806 PyMem_Free(buf1); 12807 if (kind2 != kind && buf2) 12808 PyMem_Free(buf2); 12809 return NULL; 12810} 12811 12812PyDoc_STRVAR(partition__doc__, 12813 "S.partition(sep) -> (head, sep, tail)\n\ 12814\n\ 12815Search for the separator sep in S, and return the part before it,\n\ 12816the separator itself, and the part after it. If the separator is not\n\ 12817found, return S and two empty strings."); 12818 12819static PyObject* 12820unicode_partition(PyObject *self, PyObject *separator) 12821{ 12822 return PyUnicode_Partition(self, separator); 12823} 12824 12825PyDoc_STRVAR(rpartition__doc__, 12826 "S.rpartition(sep) -> (head, sep, tail)\n\ 12827\n\ 12828Search for the separator sep in S, starting at the end of S, and return\n\ 12829the part before it, the separator itself, and the part after it. 
If the\n\ 12830separator is not found, return two empty strings and S."); 12831 12832static PyObject* 12833unicode_rpartition(PyObject *self, PyObject *separator) 12834{ 12835 return PyUnicode_RPartition(self, separator); 12836} 12837 12838PyObject * 12839PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12840{ 12841 PyObject *result; 12842 12843 s = PyUnicode_FromObject(s); 12844 if (s == NULL) 12845 return NULL; 12846 if (sep != NULL) { 12847 sep = PyUnicode_FromObject(sep); 12848 if (sep == NULL) { 12849 Py_DECREF(s); 12850 return NULL; 12851 } 12852 } 12853 12854 result = rsplit(s, sep, maxsplit); 12855 12856 Py_DECREF(s); 12857 Py_XDECREF(sep); 12858 return result; 12859} 12860 12861PyDoc_STRVAR(rsplit__doc__, 12862 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12863\n\ 12864Return a list of the words in S, using sep as the\n\ 12865delimiter string, starting at the end of the string and\n\ 12866working to the front. If maxsplit is given, at most maxsplit\n\ 12867splits are done. 
If sep is not specified, any whitespace string\n\ 12868is a separator."); 12869 12870static PyObject* 12871unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12872{ 12873 static char *kwlist[] = {"sep", "maxsplit", 0}; 12874 PyObject *substring = Py_None; 12875 Py_ssize_t maxcount = -1; 12876 12877 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12878 kwlist, &substring, &maxcount)) 12879 return NULL; 12880 12881 if (substring == Py_None) 12882 return rsplit(self, NULL, maxcount); 12883 else if (PyUnicode_Check(substring)) 12884 return rsplit(self, substring, maxcount); 12885 else 12886 return PyUnicode_RSplit(self, substring, maxcount); 12887} 12888 12889PyDoc_STRVAR(splitlines__doc__, 12890 "S.splitlines([keepends]) -> list of strings\n\ 12891\n\ 12892Return a list of the lines in S, breaking at line boundaries.\n\ 12893Line breaks are not included in the resulting list unless keepends\n\ 12894is given and true."); 12895 12896static PyObject* 12897unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12898{ 12899 static char *kwlist[] = {"keepends", 0}; 12900 int keepends = 0; 12901 12902 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12903 kwlist, &keepends)) 12904 return NULL; 12905 12906 return PyUnicode_Splitlines(self, keepends); 12907} 12908 12909static 12910PyObject *unicode_str(PyObject *self) 12911{ 12912 return unicode_result_unchanged(self); 12913} 12914 12915PyDoc_STRVAR(swapcase__doc__, 12916 "S.swapcase() -> str\n\ 12917\n\ 12918Return a copy of S with uppercase characters converted to lowercase\n\ 12919and vice versa."); 12920 12921static PyObject* 12922unicode_swapcase(PyObject *self) 12923{ 12924 if (PyUnicode_READY(self) == -1) 12925 return NULL; 12926 return case_operation(self, do_swapcase); 12927} 12928 12929/*[clinic input] 12930 12931@staticmethod 12932str.maketrans as unicode_maketrans 12933 12934 x: object 12935 12936 y: unicode=NULL 12937 12938 z: unicode=NULL 12939 12940 / 12941 
12942Return a translation table usable for str.translate(). 12943 12944If there is only one argument, it must be a dictionary mapping Unicode 12945ordinals (integers) or characters to Unicode ordinals, strings or None. 12946Character keys will be then converted to ordinals. 12947If there are two arguments, they must be strings of equal length, and 12948in the resulting dictionary, each character in x will be mapped to the 12949character at the same position in y. If there is a third argument, it 12950must be a string, whose characters will be mapped to None in the result. 12951[clinic start generated code]*/ 12952 12953PyDoc_STRVAR(unicode_maketrans__doc__, 12954"maketrans(x, y=None, z=None, /)\n" 12955"--\n" 12956"\n" 12957"Return a translation table usable for str.translate().\n" 12958"\n" 12959"If there is only one argument, it must be a dictionary mapping Unicode\n" 12960"ordinals (integers) or characters to Unicode ordinals, strings or None.\n" 12961"Character keys will be then converted to ordinals.\n" 12962"If there are two arguments, they must be strings of equal length, and\n" 12963"in the resulting dictionary, each character in x will be mapped to the\n" 12964"character at the same position in y. 
If there is a third argument, it\n" 12965"must be a string, whose characters will be mapped to None in the result."); 12966 12967#define UNICODE_MAKETRANS_METHODDEF \ 12968 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__}, 12969 12970static PyObject * 12971unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z); 12972 12973static PyObject * 12974unicode_maketrans(void *null, PyObject *args) 12975{ 12976 PyObject *return_value = NULL; 12977 PyObject *x; 12978 PyObject *y = NULL; 12979 PyObject *z = NULL; 12980 12981 if (!PyArg_ParseTuple(args, 12982 "O|UU:maketrans", 12983 &x, &y, &z)) 12984 goto exit; 12985 return_value = unicode_maketrans_impl(x, y, z); 12986 12987exit: 12988 return return_value; 12989} 12990 12991static PyObject * 12992unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z) 12993/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/ 12994{ 12995 PyObject *new = NULL, *key, *value; 12996 Py_ssize_t i = 0; 12997 int res; 12998 12999 new = PyDict_New(); 13000 if (!new) 13001 return NULL; 13002 if (y != NULL) { 13003 int x_kind, y_kind, z_kind; 13004 void *x_data, *y_data, *z_data; 13005 13006 /* x must be a string too, of equal length */ 13007 if (!PyUnicode_Check(x)) { 13008 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 13009 "be a string if there is a second argument"); 13010 goto err; 13011 } 13012 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 13013 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 13014 "arguments must have equal length"); 13015 goto err; 13016 } 13017 /* create entries for translating chars in x to those in y */ 13018 x_kind = PyUnicode_KIND(x); 13019 y_kind = PyUnicode_KIND(y); 13020 x_data = PyUnicode_DATA(x); 13021 y_data = PyUnicode_DATA(y); 13022 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 13023 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 13024 if (!key) 13025 goto err; 13026 
value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 13027 if (!value) { 13028 Py_DECREF(key); 13029 goto err; 13030 } 13031 res = PyDict_SetItem(new, key, value); 13032 Py_DECREF(key); 13033 Py_DECREF(value); 13034 if (res < 0) 13035 goto err; 13036 } 13037 /* create entries for deleting chars in z */ 13038 if (z != NULL) { 13039 z_kind = PyUnicode_KIND(z); 13040 z_data = PyUnicode_DATA(z); 13041 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 13042 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 13043 if (!key) 13044 goto err; 13045 res = PyDict_SetItem(new, key, Py_None); 13046 Py_DECREF(key); 13047 if (res < 0) 13048 goto err; 13049 } 13050 } 13051 } else { 13052 int kind; 13053 void *data; 13054 13055 /* x must be a dict */ 13056 if (!PyDict_CheckExact(x)) { 13057 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 13058 "to maketrans it must be a dict"); 13059 goto err; 13060 } 13061 /* copy entries into the new dict, converting string keys to int keys */ 13062 while (PyDict_Next(x, &i, &key, &value)) { 13063 if (PyUnicode_Check(key)) { 13064 /* convert string keys to integer keys */ 13065 PyObject *newkey; 13066 if (PyUnicode_GET_LENGTH(key) != 1) { 13067 PyErr_SetString(PyExc_ValueError, "string keys in translate " 13068 "table must be of length 1"); 13069 goto err; 13070 } 13071 kind = PyUnicode_KIND(key); 13072 data = PyUnicode_DATA(key); 13073 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 13074 if (!newkey) 13075 goto err; 13076 res = PyDict_SetItem(new, newkey, value); 13077 Py_DECREF(newkey); 13078 if (res < 0) 13079 goto err; 13080 } else if (PyLong_Check(key)) { 13081 /* just keep integer keys */ 13082 if (PyDict_SetItem(new, key, value) < 0) 13083 goto err; 13084 } else { 13085 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 13086 "be strings or integers"); 13087 goto err; 13088 } 13089 } 13090 } 13091 return new; 13092 err: 13093 Py_DECREF(new); 13094 return NULL; 13095} 13096 
13097PyDoc_STRVAR(translate__doc__, 13098 "S.translate(table) -> str\n\ 13099\n\ 13100Return a copy of the string S, where all characters have been mapped\n\ 13101through the given translation table, which must be a mapping of\n\ 13102Unicode ordinals to Unicode ordinals, strings, or None.\n\ 13103Unmapped characters are left untouched. Characters mapped to None\n\ 13104are deleted."); 13105 13106static PyObject* 13107unicode_translate(PyObject *self, PyObject *table) 13108{ 13109 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 13110} 13111 13112PyDoc_STRVAR(upper__doc__, 13113 "S.upper() -> str\n\ 13114\n\ 13115Return a copy of S converted to uppercase."); 13116 13117static PyObject* 13118unicode_upper(PyObject *self) 13119{ 13120 if (PyUnicode_READY(self) == -1) 13121 return NULL; 13122 if (PyUnicode_IS_ASCII(self)) 13123 return ascii_upper_or_lower(self, 0); 13124 return case_operation(self, do_upper); 13125} 13126 13127PyDoc_STRVAR(zfill__doc__, 13128 "S.zfill(width) -> str\n\ 13129\n\ 13130Pad a numeric string S with zeros on the left, to fill a field\n\ 13131of the specified width. 
The string S is never truncated."); 13132 13133static PyObject * 13134unicode_zfill(PyObject *self, PyObject *args) 13135{ 13136 Py_ssize_t fill; 13137 PyObject *u; 13138 Py_ssize_t width; 13139 int kind; 13140 void *data; 13141 Py_UCS4 chr; 13142 13143 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 13144 return NULL; 13145 13146 if (PyUnicode_READY(self) == -1) 13147 return NULL; 13148 13149 if (PyUnicode_GET_LENGTH(self) >= width) 13150 return unicode_result_unchanged(self); 13151 13152 fill = width - PyUnicode_GET_LENGTH(self); 13153 13154 u = pad(self, fill, 0, '0'); 13155 13156 if (u == NULL) 13157 return NULL; 13158 13159 kind = PyUnicode_KIND(u); 13160 data = PyUnicode_DATA(u); 13161 chr = PyUnicode_READ(kind, data, fill); 13162 13163 if (chr == '+' || chr == '-') { 13164 /* move sign to beginning of string */ 13165 PyUnicode_WRITE(kind, data, 0, chr); 13166 PyUnicode_WRITE(kind, data, fill, '0'); 13167 } 13168 13169 assert(_PyUnicode_CheckConsistency(u, 1)); 13170 return u; 13171} 13172 13173#if 0 13174static PyObject * 13175unicode__decimal2ascii(PyObject *self) 13176{ 13177 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 13178} 13179#endif 13180 13181PyDoc_STRVAR(startswith__doc__, 13182 "S.startswith(prefix[, start[, end]]) -> bool\n\ 13183\n\ 13184Return True if S starts with the specified prefix, False otherwise.\n\ 13185With optional start, test S beginning at that position.\n\ 13186With optional end, stop comparing S at that position.\n\ 13187prefix can also be a tuple of strings to try."); 13188 13189static PyObject * 13190unicode_startswith(PyObject *self, 13191 PyObject *args) 13192{ 13193 PyObject *subobj; 13194 PyObject *substring; 13195 Py_ssize_t start = 0; 13196 Py_ssize_t end = PY_SSIZE_T_MAX; 13197 int result; 13198 13199 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 13200 return NULL; 13201 if (PyTuple_Check(subobj)) { 13202 Py_ssize_t i; 13203 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13204 
substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 13205 if (substring == NULL) 13206 return NULL; 13207 result = tailmatch(self, substring, start, end, -1); 13208 Py_DECREF(substring); 13209 if (result == -1) 13210 return NULL; 13211 if (result) { 13212 Py_RETURN_TRUE; 13213 } 13214 } 13215 /* nothing matched */ 13216 Py_RETURN_FALSE; 13217 } 13218 substring = PyUnicode_FromObject(subobj); 13219 if (substring == NULL) { 13220 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13221 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 13222 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13223 return NULL; 13224 } 13225 result = tailmatch(self, substring, start, end, -1); 13226 Py_DECREF(substring); 13227 if (result == -1) 13228 return NULL; 13229 return PyBool_FromLong(result); 13230} 13231 13232 13233PyDoc_STRVAR(endswith__doc__, 13234 "S.endswith(suffix[, start[, end]]) -> bool\n\ 13235\n\ 13236Return True if S ends with the specified suffix, False otherwise.\n\ 13237With optional start, test S beginning at that position.\n\ 13238With optional end, stop comparing S at that position.\n\ 13239suffix can also be a tuple of strings to try."); 13240 13241static PyObject * 13242unicode_endswith(PyObject *self, 13243 PyObject *args) 13244{ 13245 PyObject *subobj; 13246 PyObject *substring; 13247 Py_ssize_t start = 0; 13248 Py_ssize_t end = PY_SSIZE_T_MAX; 13249 int result; 13250 13251 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 13252 return NULL; 13253 if (PyTuple_Check(subobj)) { 13254 Py_ssize_t i; 13255 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13256 substring = PyUnicode_FromObject( 13257 PyTuple_GET_ITEM(subobj, i)); 13258 if (substring == NULL) 13259 return NULL; 13260 result = tailmatch(self, substring, start, end, +1); 13261 Py_DECREF(substring); 13262 if (result == -1) 13263 return NULL; 13264 if (result) { 13265 Py_RETURN_TRUE; 13266 } 13267 } 13268 Py_RETURN_FALSE; 13269 } 13270 substring = 
PyUnicode_FromObject(subobj); 13271 if (substring == NULL) { 13272 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13273 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 13274 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13275 return NULL; 13276 } 13277 result = tailmatch(self, substring, start, end, +1); 13278 Py_DECREF(substring); 13279 if (result == -1) 13280 return NULL; 13281 return PyBool_FromLong(result); 13282} 13283 13284Py_LOCAL_INLINE(void) 13285_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 13286{ 13287 if (!writer->readonly) 13288 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 13289 else { 13290 /* Copy-on-write mode: set buffer size to 0 so 13291 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on 13292 * next write. */ 13293 writer->size = 0; 13294 } 13295 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 13296 writer->data = PyUnicode_DATA(writer->buffer); 13297 writer->kind = PyUnicode_KIND(writer->buffer); 13298} 13299 13300void 13301_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) 13302{ 13303 memset(writer, 0, sizeof(*writer)); 13304#ifdef Py_DEBUG 13305 writer->kind = 5; /* invalid kind */ 13306#endif 13307 writer->min_char = 127; 13308} 13309 13310int 13311_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 13312 Py_ssize_t length, Py_UCS4 maxchar) 13313{ 13314#ifdef MS_WINDOWS 13315 /* On Windows, overallocate by 50% is the best factor */ 13316# define OVERALLOCATE_FACTOR 2 13317#else 13318 /* On Linux, overallocate by 25% is the best factor */ 13319# define OVERALLOCATE_FACTOR 4 13320#endif 13321 Py_ssize_t newlen; 13322 PyObject *newbuffer; 13323 13324 assert(length > 0); 13325 13326 if (length > PY_SSIZE_T_MAX - writer->pos) { 13327 PyErr_NoMemory(); 13328 return -1; 13329 } 13330 newlen = writer->pos + length; 13331 13332 maxchar = Py_MAX(maxchar, writer->min_char); 13333 13334 if (writer->buffer == NULL) { 13335 assert(!writer->readonly); 13336 if (writer->overallocate 13337 
&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13338 /* overallocate to limit the number of realloc() */ 13339 newlen += newlen / OVERALLOCATE_FACTOR; 13340 } 13341 if (newlen < writer->min_length) 13342 newlen = writer->min_length; 13343 13344 writer->buffer = PyUnicode_New(newlen, maxchar); 13345 if (writer->buffer == NULL) 13346 return -1; 13347 } 13348 else if (newlen > writer->size) { 13349 if (writer->overallocate 13350 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13351 /* overallocate to limit the number of realloc() */ 13352 newlen += newlen / OVERALLOCATE_FACTOR; 13353 } 13354 if (newlen < writer->min_length) 13355 newlen = writer->min_length; 13356 13357 if (maxchar > writer->maxchar || writer->readonly) { 13358 /* resize + widen */ 13359 newbuffer = PyUnicode_New(newlen, maxchar); 13360 if (newbuffer == NULL) 13361 return -1; 13362 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13363 writer->buffer, 0, writer->pos); 13364 Py_DECREF(writer->buffer); 13365 writer->readonly = 0; 13366 } 13367 else { 13368 newbuffer = resize_compact(writer->buffer, newlen); 13369 if (newbuffer == NULL) 13370 return -1; 13371 } 13372 writer->buffer = newbuffer; 13373 } 13374 else if (maxchar > writer->maxchar) { 13375 assert(!writer->readonly); 13376 newbuffer = PyUnicode_New(writer->size, maxchar); 13377 if (newbuffer == NULL) 13378 return -1; 13379 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13380 writer->buffer, 0, writer->pos); 13381 Py_DECREF(writer->buffer); 13382 writer->buffer = newbuffer; 13383 } 13384 _PyUnicodeWriter_Update(writer); 13385 return 0; 13386 13387#undef OVERALLOCATE_FACTOR 13388} 13389 13390Py_LOCAL_INLINE(int) 13391_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) 13392{ 13393 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) 13394 return -1; 13395 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13396 writer->pos++; 13397 return 0; 13398} 13399 13400int 
13401_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) 13402{ 13403 return _PyUnicodeWriter_WriteCharInline(writer, ch); 13404} 13405 13406int 13407_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 13408{ 13409 Py_UCS4 maxchar; 13410 Py_ssize_t len; 13411 13412 if (PyUnicode_READY(str) == -1) 13413 return -1; 13414 len = PyUnicode_GET_LENGTH(str); 13415 if (len == 0) 13416 return 0; 13417 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 13418 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 13419 if (writer->buffer == NULL && !writer->overallocate) { 13420 writer->readonly = 1; 13421 Py_INCREF(str); 13422 writer->buffer = str; 13423 _PyUnicodeWriter_Update(writer); 13424 writer->pos += len; 13425 return 0; 13426 } 13427 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 13428 return -1; 13429 } 13430 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13431 str, 0, len); 13432 writer->pos += len; 13433 return 0; 13434} 13435 13436int 13437_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, 13438 Py_ssize_t start, Py_ssize_t end) 13439{ 13440 Py_UCS4 maxchar; 13441 Py_ssize_t len; 13442 13443 if (PyUnicode_READY(str) == -1) 13444 return -1; 13445 13446 assert(0 <= start); 13447 assert(end <= PyUnicode_GET_LENGTH(str)); 13448 assert(start <= end); 13449 13450 if (end == 0) 13451 return 0; 13452 13453 if (start == 0 && end == PyUnicode_GET_LENGTH(str)) 13454 return _PyUnicodeWriter_WriteStr(writer, str); 13455 13456 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 13457 maxchar = _PyUnicode_FindMaxChar(str, start, end); 13458 else 13459 maxchar = writer->maxchar; 13460 len = end - start; 13461 13462 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) 13463 return -1; 13464 13465 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13466 str, start, len); 13467 writer->pos += len; 13468 return 0; 13469} 13470 13471int 
13472_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 13473 const char *ascii, Py_ssize_t len) 13474{ 13475 if (len == -1) 13476 len = strlen(ascii); 13477 13478 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128); 13479 13480 if (writer->buffer == NULL && !writer->overallocate) { 13481 PyObject *str; 13482 13483 str = _PyUnicode_FromASCII(ascii, len); 13484 if (str == NULL) 13485 return -1; 13486 13487 writer->readonly = 1; 13488 writer->buffer = str; 13489 _PyUnicodeWriter_Update(writer); 13490 writer->pos += len; 13491 return 0; 13492 } 13493 13494 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 13495 return -1; 13496 13497 switch (writer->kind) 13498 { 13499 case PyUnicode_1BYTE_KIND: 13500 { 13501 const Py_UCS1 *str = (const Py_UCS1 *)ascii; 13502 Py_UCS1 *data = writer->data; 13503 13504 Py_MEMCPY(data + writer->pos, str, len); 13505 break; 13506 } 13507 case PyUnicode_2BYTE_KIND: 13508 { 13509 _PyUnicode_CONVERT_BYTES( 13510 Py_UCS1, Py_UCS2, 13511 ascii, ascii + len, 13512 (Py_UCS2 *)writer->data + writer->pos); 13513 break; 13514 } 13515 case PyUnicode_4BYTE_KIND: 13516 { 13517 _PyUnicode_CONVERT_BYTES( 13518 Py_UCS1, Py_UCS4, 13519 ascii, ascii + len, 13520 (Py_UCS4 *)writer->data + writer->pos); 13521 break; 13522 } 13523 default: 13524 assert(0); 13525 } 13526 13527 writer->pos += len; 13528 return 0; 13529} 13530 13531int 13532_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 13533 const char *str, Py_ssize_t len) 13534{ 13535 Py_UCS4 maxchar; 13536 13537 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 13538 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 13539 return -1; 13540 unicode_write_cstr(writer->buffer, writer->pos, str, len); 13541 writer->pos += len; 13542 return 0; 13543} 13544 13545PyObject * 13546_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 13547{ 13548 PyObject *str; 13549 if (writer->pos == 0) { 13550 Py_CLEAR(writer->buffer); 13551 
_Py_RETURN_UNICODE_EMPTY(); 13552 } 13553 if (writer->readonly) { 13554 str = writer->buffer; 13555 writer->buffer = NULL; 13556 assert(PyUnicode_GET_LENGTH(str) == writer->pos); 13557 return str; 13558 } 13559 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 13560 PyObject *newbuffer; 13561 newbuffer = resize_compact(writer->buffer, writer->pos); 13562 if (newbuffer == NULL) { 13563 Py_CLEAR(writer->buffer); 13564 return NULL; 13565 } 13566 writer->buffer = newbuffer; 13567 } 13568 str = writer->buffer; 13569 writer->buffer = NULL; 13570 assert(_PyUnicode_CheckConsistency(str, 1)); 13571 return unicode_result_ready(str); 13572} 13573 13574void 13575_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 13576{ 13577 Py_CLEAR(writer->buffer); 13578} 13579 13580#include "stringlib/unicode_format.h" 13581 13582PyDoc_STRVAR(format__doc__, 13583 "S.format(*args, **kwargs) -> str\n\ 13584\n\ 13585Return a formatted version of S, using substitutions from args and kwargs.\n\ 13586The substitutions are identified by braces ('{' and '}')."); 13587 13588PyDoc_STRVAR(format_map__doc__, 13589 "S.format_map(mapping) -> str\n\ 13590\n\ 13591Return a formatted version of S, using substitutions from mapping.\n\ 13592The substitutions are identified by braces ('{' and '}')."); 13593 13594static PyObject * 13595unicode__format__(PyObject* self, PyObject* args) 13596{ 13597 PyObject *format_spec; 13598 _PyUnicodeWriter writer; 13599 int ret; 13600 13601 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 13602 return NULL; 13603 13604 if (PyUnicode_READY(self) == -1) 13605 return NULL; 13606 _PyUnicodeWriter_Init(&writer); 13607 ret = _PyUnicode_FormatAdvancedWriter(&writer, 13608 self, format_spec, 0, 13609 PyUnicode_GET_LENGTH(format_spec)); 13610 if (ret == -1) { 13611 _PyUnicodeWriter_Dealloc(&writer); 13612 return NULL; 13613 } 13614 return _PyUnicodeWriter_Finish(&writer); 13615} 13616 13617PyDoc_STRVAR(p_format__doc__, 13618 "S.__format__(format_spec) -> str\n\ 
13619\n\ 13620Return a formatted version of S as described by format_spec."); 13621 13622static PyObject * 13623unicode__sizeof__(PyObject *v) 13624{ 13625 Py_ssize_t size; 13626 13627 /* If it's a compact object, account for base structure + 13628 character data. */ 13629 if (PyUnicode_IS_COMPACT_ASCII(v)) 13630 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 13631 else if (PyUnicode_IS_COMPACT(v)) 13632 size = sizeof(PyCompactUnicodeObject) + 13633 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 13634 else { 13635 /* If it is a two-block object, account for base object, and 13636 for character block if present. */ 13637 size = sizeof(PyUnicodeObject); 13638 if (_PyUnicode_DATA_ANY(v)) 13639 size += (PyUnicode_GET_LENGTH(v) + 1) * 13640 PyUnicode_KIND(v); 13641 } 13642 /* If the wstr pointer is present, account for it unless it is shared 13643 with the data pointer. Check if the data is not shared. */ 13644 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 13645 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 13646 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 13647 size += PyUnicode_UTF8_LENGTH(v) + 1; 13648 13649 return PyLong_FromSsize_t(size); 13650} 13651 13652PyDoc_STRVAR(sizeof__doc__, 13653 "S.__sizeof__() -> size of S in memory, in bytes"); 13654 13655static PyObject * 13656unicode_getnewargs(PyObject *v) 13657{ 13658 PyObject *copy = _PyUnicode_Copy(v); 13659 if (!copy) 13660 return NULL; 13661 return Py_BuildValue("(N)", copy); 13662} 13663 13664static PyMethodDef unicode_methods[] = { 13665 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 13666 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 13667 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 13668 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 13669 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 13670 {"capitalize", (PyCFunction) unicode_capitalize, 
METH_NOARGS, capitalize__doc__}, 13671 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 13672 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 13673 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 13674 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13675 {"expandtabs", (PyCFunction) unicode_expandtabs, 13676 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__}, 13677 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13678 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 13679 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 13680 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 13681 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 13682 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 13683 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 13684 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13685 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 13686 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 13687 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 13688 {"splitlines", (PyCFunction) unicode_splitlines, 13689 METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 13690 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 13691 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 13692 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 13693 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 13694 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13695 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13696 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 13697 {"isupper", (PyCFunction) 
unicode_isupper, METH_NOARGS, isupper__doc__}, 13698 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 13699 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 13700 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 13701 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 13702 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 13703 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 13704 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13705 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 13706 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13707 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13708 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 13709 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13710 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13711 UNICODE_MAKETRANS_METHODDEF 13712 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13713#if 0 13714 /* These methods are just used for debugging the implementation. 
*/ 13715 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13716#endif 13717 13718 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13719 {NULL, NULL} 13720}; 13721 13722static PyObject * 13723unicode_mod(PyObject *v, PyObject *w) 13724{ 13725 if (!PyUnicode_Check(v)) 13726 Py_RETURN_NOTIMPLEMENTED; 13727 return PyUnicode_Format(v, w); 13728} 13729 13730static PyNumberMethods unicode_as_number = { 13731 0, /*nb_add*/ 13732 0, /*nb_subtract*/ 13733 0, /*nb_multiply*/ 13734 unicode_mod, /*nb_remainder*/ 13735}; 13736 13737static PySequenceMethods unicode_as_sequence = { 13738 (lenfunc) unicode_length, /* sq_length */ 13739 PyUnicode_Concat, /* sq_concat */ 13740 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13741 (ssizeargfunc) unicode_getitem, /* sq_item */ 13742 0, /* sq_slice */ 13743 0, /* sq_ass_item */ 13744 0, /* sq_ass_slice */ 13745 PyUnicode_Contains, /* sq_contains */ 13746}; 13747 13748static PyObject* 13749unicode_subscript(PyObject* self, PyObject* item) 13750{ 13751 if (PyUnicode_READY(self) == -1) 13752 return NULL; 13753 13754 if (PyIndex_Check(item)) { 13755 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13756 if (i == -1 && PyErr_Occurred()) 13757 return NULL; 13758 if (i < 0) 13759 i += PyUnicode_GET_LENGTH(self); 13760 return unicode_getitem(self, i); 13761 } else if (PySlice_Check(item)) { 13762 Py_ssize_t start, stop, step, slicelength, cur, i; 13763 PyObject *result; 13764 void *src_data, *dest_data; 13765 int src_kind, dest_kind; 13766 Py_UCS4 ch, max_char, kind_limit; 13767 13768 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13769 &start, &stop, &step, &slicelength) < 0) { 13770 return NULL; 13771 } 13772 13773 if (slicelength <= 0) { 13774 _Py_RETURN_UNICODE_EMPTY(); 13775 } else if (start == 0 && step == 1 && 13776 slicelength == PyUnicode_GET_LENGTH(self)) { 13777 return unicode_result_unchanged(self); 13778 } else if (step == 1) { 13779 return PyUnicode_Substring(self, 13780 
start, start + slicelength); 13781 } 13782 /* General case */ 13783 src_kind = PyUnicode_KIND(self); 13784 src_data = PyUnicode_DATA(self); 13785 if (!PyUnicode_IS_ASCII(self)) { 13786 kind_limit = kind_maxchar_limit(src_kind); 13787 max_char = 0; 13788 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13789 ch = PyUnicode_READ(src_kind, src_data, cur); 13790 if (ch > max_char) { 13791 max_char = ch; 13792 if (max_char >= kind_limit) 13793 break; 13794 } 13795 } 13796 } 13797 else 13798 max_char = 127; 13799 result = PyUnicode_New(slicelength, max_char); 13800 if (result == NULL) 13801 return NULL; 13802 dest_kind = PyUnicode_KIND(result); 13803 dest_data = PyUnicode_DATA(result); 13804 13805 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13806 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13807 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13808 } 13809 assert(_PyUnicode_CheckConsistency(result, 1)); 13810 return result; 13811 } else { 13812 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13813 return NULL; 13814 } 13815} 13816 13817static PyMappingMethods unicode_as_mapping = { 13818 (lenfunc)unicode_length, /* mp_length */ 13819 (binaryfunc)unicode_subscript, /* mp_subscript */ 13820 (objobjargproc)0, /* mp_ass_subscript */ 13821}; 13822 13823 13824/* Helpers for PyUnicode_Format() */ 13825 13826struct unicode_formatter_t { 13827 PyObject *args; 13828 int args_owned; 13829 Py_ssize_t arglen, argidx; 13830 PyObject *dict; 13831 13832 enum PyUnicode_Kind fmtkind; 13833 Py_ssize_t fmtcnt, fmtpos; 13834 void *fmtdata; 13835 PyObject *fmtstr; 13836 13837 _PyUnicodeWriter writer; 13838}; 13839 13840struct unicode_format_arg_t { 13841 Py_UCS4 ch; 13842 int flags; 13843 Py_ssize_t width; 13844 int prec; 13845 int sign; 13846}; 13847 13848static PyObject * 13849unicode_format_getnextarg(struct unicode_formatter_t *ctx) 13850{ 13851 Py_ssize_t argidx = ctx->argidx; 13852 13853 if (argidx < ctx->arglen) { 13854 
ctx->argidx++; 13855 if (ctx->arglen < 0) 13856 return ctx->args; 13857 else 13858 return PyTuple_GetItem(ctx->args, argidx); 13859 } 13860 PyErr_SetString(PyExc_TypeError, 13861 "not enough arguments for format string"); 13862 return NULL; 13863} 13864 13865/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13866 13867/* Format a float into the writer if the writer is not NULL, or into *p_output 13868 otherwise. 13869 13870 Return 0 on success, raise an exception and return -1 on error. */ 13871static int 13872formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 13873 PyObject **p_output, 13874 _PyUnicodeWriter *writer) 13875{ 13876 char *p; 13877 double x; 13878 Py_ssize_t len; 13879 int prec; 13880 int dtoa_flags; 13881 13882 x = PyFloat_AsDouble(v); 13883 if (x == -1.0 && PyErr_Occurred()) 13884 return -1; 13885 13886 prec = arg->prec; 13887 if (prec < 0) 13888 prec = 6; 13889 13890 if (arg->flags & F_ALT) 13891 dtoa_flags = Py_DTSF_ALT; 13892 else 13893 dtoa_flags = 0; 13894 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 13895 if (p == NULL) 13896 return -1; 13897 len = strlen(p); 13898 if (writer) { 13899 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { 13900 PyMem_Free(p); 13901 return -1; 13902 } 13903 } 13904 else 13905 *p_output = _PyUnicode_FromASCII(p, len); 13906 PyMem_Free(p); 13907 return 0; 13908} 13909 13910/* formatlong() emulates the format codes d, u, o, x and X, and 13911 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13912 * Python's regular ints. 13913 * Return value: a new PyUnicodeObject*, or NULL if error. 13914 * The output string is of the form 13915 * "-"? ("0x" | "0X")? digit+ 13916 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13917 * set in flags. The case of hex digits will be correct, 13918 * There will be at least prec digits, zero-filled on the left if 13919 * necessary to get that many. 
13920 * val object to be converted 13921 * flags bitmask of format flags; only F_ALT is looked at 13922 * prec minimum number of digits; 0-fill on left if needed 13923 * type a character in [duoxX]; u acts the same as d 13924 * 13925 * CAUTION: o, x and X conversions on regular ints can never 13926 * produce a '-' sign, but can for Python's unbounded ints. 13927 */ 13928static PyObject* 13929formatlong(PyObject *val, struct unicode_format_arg_t *arg) 13930{ 13931 PyObject *result = NULL; 13932 char *buf; 13933 Py_ssize_t i; 13934 int sign; /* 1 if '-', else 0 */ 13935 int len; /* number of characters */ 13936 Py_ssize_t llen; 13937 int numdigits; /* len == numnondigits + numdigits */ 13938 int numnondigits = 0; 13939 int prec = arg->prec; 13940 int type = arg->ch; 13941 13942 /* Avoid exceeding SSIZE_T_MAX */ 13943 if (prec > INT_MAX-3) { 13944 PyErr_SetString(PyExc_OverflowError, 13945 "precision too large"); 13946 return NULL; 13947 } 13948 13949 assert(PyLong_Check(val)); 13950 13951 switch (type) { 13952 default: 13953 assert(!"'type' not in [diuoxX]"); 13954 case 'd': 13955 case 'i': 13956 case 'u': 13957 /* int and int subclasses should print numerically when a numeric */ 13958 /* format code is used (see issue18780) */ 13959 result = PyNumber_ToBase(val, 10); 13960 break; 13961 case 'o': 13962 numnondigits = 2; 13963 result = PyNumber_ToBase(val, 8); 13964 break; 13965 case 'x': 13966 case 'X': 13967 numnondigits = 2; 13968 result = PyNumber_ToBase(val, 16); 13969 break; 13970 } 13971 if (!result) 13972 return NULL; 13973 13974 assert(unicode_modifiable(result)); 13975 assert(PyUnicode_IS_READY(result)); 13976 assert(PyUnicode_IS_ASCII(result)); 13977 13978 /* To modify the string in-place, there can only be one reference. 
*/ 13979 if (Py_REFCNT(result) != 1) { 13980 Py_DECREF(result); 13981 PyErr_BadInternalCall(); 13982 return NULL; 13983 } 13984 buf = PyUnicode_DATA(result); 13985 llen = PyUnicode_GET_LENGTH(result); 13986 if (llen > INT_MAX) { 13987 Py_DECREF(result); 13988 PyErr_SetString(PyExc_ValueError, 13989 "string too large in _PyBytes_FormatLong"); 13990 return NULL; 13991 } 13992 len = (int)llen; 13993 sign = buf[0] == '-'; 13994 numnondigits += sign; 13995 numdigits = len - numnondigits; 13996 assert(numdigits > 0); 13997 13998 /* Get rid of base marker unless F_ALT */ 13999 if (((arg->flags & F_ALT) == 0 && 14000 (type == 'o' || type == 'x' || type == 'X'))) { 14001 assert(buf[sign] == '0'); 14002 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 14003 buf[sign+1] == 'o'); 14004 numnondigits -= 2; 14005 buf += 2; 14006 len -= 2; 14007 if (sign) 14008 buf[0] = '-'; 14009 assert(len == numnondigits + numdigits); 14010 assert(numdigits > 0); 14011 } 14012 14013 /* Fill with leading zeroes to meet minimum width. */ 14014 if (prec > numdigits) { 14015 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 14016 numnondigits + prec); 14017 char *b1; 14018 if (!r1) { 14019 Py_DECREF(result); 14020 return NULL; 14021 } 14022 b1 = PyBytes_AS_STRING(r1); 14023 for (i = 0; i < numnondigits; ++i) 14024 *b1++ = *buf++; 14025 for (i = 0; i < prec - numdigits; i++) 14026 *b1++ = '0'; 14027 for (i = 0; i < numdigits; i++) 14028 *b1++ = *buf++; 14029 *b1 = '\0'; 14030 Py_DECREF(result); 14031 result = r1; 14032 buf = PyBytes_AS_STRING(result); 14033 len = numnondigits + prec; 14034 } 14035 14036 /* Fix up case for hex conversions. */ 14037 if (type == 'X') { 14038 /* Need to convert all lower case letters to upper case. 14039 and need to convert 0x to 0X (and -0x to -0X). 
*/ 14040 for (i = 0; i < len; i++) 14041 if (buf[i] >= 'a' && buf[i] <= 'x') 14042 buf[i] -= 'a'-'A'; 14043 } 14044 if (!PyUnicode_Check(result) 14045 || buf != PyUnicode_DATA(result)) { 14046 PyObject *unicode; 14047 unicode = _PyUnicode_FromASCII(buf, len); 14048 Py_DECREF(result); 14049 result = unicode; 14050 } 14051 else if (len != PyUnicode_GET_LENGTH(result)) { 14052 if (PyUnicode_Resize(&result, len) < 0) 14053 Py_CLEAR(result); 14054 } 14055 return result; 14056} 14057 14058/* Format an integer or a float as an integer. 14059 * Return 1 if the number has been formatted into the writer, 14060 * 0 if the number has been formatted into *p_output 14061 * -1 and raise an exception on error */ 14062static int 14063mainformatlong(PyObject *v, 14064 struct unicode_format_arg_t *arg, 14065 PyObject **p_output, 14066 _PyUnicodeWriter *writer) 14067{ 14068 PyObject *iobj, *res; 14069 char type = (char)arg->ch; 14070 14071 if (!PyNumber_Check(v)) 14072 goto wrongtype; 14073 14074 /* make sure number is a type of integer for o, x, and X */ 14075 if (!PyLong_Check(v)) { 14076 if (type == 'o' || type == 'x' || type == 'X') { 14077 iobj = PyNumber_Index(v); 14078 if (iobj == NULL) { 14079 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14080 goto wrongtype; 14081 return -1; 14082 } 14083 } 14084 else { 14085 iobj = PyNumber_Long(v); 14086 if (iobj == NULL ) { 14087 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14088 goto wrongtype; 14089 return -1; 14090 } 14091 } 14092 assert(PyLong_Check(iobj)); 14093 } 14094 else { 14095 iobj = v; 14096 Py_INCREF(iobj); 14097 } 14098 14099 if (PyLong_CheckExact(v) 14100 && arg->width == -1 && arg->prec == -1 14101 && !(arg->flags & (F_SIGN | F_BLANK)) 14102 && type != 'X') 14103 { 14104 /* Fast path */ 14105 int alternate = arg->flags & F_ALT; 14106 int base; 14107 14108 switch(type) 14109 { 14110 default: 14111 assert(0 && "'type' not in [diuoxX]"); 14112 case 'd': 14113 case 'i': 14114 case 'u': 14115 base = 10; 14116 break; 14117 case 
'o': 14118 base = 8; 14119 break; 14120 case 'x': 14121 case 'X': 14122 base = 16; 14123 break; 14124 } 14125 14126 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 14127 Py_DECREF(iobj); 14128 return -1; 14129 } 14130 Py_DECREF(iobj); 14131 return 1; 14132 } 14133 14134 res = formatlong(iobj, arg); 14135 Py_DECREF(iobj); 14136 if (res == NULL) 14137 return -1; 14138 *p_output = res; 14139 return 0; 14140 14141wrongtype: 14142 switch(type) 14143 { 14144 case 'o': 14145 case 'x': 14146 case 'X': 14147 PyErr_Format(PyExc_TypeError, 14148 "%%%c format: an integer is required, " 14149 "not %.200s", 14150 type, Py_TYPE(v)->tp_name); 14151 break; 14152 default: 14153 PyErr_Format(PyExc_TypeError, 14154 "%%%c format: a number is required, " 14155 "not %.200s", 14156 type, Py_TYPE(v)->tp_name); 14157 break; 14158 } 14159 return -1; 14160} 14161 14162static Py_UCS4 14163formatchar(PyObject *v) 14164{ 14165 /* presume that the buffer is at least 3 characters long */ 14166 if (PyUnicode_Check(v)) { 14167 if (PyUnicode_GET_LENGTH(v) == 1) { 14168 return PyUnicode_READ_CHAR(v, 0); 14169 } 14170 goto onError; 14171 } 14172 else { 14173 PyObject *iobj; 14174 long x; 14175 /* make sure number is a type of integer */ 14176 if (!PyLong_Check(v)) { 14177 iobj = PyNumber_Index(v); 14178 if (iobj == NULL) { 14179 goto onError; 14180 } 14181 v = iobj; 14182 Py_DECREF(iobj); 14183 } 14184 /* Integer input truncated to a character */ 14185 x = PyLong_AsLong(v); 14186 if (x == -1 && PyErr_Occurred()) 14187 goto onError; 14188 14189 if (x < 0 || x > MAX_UNICODE) { 14190 PyErr_SetString(PyExc_OverflowError, 14191 "%c arg not in range(0x110000)"); 14192 return (Py_UCS4) -1; 14193 } 14194 14195 return (Py_UCS4) x; 14196 } 14197 14198 onError: 14199 PyErr_SetString(PyExc_TypeError, 14200 "%c requires int or char"); 14201 return (Py_UCS4) -1; 14202} 14203 14204/* Parse options of an argument: flags, width, precision. 14205 Handle also "%(name)" syntax. 
14206 14207 Return 0 if the argument has been formatted into arg->str. 14208 Return 1 if the argument has been written into ctx->writer, 14209 Raise an exception and return -1 on error. */ 14210static int 14211unicode_format_arg_parse(struct unicode_formatter_t *ctx, 14212 struct unicode_format_arg_t *arg) 14213{ 14214#define FORMAT_READ(ctx) \ 14215 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 14216 14217 PyObject *v; 14218 14219 if (arg->ch == '(') { 14220 /* Get argument value from a dictionary. Example: "%(name)s". */ 14221 Py_ssize_t keystart; 14222 Py_ssize_t keylen; 14223 PyObject *key; 14224 int pcount = 1; 14225 14226 if (ctx->dict == NULL) { 14227 PyErr_SetString(PyExc_TypeError, 14228 "format requires a mapping"); 14229 return -1; 14230 } 14231 ++ctx->fmtpos; 14232 --ctx->fmtcnt; 14233 keystart = ctx->fmtpos; 14234 /* Skip over balanced parentheses */ 14235 while (pcount > 0 && --ctx->fmtcnt >= 0) { 14236 arg->ch = FORMAT_READ(ctx); 14237 if (arg->ch == ')') 14238 --pcount; 14239 else if (arg->ch == '(') 14240 ++pcount; 14241 ctx->fmtpos++; 14242 } 14243 keylen = ctx->fmtpos - keystart - 1; 14244 if (ctx->fmtcnt < 0 || pcount > 0) { 14245 PyErr_SetString(PyExc_ValueError, 14246 "incomplete format key"); 14247 return -1; 14248 } 14249 key = PyUnicode_Substring(ctx->fmtstr, 14250 keystart, keystart + keylen); 14251 if (key == NULL) 14252 return -1; 14253 if (ctx->args_owned) { 14254 Py_DECREF(ctx->args); 14255 ctx->args_owned = 0; 14256 } 14257 ctx->args = PyObject_GetItem(ctx->dict, key); 14258 Py_DECREF(key); 14259 if (ctx->args == NULL) 14260 return -1; 14261 ctx->args_owned = 1; 14262 ctx->arglen = -1; 14263 ctx->argidx = -2; 14264 } 14265 14266 /* Parse flags. Example: "%+i" => flags=F_SIGN. 
*/ 14267 while (--ctx->fmtcnt >= 0) { 14268 arg->ch = FORMAT_READ(ctx); 14269 ctx->fmtpos++; 14270 switch (arg->ch) { 14271 case '-': arg->flags |= F_LJUST; continue; 14272 case '+': arg->flags |= F_SIGN; continue; 14273 case ' ': arg->flags |= F_BLANK; continue; 14274 case '#': arg->flags |= F_ALT; continue; 14275 case '0': arg->flags |= F_ZERO; continue; 14276 } 14277 break; 14278 } 14279 14280 /* Parse width. Example: "%10s" => width=10 */ 14281 if (arg->ch == '*') { 14282 v = unicode_format_getnextarg(ctx); 14283 if (v == NULL) 14284 return -1; 14285 if (!PyLong_Check(v)) { 14286 PyErr_SetString(PyExc_TypeError, 14287 "* wants int"); 14288 return -1; 14289 } 14290 arg->width = PyLong_AsSsize_t(v); 14291 if (arg->width == -1 && PyErr_Occurred()) 14292 return -1; 14293 if (arg->width < 0) { 14294 arg->flags |= F_LJUST; 14295 arg->width = -arg->width; 14296 } 14297 if (--ctx->fmtcnt >= 0) { 14298 arg->ch = FORMAT_READ(ctx); 14299 ctx->fmtpos++; 14300 } 14301 } 14302 else if (arg->ch >= '0' && arg->ch <= '9') { 14303 arg->width = arg->ch - '0'; 14304 while (--ctx->fmtcnt >= 0) { 14305 arg->ch = FORMAT_READ(ctx); 14306 ctx->fmtpos++; 14307 if (arg->ch < '0' || arg->ch > '9') 14308 break; 14309 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 14310 mixing signed and unsigned comparison. Since arg->ch is between 14311 '0' and '9', casting to int is safe. */ 14312 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 14313 PyErr_SetString(PyExc_ValueError, 14314 "width too big"); 14315 return -1; 14316 } 14317 arg->width = arg->width*10 + (arg->ch - '0'); 14318 } 14319 } 14320 14321 /* Parse precision. 
Example: "%.3f" => prec=3 */ 14322 if (arg->ch == '.') { 14323 arg->prec = 0; 14324 if (--ctx->fmtcnt >= 0) { 14325 arg->ch = FORMAT_READ(ctx); 14326 ctx->fmtpos++; 14327 } 14328 if (arg->ch == '*') { 14329 v = unicode_format_getnextarg(ctx); 14330 if (v == NULL) 14331 return -1; 14332 if (!PyLong_Check(v)) { 14333 PyErr_SetString(PyExc_TypeError, 14334 "* wants int"); 14335 return -1; 14336 } 14337 arg->prec = _PyLong_AsInt(v); 14338 if (arg->prec == -1 && PyErr_Occurred()) 14339 return -1; 14340 if (arg->prec < 0) 14341 arg->prec = 0; 14342 if (--ctx->fmtcnt >= 0) { 14343 arg->ch = FORMAT_READ(ctx); 14344 ctx->fmtpos++; 14345 } 14346 } 14347 else if (arg->ch >= '0' && arg->ch <= '9') { 14348 arg->prec = arg->ch - '0'; 14349 while (--ctx->fmtcnt >= 0) { 14350 arg->ch = FORMAT_READ(ctx); 14351 ctx->fmtpos++; 14352 if (arg->ch < '0' || arg->ch > '9') 14353 break; 14354 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 14355 PyErr_SetString(PyExc_ValueError, 14356 "precision too big"); 14357 return -1; 14358 } 14359 arg->prec = arg->prec*10 + (arg->ch - '0'); 14360 } 14361 } 14362 } 14363 14364 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 14365 if (ctx->fmtcnt >= 0) { 14366 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 14367 if (--ctx->fmtcnt >= 0) { 14368 arg->ch = FORMAT_READ(ctx); 14369 ctx->fmtpos++; 14370 } 14371 } 14372 } 14373 if (ctx->fmtcnt < 0) { 14374 PyErr_SetString(PyExc_ValueError, 14375 "incomplete format"); 14376 return -1; 14377 } 14378 return 0; 14379 14380#undef FORMAT_READ 14381} 14382 14383/* Format one argument. Supported conversion specifiers: 14384 14385 - "s", "r", "a": any type 14386 - "i", "d", "u": int or float 14387 - "o", "x", "X": int 14388 - "e", "E", "f", "F", "g", "G": float 14389 - "c": int or str (1 character) 14390 14391 When possible, the output is written directly into the Unicode writer 14392 (ctx->writer). A string is created when padding is required. 
14393 14394 Return 0 if the argument has been formatted into *p_str, 14395 1 if the argument has been written into ctx->writer, 14396 -1 on error. */ 14397static int 14398unicode_format_arg_format(struct unicode_formatter_t *ctx, 14399 struct unicode_format_arg_t *arg, 14400 PyObject **p_str) 14401{ 14402 PyObject *v; 14403 _PyUnicodeWriter *writer = &ctx->writer; 14404 14405 if (ctx->fmtcnt == 0) 14406 ctx->writer.overallocate = 0; 14407 14408 if (arg->ch == '%') { 14409 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 14410 return -1; 14411 return 1; 14412 } 14413 14414 v = unicode_format_getnextarg(ctx); 14415 if (v == NULL) 14416 return -1; 14417 14418 14419 switch (arg->ch) { 14420 case 's': 14421 case 'r': 14422 case 'a': 14423 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { 14424 /* Fast path */ 14425 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) 14426 return -1; 14427 return 1; 14428 } 14429 14430 if (PyUnicode_CheckExact(v) && arg->ch == 's') { 14431 *p_str = v; 14432 Py_INCREF(*p_str); 14433 } 14434 else { 14435 if (arg->ch == 's') 14436 *p_str = PyObject_Str(v); 14437 else if (arg->ch == 'r') 14438 *p_str = PyObject_Repr(v); 14439 else 14440 *p_str = PyObject_ASCII(v); 14441 } 14442 break; 14443 14444 case 'i': 14445 case 'd': 14446 case 'u': 14447 case 'o': 14448 case 'x': 14449 case 'X': 14450 { 14451 int ret = mainformatlong(v, arg, p_str, writer); 14452 if (ret != 0) 14453 return ret; 14454 arg->sign = 1; 14455 break; 14456 } 14457 14458 case 'e': 14459 case 'E': 14460 case 'f': 14461 case 'F': 14462 case 'g': 14463 case 'G': 14464 if (arg->width == -1 && arg->prec == -1 14465 && !(arg->flags & (F_SIGN | F_BLANK))) 14466 { 14467 /* Fast path */ 14468 if (formatfloat(v, arg, NULL, writer) == -1) 14469 return -1; 14470 return 1; 14471 } 14472 14473 arg->sign = 1; 14474 if (formatfloat(v, arg, p_str, NULL) == -1) 14475 return -1; 14476 break; 14477 14478 case 'c': 14479 { 14480 Py_UCS4 ch = formatchar(v); 
14481 if (ch == (Py_UCS4) -1) 14482 return -1; 14483 if (arg->width == -1 && arg->prec == -1) { 14484 /* Fast path */ 14485 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) 14486 return -1; 14487 return 1; 14488 } 14489 *p_str = PyUnicode_FromOrdinal(ch); 14490 break; 14491 } 14492 14493 default: 14494 PyErr_Format(PyExc_ValueError, 14495 "unsupported format character '%c' (0x%x) " 14496 "at index %zd", 14497 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', 14498 (int)arg->ch, 14499 ctx->fmtpos - 1); 14500 return -1; 14501 } 14502 if (*p_str == NULL) 14503 return -1; 14504 assert (PyUnicode_Check(*p_str)); 14505 return 0; 14506} 14507 14508static int 14509unicode_format_arg_output(struct unicode_formatter_t *ctx, 14510 struct unicode_format_arg_t *arg, 14511 PyObject *str) 14512{ 14513 Py_ssize_t len; 14514 enum PyUnicode_Kind kind; 14515 void *pbuf; 14516 Py_ssize_t pindex; 14517 Py_UCS4 signchar; 14518 Py_ssize_t buflen; 14519 Py_UCS4 maxchar; 14520 Py_ssize_t sublen; 14521 _PyUnicodeWriter *writer = &ctx->writer; 14522 Py_UCS4 fill; 14523 14524 fill = ' '; 14525 if (arg->sign && arg->flags & F_ZERO) 14526 fill = '0'; 14527 14528 if (PyUnicode_READY(str) == -1) 14529 return -1; 14530 14531 len = PyUnicode_GET_LENGTH(str); 14532 if ((arg->width == -1 || arg->width <= len) 14533 && (arg->prec == -1 || arg->prec >= len) 14534 && !(arg->flags & (F_SIGN | F_BLANK))) 14535 { 14536 /* Fast path */ 14537 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) 14538 return -1; 14539 return 0; 14540 } 14541 14542 /* Truncate the string for "s", "r" and "a" formats 14543 if the precision is set */ 14544 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { 14545 if (arg->prec >= 0 && len > arg->prec) 14546 len = arg->prec; 14547 } 14548 14549 /* Adjust sign and width */ 14550 kind = PyUnicode_KIND(str); 14551 pbuf = PyUnicode_DATA(str); 14552 pindex = 0; 14553 signchar = '\0'; 14554 if (arg->sign) { 14555 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 14556 if (ch 
== '-' || ch == '+') { 14557 signchar = ch; 14558 len--; 14559 pindex++; 14560 } 14561 else if (arg->flags & F_SIGN) 14562 signchar = '+'; 14563 else if (arg->flags & F_BLANK) 14564 signchar = ' '; 14565 else 14566 arg->sign = 0; 14567 } 14568 if (arg->width < len) 14569 arg->width = len; 14570 14571 /* Prepare the writer */ 14572 maxchar = writer->maxchar; 14573 if (!(arg->flags & F_LJUST)) { 14574 if (arg->sign) { 14575 if ((arg->width-1) > len) 14576 maxchar = Py_MAX(maxchar, fill); 14577 } 14578 else { 14579 if (arg->width > len) 14580 maxchar = Py_MAX(maxchar, fill); 14581 } 14582 } 14583 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { 14584 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); 14585 maxchar = Py_MAX(maxchar, strmaxchar); 14586 } 14587 14588 buflen = arg->width; 14589 if (arg->sign && len == arg->width) 14590 buflen++; 14591 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) 14592 return -1; 14593 14594 /* Write the sign if needed */ 14595 if (arg->sign) { 14596 if (fill != ' ') { 14597 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14598 writer->pos += 1; 14599 } 14600 if (arg->width > len) 14601 arg->width--; 14602 } 14603 14604 /* Write the numeric prefix for "x", "X" and "o" formats 14605 if the alternate form is used. 14606 For example, write "0x" for the "%#x" format. 
*/ 14607 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14608 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14609 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); 14610 if (fill != ' ') { 14611 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14612 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14613 writer->pos += 2; 14614 pindex += 2; 14615 } 14616 arg->width -= 2; 14617 if (arg->width < 0) 14618 arg->width = 0; 14619 len -= 2; 14620 } 14621 14622 /* Pad left with the fill character if needed */ 14623 if (arg->width > len && !(arg->flags & F_LJUST)) { 14624 sublen = arg->width - len; 14625 FILL(writer->kind, writer->data, fill, writer->pos, sublen); 14626 writer->pos += sublen; 14627 arg->width = len; 14628 } 14629 14630 /* If padding with spaces: write sign if needed and/or numeric prefix if 14631 the alternate form is used */ 14632 if (fill == ' ') { 14633 if (arg->sign) { 14634 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14635 writer->pos += 1; 14636 } 14637 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14638 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14639 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); 14640 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14641 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14642 writer->pos += 2; 14643 pindex += 2; 14644 } 14645 } 14646 14647 /* Write characters */ 14648 if (len) { 14649 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 14650 str, pindex, len); 14651 writer->pos += len; 14652 } 14653 14654 /* Pad right with the fill character if needed */ 14655 if (arg->width > len) { 14656 sublen = arg->width - len; 14657 FILL(writer->kind, writer->data, ' ', writer->pos, sublen); 14658 writer->pos += sublen; 14659 } 14660 return 0; 14661} 14662 14663/* Helper of PyUnicode_Format(): format one arg. 
14664 Return 0 on success, raise an exception and return -1 on error. */ 14665static int 14666unicode_format_arg(struct unicode_formatter_t *ctx) 14667{ 14668 struct unicode_format_arg_t arg; 14669 PyObject *str; 14670 int ret; 14671 14672 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); 14673 arg.flags = 0; 14674 arg.width = -1; 14675 arg.prec = -1; 14676 arg.sign = 0; 14677 str = NULL; 14678 14679 ret = unicode_format_arg_parse(ctx, &arg); 14680 if (ret == -1) 14681 return -1; 14682 14683 ret = unicode_format_arg_format(ctx, &arg, &str); 14684 if (ret == -1) 14685 return -1; 14686 14687 if (ret != 1) { 14688 ret = unicode_format_arg_output(ctx, &arg, str); 14689 Py_DECREF(str); 14690 if (ret == -1) 14691 return -1; 14692 } 14693 14694 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') { 14695 PyErr_SetString(PyExc_TypeError, 14696 "not all arguments converted during string formatting"); 14697 return -1; 14698 } 14699 return 0; 14700} 14701 14702PyObject * 14703PyUnicode_Format(PyObject *format, PyObject *args) 14704{ 14705 struct unicode_formatter_t ctx; 14706 14707 if (format == NULL || args == NULL) { 14708 PyErr_BadInternalCall(); 14709 return NULL; 14710 } 14711 14712 ctx.fmtstr = PyUnicode_FromObject(format); 14713 if (ctx.fmtstr == NULL) 14714 return NULL; 14715 if (PyUnicode_READY(ctx.fmtstr) == -1) { 14716 Py_DECREF(ctx.fmtstr); 14717 return NULL; 14718 } 14719 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 14720 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 14721 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 14722 ctx.fmtpos = 0; 14723 14724 _PyUnicodeWriter_Init(&ctx.writer); 14725 ctx.writer.min_length = ctx.fmtcnt + 100; 14726 ctx.writer.overallocate = 1; 14727 14728 if (PyTuple_Check(args)) { 14729 ctx.arglen = PyTuple_Size(args); 14730 ctx.argidx = 0; 14731 } 14732 else { 14733 ctx.arglen = -1; 14734 ctx.argidx = -2; 14735 } 14736 ctx.args_owned = 0; 14737 if (PyMapping_Check(args) && !PyTuple_Check(args) && 
!PyUnicode_Check(args)) 14738 ctx.dict = args; 14739 else 14740 ctx.dict = NULL; 14741 ctx.args = args; 14742 14743 while (--ctx.fmtcnt >= 0) { 14744 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14745 Py_ssize_t nonfmtpos; 14746 14747 nonfmtpos = ctx.fmtpos++; 14748 while (ctx.fmtcnt >= 0 && 14749 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14750 ctx.fmtpos++; 14751 ctx.fmtcnt--; 14752 } 14753 if (ctx.fmtcnt < 0) { 14754 ctx.fmtpos--; 14755 ctx.writer.overallocate = 0; 14756 } 14757 14758 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, 14759 nonfmtpos, ctx.fmtpos) < 0) 14760 goto onError; 14761 } 14762 else { 14763 ctx.fmtpos++; 14764 if (unicode_format_arg(&ctx) == -1) 14765 goto onError; 14766 } 14767 } 14768 14769 if (ctx.argidx < ctx.arglen && !ctx.dict) { 14770 PyErr_SetString(PyExc_TypeError, 14771 "not all arguments converted during string formatting"); 14772 goto onError; 14773 } 14774 14775 if (ctx.args_owned) { 14776 Py_DECREF(ctx.args); 14777 } 14778 Py_DECREF(ctx.fmtstr); 14779 return _PyUnicodeWriter_Finish(&ctx.writer); 14780 14781 onError: 14782 Py_DECREF(ctx.fmtstr); 14783 _PyUnicodeWriter_Dealloc(&ctx.writer); 14784 if (ctx.args_owned) { 14785 Py_DECREF(ctx.args); 14786 } 14787 return NULL; 14788} 14789 14790static PyObject * 14791unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 14792 14793static PyObject * 14794unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14795{ 14796 PyObject *x = NULL; 14797 static char *kwlist[] = {"object", "encoding", "errors", 0}; 14798 char *encoding = NULL; 14799 char *errors = NULL; 14800 14801 if (type != &PyUnicode_Type) 14802 return unicode_subtype_new(type, args, kwds); 14803 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 14804 kwlist, &x, &encoding, &errors)) 14805 return NULL; 14806 if (x == NULL) 14807 _Py_RETURN_UNICODE_EMPTY(); 14808 if (encoding == NULL && errors == NULL) 14809 return PyObject_Str(x); 14810 
else 14811 return PyUnicode_FromEncodedObject(x, encoding, errors); 14812} 14813 14814static PyObject * 14815unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14816{ 14817 PyObject *unicode, *self; 14818 Py_ssize_t length, char_size; 14819 int share_wstr, share_utf8; 14820 unsigned int kind; 14821 void *data; 14822 14823 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 14824 14825 unicode = unicode_new(&PyUnicode_Type, args, kwds); 14826 if (unicode == NULL) 14827 return NULL; 14828 assert(_PyUnicode_CHECK(unicode)); 14829 if (PyUnicode_READY(unicode) == -1) { 14830 Py_DECREF(unicode); 14831 return NULL; 14832 } 14833 14834 self = type->tp_alloc(type, 0); 14835 if (self == NULL) { 14836 Py_DECREF(unicode); 14837 return NULL; 14838 } 14839 kind = PyUnicode_KIND(unicode); 14840 length = PyUnicode_GET_LENGTH(unicode); 14841 14842 _PyUnicode_LENGTH(self) = length; 14843#ifdef Py_DEBUG 14844 _PyUnicode_HASH(self) = -1; 14845#else 14846 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14847#endif 14848 _PyUnicode_STATE(self).interned = 0; 14849 _PyUnicode_STATE(self).kind = kind; 14850 _PyUnicode_STATE(self).compact = 0; 14851 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14852 _PyUnicode_STATE(self).ready = 1; 14853 _PyUnicode_WSTR(self) = NULL; 14854 _PyUnicode_UTF8_LENGTH(self) = 0; 14855 _PyUnicode_UTF8(self) = NULL; 14856 _PyUnicode_WSTR_LENGTH(self) = 0; 14857 _PyUnicode_DATA_ANY(self) = NULL; 14858 14859 share_utf8 = 0; 14860 share_wstr = 0; 14861 if (kind == PyUnicode_1BYTE_KIND) { 14862 char_size = 1; 14863 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14864 share_utf8 = 1; 14865 } 14866 else if (kind == PyUnicode_2BYTE_KIND) { 14867 char_size = 2; 14868 if (sizeof(wchar_t) == 2) 14869 share_wstr = 1; 14870 } 14871 else { 14872 assert(kind == PyUnicode_4BYTE_KIND); 14873 char_size = 4; 14874 if (sizeof(wchar_t) == 4) 14875 share_wstr = 1; 14876 } 14877 14878 /* Ensure we won't overflow the length. 
*/ 14879 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14880 PyErr_NoMemory(); 14881 goto onError; 14882 } 14883 data = PyObject_MALLOC((length + 1) * char_size); 14884 if (data == NULL) { 14885 PyErr_NoMemory(); 14886 goto onError; 14887 } 14888 14889 _PyUnicode_DATA_ANY(self) = data; 14890 if (share_utf8) { 14891 _PyUnicode_UTF8_LENGTH(self) = length; 14892 _PyUnicode_UTF8(self) = data; 14893 } 14894 if (share_wstr) { 14895 _PyUnicode_WSTR_LENGTH(self) = length; 14896 _PyUnicode_WSTR(self) = (wchar_t *)data; 14897 } 14898 14899 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14900 kind * (length + 1)); 14901 assert(_PyUnicode_CheckConsistency(self, 1)); 14902#ifdef Py_DEBUG 14903 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14904#endif 14905 Py_DECREF(unicode); 14906 return self; 14907 14908onError: 14909 Py_DECREF(unicode); 14910 Py_DECREF(self); 14911 return NULL; 14912} 14913 14914PyDoc_STRVAR(unicode_doc, 14915"str(object='') -> str\n\ 14916str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 14917\n\ 14918Create a new string object from the given object. 
If encoding or\n\ 14919errors is specified, then the object must expose a data buffer\n\ 14920that will be decoded using the given encoding and error handler.\n\ 14921Otherwise, returns the result of object.__str__() (if defined)\n\ 14922or repr(object).\n\ 14923encoding defaults to sys.getdefaultencoding().\n\ 14924errors defaults to 'strict'."); 14925 14926static PyObject *unicode_iter(PyObject *seq); 14927 14928PyTypeObject PyUnicode_Type = { 14929 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14930 "str", /* tp_name */ 14931 sizeof(PyUnicodeObject), /* tp_size */ 14932 0, /* tp_itemsize */ 14933 /* Slots */ 14934 (destructor)unicode_dealloc, /* tp_dealloc */ 14935 0, /* tp_print */ 14936 0, /* tp_getattr */ 14937 0, /* tp_setattr */ 14938 0, /* tp_reserved */ 14939 unicode_repr, /* tp_repr */ 14940 &unicode_as_number, /* tp_as_number */ 14941 &unicode_as_sequence, /* tp_as_sequence */ 14942 &unicode_as_mapping, /* tp_as_mapping */ 14943 (hashfunc) unicode_hash, /* tp_hash*/ 14944 0, /* tp_call*/ 14945 (reprfunc) unicode_str, /* tp_str */ 14946 PyObject_GenericGetAttr, /* tp_getattro */ 14947 0, /* tp_setattro */ 14948 0, /* tp_as_buffer */ 14949 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14950 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14951 unicode_doc, /* tp_doc */ 14952 0, /* tp_traverse */ 14953 0, /* tp_clear */ 14954 PyUnicode_RichCompare, /* tp_richcompare */ 14955 0, /* tp_weaklistoffset */ 14956 unicode_iter, /* tp_iter */ 14957 0, /* tp_iternext */ 14958 unicode_methods, /* tp_methods */ 14959 0, /* tp_members */ 14960 0, /* tp_getset */ 14961 &PyBaseObject_Type, /* tp_base */ 14962 0, /* tp_dict */ 14963 0, /* tp_descr_get */ 14964 0, /* tp_descr_set */ 14965 0, /* tp_dictoffset */ 14966 0, /* tp_init */ 14967 0, /* tp_alloc */ 14968 unicode_new, /* tp_new */ 14969 PyObject_Del, /* tp_free */ 14970}; 14971 14972/* Initialize the Unicode implementation */ 14973 14974int _PyUnicode_Init(void) 14975{ 14976 /* XXX - move this array to unicodectype.c ? 
*/ 14977 Py_UCS2 linebreak[] = { 14978 0x000A, /* LINE FEED */ 14979 0x000D, /* CARRIAGE RETURN */ 14980 0x001C, /* FILE SEPARATOR */ 14981 0x001D, /* GROUP SEPARATOR */ 14982 0x001E, /* RECORD SEPARATOR */ 14983 0x0085, /* NEXT LINE */ 14984 0x2028, /* LINE SEPARATOR */ 14985 0x2029, /* PARAGRAPH SEPARATOR */ 14986 }; 14987 14988 /* Init the implementation */ 14989 _Py_INCREF_UNICODE_EMPTY(); 14990 if (!unicode_empty) 14991 Py_FatalError("Can't create empty string"); 14992 Py_DECREF(unicode_empty); 14993 14994 if (PyType_Ready(&PyUnicode_Type) < 0) 14995 Py_FatalError("Can't initialize 'unicode'"); 14996 14997 /* initialize the linebreak bloom filter */ 14998 bloom_linebreak = make_bloom_mask( 14999 PyUnicode_2BYTE_KIND, linebreak, 15000 Py_ARRAY_LENGTH(linebreak)); 15001 15002 if (PyType_Ready(&EncodingMapType) < 0) 15003 Py_FatalError("Can't initialize encoding map type"); 15004 15005 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 15006 Py_FatalError("Can't initialize field name iterator type"); 15007 15008 if (PyType_Ready(&PyFormatterIter_Type) < 0) 15009 Py_FatalError("Can't initialize formatter iter type"); 15010 15011#ifdef HAVE_MBCS 15012 winver.dwOSVersionInfoSize = sizeof(winver); 15013 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 15014 PyErr_SetFromWindowsErr(0); 15015 return -1; 15016 } 15017#endif 15018 return 0; 15019} 15020 15021/* Finalize the Unicode implementation */ 15022 15023int 15024PyUnicode_ClearFreeList(void) 15025{ 15026 return 0; 15027} 15028 15029void 15030_PyUnicode_Fini(void) 15031{ 15032 int i; 15033 15034 Py_CLEAR(unicode_empty); 15035 15036 for (i = 0; i < 256; i++) 15037 Py_CLEAR(unicode_latin1[i]); 15038 _PyUnicode_ClearStaticStrings(); 15039 (void)PyUnicode_ClearFreeList(); 15040} 15041 15042void 15043PyUnicode_InternInPlace(PyObject **p) 15044{ 15045 PyObject *s = *p; 15046 PyObject *t; 15047#ifdef Py_DEBUG 15048 assert(s != NULL); 15049 assert(_PyUnicode_CHECK(s)); 15050#else 15051 if (s == NULL || !PyUnicode_Check(s)) 15052 
return; 15053#endif 15054 /* If it's a subclass, we don't really know what putting 15055 it in the interned dict might do. */ 15056 if (!PyUnicode_CheckExact(s)) 15057 return; 15058 if (PyUnicode_CHECK_INTERNED(s)) 15059 return; 15060 if (interned == NULL) { 15061 interned = PyDict_New(); 15062 if (interned == NULL) { 15063 PyErr_Clear(); /* Don't leave an exception */ 15064 return; 15065 } 15066 } 15067 /* It might be that the GetItem call fails even 15068 though the key is present in the dictionary, 15069 namely when this happens during a stack overflow. */ 15070 Py_ALLOW_RECURSION 15071 t = PyDict_GetItem(interned, s); 15072 Py_END_ALLOW_RECURSION 15073 15074 if (t) { 15075 Py_INCREF(t); 15076 Py_DECREF(*p); 15077 *p = t; 15078 return; 15079 } 15080 15081 PyThreadState_GET()->recursion_critical = 1; 15082 if (PyDict_SetItem(interned, s, s) < 0) { 15083 PyErr_Clear(); 15084 PyThreadState_GET()->recursion_critical = 0; 15085 return; 15086 } 15087 PyThreadState_GET()->recursion_critical = 0; 15088 /* The two references in interned are not counted by refcnt. 
15089 The deallocator will take care of this */ 15090 Py_REFCNT(s) -= 2; 15091 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 15092} 15093 15094void 15095PyUnicode_InternImmortal(PyObject **p) 15096{ 15097 PyUnicode_InternInPlace(p); 15098 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 15099 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 15100 Py_INCREF(*p); 15101 } 15102} 15103 15104PyObject * 15105PyUnicode_InternFromString(const char *cp) 15106{ 15107 PyObject *s = PyUnicode_FromString(cp); 15108 if (s == NULL) 15109 return NULL; 15110 PyUnicode_InternInPlace(&s); 15111 return s; 15112} 15113 15114void 15115_Py_ReleaseInternedUnicodeStrings(void) 15116{ 15117 PyObject *keys; 15118 PyObject *s; 15119 Py_ssize_t i, n; 15120 Py_ssize_t immortal_size = 0, mortal_size = 0; 15121 15122 if (interned == NULL || !PyDict_Check(interned)) 15123 return; 15124 keys = PyDict_Keys(interned); 15125 if (keys == NULL || !PyList_Check(keys)) { 15126 PyErr_Clear(); 15127 return; 15128 } 15129 15130 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 15131 detector, interned unicode strings are not forcibly deallocated; 15132 rather, we give them their stolen references back, and then clear 15133 and DECREF the interned dict. 
*/ 15134 15135 n = PyList_GET_SIZE(keys); 15136 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 15137 n); 15138 for (i = 0; i < n; i++) { 15139 s = PyList_GET_ITEM(keys, i); 15140 if (PyUnicode_READY(s) == -1) { 15141 assert(0 && "could not ready string"); 15142 fprintf(stderr, "could not ready string\n"); 15143 } 15144 switch (PyUnicode_CHECK_INTERNED(s)) { 15145 case SSTATE_NOT_INTERNED: 15146 /* XXX Shouldn't happen */ 15147 break; 15148 case SSTATE_INTERNED_IMMORTAL: 15149 Py_REFCNT(s) += 1; 15150 immortal_size += PyUnicode_GET_LENGTH(s); 15151 break; 15152 case SSTATE_INTERNED_MORTAL: 15153 Py_REFCNT(s) += 2; 15154 mortal_size += PyUnicode_GET_LENGTH(s); 15155 break; 15156 default: 15157 Py_FatalError("Inconsistent interned string state."); 15158 } 15159 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 15160 } 15161 fprintf(stderr, "total size of all interned strings: " 15162 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 15163 "mortal/immortal\n", mortal_size, immortal_size); 15164 Py_DECREF(keys); 15165 PyDict_Clear(interned); 15166 Py_CLEAR(interned); 15167} 15168 15169 15170/********************* Unicode Iterator **************************/ 15171 15172typedef struct { 15173 PyObject_HEAD 15174 Py_ssize_t it_index; 15175 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 15176} unicodeiterobject; 15177 15178static void 15179unicodeiter_dealloc(unicodeiterobject *it) 15180{ 15181 _PyObject_GC_UNTRACK(it); 15182 Py_XDECREF(it->it_seq); 15183 PyObject_GC_Del(it); 15184} 15185 15186static int 15187unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 15188{ 15189 Py_VISIT(it->it_seq); 15190 return 0; 15191} 15192 15193static PyObject * 15194unicodeiter_next(unicodeiterobject *it) 15195{ 15196 PyObject *seq, *item; 15197 15198 assert(it != NULL); 15199 seq = it->it_seq; 15200 if (seq == NULL) 15201 return NULL; 15202 assert(_PyUnicode_CHECK(seq)); 15203 15204 if (it->it_index < PyUnicode_GET_LENGTH(seq)) 
{ 15205 int kind = PyUnicode_KIND(seq); 15206 void *data = PyUnicode_DATA(seq); 15207 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 15208 item = PyUnicode_FromOrdinal(chr); 15209 if (item != NULL) 15210 ++it->it_index; 15211 return item; 15212 } 15213 15214 Py_DECREF(seq); 15215 it->it_seq = NULL; 15216 return NULL; 15217} 15218 15219static PyObject * 15220unicodeiter_len(unicodeiterobject *it) 15221{ 15222 Py_ssize_t len = 0; 15223 if (it->it_seq) 15224 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 15225 return PyLong_FromSsize_t(len); 15226} 15227 15228PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 15229 15230static PyObject * 15231unicodeiter_reduce(unicodeiterobject *it) 15232{ 15233 if (it->it_seq != NULL) { 15234 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 15235 it->it_seq, it->it_index); 15236 } else { 15237 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 15238 if (u == NULL) 15239 return NULL; 15240 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 15241 } 15242} 15243 15244PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 15245 15246static PyObject * 15247unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 15248{ 15249 Py_ssize_t index = PyLong_AsSsize_t(state); 15250 if (index == -1 && PyErr_Occurred()) 15251 return NULL; 15252 if (it->it_seq != NULL) { 15253 if (index < 0) 15254 index = 0; 15255 else if (index > PyUnicode_GET_LENGTH(it->it_seq)) 15256 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */ 15257 it->it_index = index; 15258 } 15259 Py_RETURN_NONE; 15260} 15261 15262PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 15263 15264static PyMethodDef unicodeiter_methods[] = { 15265 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 15266 length_hint_doc}, 15267 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 15268 reduce_doc}, 15269 {"__setstate__", 
(PyCFunction)unicodeiter_setstate, METH_O, 15270 setstate_doc}, 15271 {NULL, NULL} /* sentinel */ 15272}; 15273 15274PyTypeObject PyUnicodeIter_Type = { 15275 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15276 "str_iterator", /* tp_name */ 15277 sizeof(unicodeiterobject), /* tp_basicsize */ 15278 0, /* tp_itemsize */ 15279 /* methods */ 15280 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 15281 0, /* tp_print */ 15282 0, /* tp_getattr */ 15283 0, /* tp_setattr */ 15284 0, /* tp_reserved */ 15285 0, /* tp_repr */ 15286 0, /* tp_as_number */ 15287 0, /* tp_as_sequence */ 15288 0, /* tp_as_mapping */ 15289 0, /* tp_hash */ 15290 0, /* tp_call */ 15291 0, /* tp_str */ 15292 PyObject_GenericGetAttr, /* tp_getattro */ 15293 0, /* tp_setattro */ 15294 0, /* tp_as_buffer */ 15295 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 15296 0, /* tp_doc */ 15297 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 15298 0, /* tp_clear */ 15299 0, /* tp_richcompare */ 15300 0, /* tp_weaklistoffset */ 15301 PyObject_SelfIter, /* tp_iter */ 15302 (iternextfunc)unicodeiter_next, /* tp_iternext */ 15303 unicodeiter_methods, /* tp_methods */ 15304 0, 15305}; 15306 15307static PyObject * 15308unicode_iter(PyObject *seq) 15309{ 15310 unicodeiterobject *it; 15311 15312 if (!PyUnicode_Check(seq)) { 15313 PyErr_BadInternalCall(); 15314 return NULL; 15315 } 15316 if (PyUnicode_READY(seq) == -1) 15317 return NULL; 15318 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 15319 if (it == NULL) 15320 return NULL; 15321 it->it_index = 0; 15322 Py_INCREF(seq); 15323 it->it_seq = seq; 15324 _PyObject_GC_TRACK(it); 15325 return (PyObject *)it; 15326} 15327 15328 15329size_t 15330Py_UNICODE_strlen(const Py_UNICODE *u) 15331{ 15332 int res = 0; 15333 while(*u++) 15334 res++; 15335 return res; 15336} 15337 15338Py_UNICODE* 15339Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 15340{ 15341 Py_UNICODE *u = s1; 15342 while ((*u++ = *s2++)); 15343 return s1; 15344} 15345 
15346Py_UNICODE* 15347Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15348{ 15349 Py_UNICODE *u = s1; 15350 while ((*u++ = *s2++)) 15351 if (n-- == 0) 15352 break; 15353 return s1; 15354} 15355 15356Py_UNICODE* 15357Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 15358{ 15359 Py_UNICODE *u1 = s1; 15360 u1 += Py_UNICODE_strlen(u1); 15361 Py_UNICODE_strcpy(u1, s2); 15362 return s1; 15363} 15364 15365int 15366Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 15367{ 15368 while (*s1 && *s2 && *s1 == *s2) 15369 s1++, s2++; 15370 if (*s1 && *s2) 15371 return (*s1 < *s2) ? -1 : +1; 15372 if (*s1) 15373 return 1; 15374 if (*s2) 15375 return -1; 15376 return 0; 15377} 15378 15379int 15380Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15381{ 15382 Py_UNICODE u1, u2; 15383 for (; n != 0; n--) { 15384 u1 = *s1; 15385 u2 = *s2; 15386 if (u1 != u2) 15387 return (u1 < u2) ? -1 : +1; 15388 if (u1 == '\0') 15389 return 0; 15390 s1++; 15391 s2++; 15392 } 15393 return 0; 15394} 15395 15396Py_UNICODE* 15397Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 15398{ 15399 const Py_UNICODE *p; 15400 for (p = s; *p; p++) 15401 if (*p == c) 15402 return (Py_UNICODE*)p; 15403 return NULL; 15404} 15405 15406Py_UNICODE* 15407Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 15408{ 15409 const Py_UNICODE *p; 15410 p = s + Py_UNICODE_strlen(s); 15411 while (p != s) { 15412 p--; 15413 if (*p == c) 15414 return (Py_UNICODE*)p; 15415 } 15416 return NULL; 15417} 15418 15419Py_UNICODE* 15420PyUnicode_AsUnicodeCopy(PyObject *unicode) 15421{ 15422 Py_UNICODE *u, *copy; 15423 Py_ssize_t len, size; 15424 15425 if (!PyUnicode_Check(unicode)) { 15426 PyErr_BadArgument(); 15427 return NULL; 15428 } 15429 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 15430 if (u == NULL) 15431 return NULL; 15432 /* Ensure we won't overflow the size. 
*/ 15433 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 15434 PyErr_NoMemory(); 15435 return NULL; 15436 } 15437 size = len + 1; /* copy the null character */ 15438 size *= sizeof(Py_UNICODE); 15439 copy = PyMem_Malloc(size); 15440 if (copy == NULL) { 15441 PyErr_NoMemory(); 15442 return NULL; 15443 } 15444 memcpy(copy, u, size); 15445 return copy; 15446} 15447 15448/* A _string module, to export formatter_parser and formatter_field_name_split 15449 to the string.Formatter class implemented in Python. */ 15450 15451static PyMethodDef _string_methods[] = { 15452 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 15453 METH_O, PyDoc_STR("split the argument as a field name")}, 15454 {"formatter_parser", (PyCFunction) formatter_parser, 15455 METH_O, PyDoc_STR("parse the argument as a format string")}, 15456 {NULL, NULL} 15457}; 15458 15459static struct PyModuleDef _string_module = { 15460 PyModuleDef_HEAD_INIT, 15461 "_string", 15462 PyDoc_STR("string helper module"), 15463 0, 15464 _string_methods, 15465 NULL, 15466 NULL, 15467 NULL, 15468 NULL 15469}; 15470 15471PyMODINIT_FUNC 15472PyInit__string(void) 15473{ 15474 return PyModule_Create(&_string_module); 15475} 15476 15477 15478#ifdef __cplusplus 15479} 15480#endif 15481