unicodeobject.c revision c6cf1ba29ea75d924fc4644e4f4383a71e146f22
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/* --- Globals ------------------------------------------------------------ 51 52 The globals are initialized by the _PyUnicode_Init() API and should 53 not be used before calling that API. 54 55*/ 56 57 58#ifdef __cplusplus 59extern "C" { 60#endif 61 62/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 63#define MAX_UNICODE 0x10ffff 64 65#ifdef Py_DEBUG 66# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 67#else 68# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 69#endif 70 71#define _PyUnicode_UTF8(op) \ 72 (((PyCompactUnicodeObject*)(op))->utf8) 73#define PyUnicode_UTF8(op) \ 74 (assert(_PyUnicode_CHECK(op)), \ 75 assert(PyUnicode_IS_READY(op)), \ 76 PyUnicode_IS_COMPACT_ASCII(op) ? \ 77 ((char*)((PyASCIIObject*)(op) + 1)) : \ 78 _PyUnicode_UTF8(op)) 79#define _PyUnicode_UTF8_LENGTH(op) \ 80 (((PyCompactUnicodeObject*)(op))->utf8_length) 81#define PyUnicode_UTF8_LENGTH(op) \ 82 (assert(_PyUnicode_CHECK(op)), \ 83 assert(PyUnicode_IS_READY(op)), \ 84 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 85 ((PyASCIIObject*)(op))->length : \ 86 _PyUnicode_UTF8_LENGTH(op)) 87#define _PyUnicode_WSTR(op) \ 88 (((PyASCIIObject*)(op))->wstr) 89#define _PyUnicode_WSTR_LENGTH(op) \ 90 (((PyCompactUnicodeObject*)(op))->wstr_length) 91#define _PyUnicode_LENGTH(op) \ 92 (((PyASCIIObject *)(op))->length) 93#define _PyUnicode_STATE(op) \ 94 (((PyASCIIObject *)(op))->state) 95#define _PyUnicode_HASH(op) \ 96 (((PyASCIIObject *)(op))->hash) 97#define _PyUnicode_KIND(op) \ 98 (assert(_PyUnicode_CHECK(op)), \ 99 ((PyASCIIObject *)(op))->state.kind) 100#define _PyUnicode_GET_LENGTH(op) \ 101 (assert(_PyUnicode_CHECK(op)), \ 102 ((PyASCIIObject *)(op))->length) 103#define _PyUnicode_DATA_ANY(op) \ 104 (((PyUnicodeObject*)(op))->data.any) 105 106/* Optimized version of Py_MAX() to compute the maximum character: 107 use it when your are computing the second argument of PyUnicode_New() */ 108#define MAX_MAXCHAR(maxchar1, maxchar2) \ 109 ((maxchar1) | (maxchar2)) 110 111#undef PyUnicode_READY 112#define PyUnicode_READY(op) \ 113 (assert(_PyUnicode_CHECK(op)), \ 114 (PyUnicode_IS_READY(op) ? 
/* True when the cached UTF-8 pointer of op is the very same pointer as its
   canonical data buffer.  Asserts that op is not a compact ASCII string. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True when the cached wchar_t pointer of op is the very same pointer as its
   canonical data buffer.  Only the macro argument is referenced, so the
   result does not depend on which identifiers exist at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is copied four elements per iteration; the
   remaining 0-3 elements are copied one at a time.  `to` is parenthesized
   before the cast so that an arbitrary pointer expression can be passed. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Note that references 167 to strings in this dictionary are *not* counted in the string's ob_refcnt. 168 When the interned string reaches a refcnt of 0 the string deallocation 169 function will delete the reference from this dictionary. 170 171 Another way to look at this is that to say that the actual reference 172 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 173*/ 174static PyObject *interned; 175 176/* The empty Unicode object is shared to improve performance. */ 177static PyObject *unicode_empty; 178 179/* List of static strings. */ 180static _Py_Identifier *static_strings; 181 182/* Single character Unicode strings in the Latin-1 range are being 183 shared as well. */ 184static PyObject *unicode_latin1[256]; 185 186/* Fast detection of the most frequent whitespace characters */ 187const unsigned char _Py_ascii_whitespace[] = { 188 0, 0, 0, 0, 0, 0, 0, 0, 189/* case 0x0009: * CHARACTER TABULATION */ 190/* case 0x000A: * LINE FEED */ 191/* case 0x000B: * LINE TABULATION */ 192/* case 0x000C: * FORM FEED */ 193/* case 0x000D: * CARRIAGE RETURN */ 194 0, 1, 1, 1, 1, 1, 0, 0, 195 0, 0, 0, 0, 0, 0, 0, 0, 196/* case 0x001C: * FILE SEPARATOR */ 197/* case 0x001D: * GROUP SEPARATOR */ 198/* case 0x001E: * RECORD SEPARATOR */ 199/* case 0x001F: * UNIT SEPARATOR */ 200 0, 0, 0, 0, 1, 1, 1, 1, 201/* case 0x0020: * SPACE */ 202 1, 0, 0, 0, 0, 0, 0, 0, 203 0, 0, 0, 0, 0, 0, 0, 0, 204 0, 0, 0, 0, 0, 0, 0, 0, 205 0, 0, 0, 0, 0, 0, 0, 0, 206 207 0, 0, 0, 0, 0, 0, 0, 0, 208 0, 0, 0, 0, 0, 0, 0, 0, 209 0, 0, 0, 0, 0, 0, 0, 0, 210 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0, 212 0, 0, 0, 0, 0, 0, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0 215}; 216 217/* forward */ 218static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 219static PyObject* get_latin1_char(unsigned char ch); 220static int unicode_modifiable(PyObject *unicode); 221 222 223static PyObject * 224_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); 225static PyObject * 
226_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 227static PyObject * 228_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 229 230static PyObject * 231unicode_encode_call_errorhandler(const char *errors, 232 PyObject **errorHandler,const char *encoding, const char *reason, 233 PyObject *unicode, PyObject **exceptionObject, 234 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 235 236static void 237raise_encode_exception(PyObject **exceptionObject, 238 const char *encoding, 239 PyObject *unicode, 240 Py_ssize_t startpos, Py_ssize_t endpos, 241 const char *reason); 242 243/* Same for linebreaks */ 244static unsigned char ascii_linebreak[] = { 245 0, 0, 0, 0, 0, 0, 0, 0, 246/* 0x000A, * LINE FEED */ 247/* 0x000B, * LINE TABULATION */ 248/* 0x000C, * FORM FEED */ 249/* 0x000D, * CARRIAGE RETURN */ 250 0, 0, 1, 1, 1, 1, 0, 0, 251 0, 0, 0, 0, 0, 0, 0, 0, 252/* 0x001C, * FILE SEPARATOR */ 253/* 0x001D, * GROUP SEPARATOR */ 254/* 0x001E, * RECORD SEPARATOR */ 255 0, 0, 0, 0, 1, 1, 1, 0, 256 0, 0, 0, 0, 0, 0, 0, 0, 257 0, 0, 0, 0, 0, 0, 0, 0, 258 0, 0, 0, 0, 0, 0, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 260 261 0, 0, 0, 0, 0, 0, 0, 0, 262 0, 0, 0, 0, 0, 0, 0, 0, 263 0, 0, 0, 0, 0, 0, 0, 0, 264 0, 0, 0, 0, 0, 0, 0, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 0, 0, 0, 0, 0, 0, 0, 0 269}; 270 271/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 272 This function is kept for backward compatibility with the old API. */ 273Py_UNICODE 274PyUnicode_GetMax(void) 275{ 276#ifdef Py_UNICODE_WIDE 277 return 0x10FFFF; 278#else 279 /* This is actually an illegal character, so it should 280 not be passed to unichr. 
*/ 281 return 0xFFFF; 282#endif 283} 284 285#ifdef Py_DEBUG 286int 287_PyUnicode_CheckConsistency(PyObject *op, int check_content) 288{ 289 PyASCIIObject *ascii; 290 unsigned int kind; 291 292 assert(PyUnicode_Check(op)); 293 294 ascii = (PyASCIIObject *)op; 295 kind = ascii->state.kind; 296 297 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 298 assert(kind == PyUnicode_1BYTE_KIND); 299 assert(ascii->state.ready == 1); 300 } 301 else { 302 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 303 void *data; 304 305 if (ascii->state.compact == 1) { 306 data = compact + 1; 307 assert(kind == PyUnicode_1BYTE_KIND 308 || kind == PyUnicode_2BYTE_KIND 309 || kind == PyUnicode_4BYTE_KIND); 310 assert(ascii->state.ascii == 0); 311 assert(ascii->state.ready == 1); 312 assert (compact->utf8 != data); 313 } 314 else { 315 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 316 317 data = unicode->data.any; 318 if (kind == PyUnicode_WCHAR_KIND) { 319 assert(ascii->length == 0); 320 assert(ascii->hash == -1); 321 assert(ascii->state.compact == 0); 322 assert(ascii->state.ascii == 0); 323 assert(ascii->state.ready == 0); 324 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 325 assert(ascii->wstr != NULL); 326 assert(data == NULL); 327 assert(compact->utf8 == NULL); 328 } 329 else { 330 assert(kind == PyUnicode_1BYTE_KIND 331 || kind == PyUnicode_2BYTE_KIND 332 || kind == PyUnicode_4BYTE_KIND); 333 assert(ascii->state.compact == 0); 334 assert(ascii->state.ready == 1); 335 assert(data != NULL); 336 if (ascii->state.ascii) { 337 assert (compact->utf8 == data); 338 assert (compact->utf8_length == ascii->length); 339 } 340 else 341 assert (compact->utf8 != data); 342 } 343 } 344 if (kind != PyUnicode_WCHAR_KIND) { 345 if ( 346#if SIZEOF_WCHAR_T == 2 347 kind == PyUnicode_2BYTE_KIND 348#else 349 kind == PyUnicode_4BYTE_KIND 350#endif 351 ) 352 { 353 assert(ascii->wstr == data); 354 assert(compact->wstr_length == ascii->length); 355 } else 356 
assert(ascii->wstr != data); 357 } 358 359 if (compact->utf8 == NULL) 360 assert(compact->utf8_length == 0); 361 if (ascii->wstr == NULL) 362 assert(compact->wstr_length == 0); 363 } 364 /* check that the best kind is used */ 365 if (check_content && kind != PyUnicode_WCHAR_KIND) 366 { 367 Py_ssize_t i; 368 Py_UCS4 maxchar = 0; 369 void *data; 370 Py_UCS4 ch; 371 372 data = PyUnicode_DATA(ascii); 373 for (i=0; i < ascii->length; i++) 374 { 375 ch = PyUnicode_READ(kind, data, i); 376 if (ch > maxchar) 377 maxchar = ch; 378 } 379 if (kind == PyUnicode_1BYTE_KIND) { 380 if (ascii->state.ascii == 0) { 381 assert(maxchar >= 128); 382 assert(maxchar <= 255); 383 } 384 else 385 assert(maxchar < 128); 386 } 387 else if (kind == PyUnicode_2BYTE_KIND) { 388 assert(maxchar >= 0x100); 389 assert(maxchar <= 0xFFFF); 390 } 391 else { 392 assert(maxchar >= 0x10000); 393 assert(maxchar <= MAX_UNICODE); 394 } 395 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 396 } 397 return 1; 398} 399#endif 400 401static PyObject* 402unicode_result_wchar(PyObject *unicode) 403{ 404#ifndef Py_DEBUG 405 Py_ssize_t len; 406 407 len = _PyUnicode_WSTR_LENGTH(unicode); 408 if (len == 0) { 409 Py_INCREF(unicode_empty); 410 Py_DECREF(unicode); 411 return unicode_empty; 412 } 413 414 if (len == 1) { 415 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 416 if (ch < 256) { 417 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 418 Py_DECREF(unicode); 419 return latin1_char; 420 } 421 } 422 423 if (_PyUnicode_Ready(unicode) < 0) { 424 Py_DECREF(unicode); 425 return NULL; 426 } 427#else 428 assert(Py_REFCNT(unicode) == 1); 429 430 /* don't make the result ready in debug mode to ensure that the caller 431 makes the string ready before using it */ 432 assert(_PyUnicode_CheckConsistency(unicode, 1)); 433#endif 434 return unicode; 435} 436 437static PyObject* 438unicode_result_ready(PyObject *unicode) 439{ 440 Py_ssize_t length; 441 442 length = PyUnicode_GET_LENGTH(unicode); 443 if (length == 0) { 
444 if (unicode != unicode_empty) { 445 Py_INCREF(unicode_empty); 446 Py_DECREF(unicode); 447 } 448 return unicode_empty; 449 } 450 451 if (length == 1) { 452 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 453 if (ch < 256) { 454 PyObject *latin1_char = unicode_latin1[ch]; 455 if (latin1_char != NULL) { 456 if (unicode != latin1_char) { 457 Py_INCREF(latin1_char); 458 Py_DECREF(unicode); 459 } 460 return latin1_char; 461 } 462 else { 463 assert(_PyUnicode_CheckConsistency(unicode, 1)); 464 Py_INCREF(unicode); 465 unicode_latin1[ch] = unicode; 466 return unicode; 467 } 468 } 469 } 470 471 assert(_PyUnicode_CheckConsistency(unicode, 1)); 472 return unicode; 473} 474 475static PyObject* 476unicode_result(PyObject *unicode) 477{ 478 assert(_PyUnicode_CHECK(unicode)); 479 if (PyUnicode_IS_READY(unicode)) 480 return unicode_result_ready(unicode); 481 else 482 return unicode_result_wchar(unicode); 483} 484 485static PyObject* 486unicode_result_unchanged(PyObject *unicode) 487{ 488 if (PyUnicode_CheckExact(unicode)) { 489 if (PyUnicode_READY(unicode) == -1) 490 return NULL; 491 Py_INCREF(unicode); 492 return unicode; 493 } 494 else 495 /* Subtype -- return genuine unicode string with the same value. */ 496 return _PyUnicode_Copy(unicode); 497} 498 499#ifdef HAVE_MBCS 500static OSVERSIONINFOEX winver; 501#endif 502 503/* --- Bloom Filters ----------------------------------------------------- */ 504 505/* stuff to implement simple "bloom filters" for Unicode characters. 506 to keep things simple, we use a single bitmask, using the least 5 507 bits from each unicode characters as the bit index. 
*/ 508 509/* the linebreak mask is set up by Unicode_Init below */ 510 511#if LONG_BIT >= 128 512#define BLOOM_WIDTH 128 513#elif LONG_BIT >= 64 514#define BLOOM_WIDTH 64 515#elif LONG_BIT >= 32 516#define BLOOM_WIDTH 32 517#else 518#error "LONG_BIT is smaller than 32" 519#endif 520 521#define BLOOM_MASK unsigned long 522 523static BLOOM_MASK bloom_linebreak; 524 525#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 526#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 527 528#define BLOOM_LINEBREAK(ch) \ 529 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 530 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 531 532Py_LOCAL_INLINE(BLOOM_MASK) 533make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 534{ 535 /* calculate simple bloom-style bitmask for a given unicode string */ 536 537 BLOOM_MASK mask; 538 Py_ssize_t i; 539 540 mask = 0; 541 for (i = 0; i < len; i++) 542 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 543 544 return mask; 545} 546 547#define BLOOM_MEMBER(mask, chr, str) \ 548 (BLOOM(mask, chr) \ 549 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 550 551/* Compilation of templated routines */ 552 553#include "stringlib/asciilib.h" 554#include "stringlib/fastsearch.h" 555#include "stringlib/partition.h" 556#include "stringlib/split.h" 557#include "stringlib/count.h" 558#include "stringlib/find.h" 559#include "stringlib/find_max_char.h" 560#include "stringlib/localeutil.h" 561#include "stringlib/undef.h" 562 563#include "stringlib/ucs1lib.h" 564#include "stringlib/fastsearch.h" 565#include "stringlib/partition.h" 566#include "stringlib/split.h" 567#include "stringlib/count.h" 568#include "stringlib/find.h" 569#include "stringlib/find_max_char.h" 570#include "stringlib/localeutil.h" 571#include "stringlib/undef.h" 572 573#include "stringlib/ucs2lib.h" 574#include "stringlib/fastsearch.h" 575#include "stringlib/partition.h" 576#include "stringlib/split.h" 577#include 
"stringlib/count.h" 578#include "stringlib/find.h" 579#include "stringlib/find_max_char.h" 580#include "stringlib/localeutil.h" 581#include "stringlib/undef.h" 582 583#include "stringlib/ucs4lib.h" 584#include "stringlib/fastsearch.h" 585#include "stringlib/partition.h" 586#include "stringlib/split.h" 587#include "stringlib/count.h" 588#include "stringlib/find.h" 589#include "stringlib/find_max_char.h" 590#include "stringlib/localeutil.h" 591#include "stringlib/undef.h" 592 593#include "stringlib/unicodedefs.h" 594#include "stringlib/fastsearch.h" 595#include "stringlib/count.h" 596#include "stringlib/find.h" 597#include "stringlib/undef.h" 598 599/* --- Unicode Object ----------------------------------------------------- */ 600 601static PyObject * 602fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 603 604Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 605 Py_ssize_t size, Py_UCS4 ch, 606 int direction) 607{ 608 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; 609 610 switch (kind) { 611 case PyUnicode_1BYTE_KIND: 612 { 613 Py_UCS1 ch1 = (Py_UCS1) ch; 614 if (ch1 == ch) 615 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 616 else 617 return -1; 618 } 619 case PyUnicode_2BYTE_KIND: 620 { 621 Py_UCS2 ch2 = (Py_UCS2) ch; 622 if (ch2 == ch) 623 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 624 else 625 return -1; 626 } 627 case PyUnicode_4BYTE_KIND: 628 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 629 default: 630 assert(0); 631 return -1; 632 } 633} 634 635#ifdef Py_DEBUG 636/* Fill the data of an Unicode string with invalid characters to detect bugs 637 earlier. 638 639 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 640 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 641 invalid character in Unicode 6.0. 
*/ 642static void 643unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 644{ 645 int kind = PyUnicode_KIND(unicode); 646 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 647 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 648 if (length <= old_length) 649 return; 650 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 651} 652#endif 653 654static PyObject* 655resize_compact(PyObject *unicode, Py_ssize_t length) 656{ 657 Py_ssize_t char_size; 658 Py_ssize_t struct_size; 659 Py_ssize_t new_size; 660 int share_wstr; 661 PyObject *new_unicode; 662#ifdef Py_DEBUG 663 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 664#endif 665 666 assert(unicode_modifiable(unicode)); 667 assert(PyUnicode_IS_READY(unicode)); 668 assert(PyUnicode_IS_COMPACT(unicode)); 669 670 char_size = PyUnicode_KIND(unicode); 671 if (PyUnicode_IS_ASCII(unicode)) 672 struct_size = sizeof(PyASCIIObject); 673 else 674 struct_size = sizeof(PyCompactUnicodeObject); 675 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 676 677 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 678 PyErr_NoMemory(); 679 return NULL; 680 } 681 new_size = (struct_size + (length + 1) * char_size); 682 683 _Py_DEC_REFTOTAL; 684 _Py_ForgetReference(unicode); 685 686 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 687 if (new_unicode == NULL) { 688 _Py_NewReference(unicode); 689 PyErr_NoMemory(); 690 return NULL; 691 } 692 unicode = new_unicode; 693 _Py_NewReference(unicode); 694 695 _PyUnicode_LENGTH(unicode) = length; 696 if (share_wstr) { 697 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 698 if (!PyUnicode_IS_ASCII(unicode)) 699 _PyUnicode_WSTR_LENGTH(unicode) = length; 700 } 701#ifdef Py_DEBUG 702 unicode_fill_invalid(unicode, old_length); 703#endif 704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 705 length, 0); 706 assert(_PyUnicode_CheckConsistency(unicode, 0)); 707 return unicode; 708} 709 710static int 711resize_inplace(PyObject 
*unicode, Py_ssize_t length) 712{ 713 wchar_t *wstr; 714 Py_ssize_t new_size; 715 assert(!PyUnicode_IS_COMPACT(unicode)); 716 assert(Py_REFCNT(unicode) == 1); 717 718 if (PyUnicode_IS_READY(unicode)) { 719 Py_ssize_t char_size; 720 int share_wstr, share_utf8; 721 void *data; 722#ifdef Py_DEBUG 723 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 724#endif 725 726 data = _PyUnicode_DATA_ANY(unicode); 727 char_size = PyUnicode_KIND(unicode); 728 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 729 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 730 731 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 732 PyErr_NoMemory(); 733 return -1; 734 } 735 new_size = (length + 1) * char_size; 736 737 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 738 { 739 PyObject_DEL(_PyUnicode_UTF8(unicode)); 740 _PyUnicode_UTF8(unicode) = NULL; 741 _PyUnicode_UTF8_LENGTH(unicode) = 0; 742 } 743 744 data = (PyObject *)PyObject_REALLOC(data, new_size); 745 if (data == NULL) { 746 PyErr_NoMemory(); 747 return -1; 748 } 749 _PyUnicode_DATA_ANY(unicode) = data; 750 if (share_wstr) { 751 _PyUnicode_WSTR(unicode) = data; 752 _PyUnicode_WSTR_LENGTH(unicode) = length; 753 } 754 if (share_utf8) { 755 _PyUnicode_UTF8(unicode) = data; 756 _PyUnicode_UTF8_LENGTH(unicode) = length; 757 } 758 _PyUnicode_LENGTH(unicode) = length; 759 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 760#ifdef Py_DEBUG 761 unicode_fill_invalid(unicode, old_length); 762#endif 763 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 764 assert(_PyUnicode_CheckConsistency(unicode, 0)); 765 return 0; 766 } 767 } 768 assert(_PyUnicode_WSTR(unicode) != NULL); 769 770 /* check for integer overflow */ 771 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 772 PyErr_NoMemory(); 773 return -1; 774 } 775 new_size = sizeof(wchar_t) * (length + 1); 776 wstr = _PyUnicode_WSTR(unicode); 777 wstr = PyObject_REALLOC(wstr, new_size); 778 if (!wstr) { 779 PyErr_NoMemory(); 780 return -1; 781 } 782 _PyUnicode_WSTR(unicode) = 
wstr; 783 _PyUnicode_WSTR(unicode)[length] = 0; 784 _PyUnicode_WSTR_LENGTH(unicode) = length; 785 assert(_PyUnicode_CheckConsistency(unicode, 0)); 786 return 0; 787} 788 789static PyObject* 790resize_copy(PyObject *unicode, Py_ssize_t length) 791{ 792 Py_ssize_t copy_length; 793 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 794 PyObject *copy; 795 796 if (PyUnicode_READY(unicode) == -1) 797 return NULL; 798 799 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 800 if (copy == NULL) 801 return NULL; 802 803 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 804 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 805 return copy; 806 } 807 else { 808 PyObject *w; 809 810 w = (PyObject*)_PyUnicode_New(length); 811 if (w == NULL) 812 return NULL; 813 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 814 copy_length = Py_MIN(copy_length, length); 815 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 816 copy_length * sizeof(wchar_t)); 817 return w; 818 } 819} 820 821/* We allocate one more byte to make sure the string is 822 Ux0000 terminated; some code (e.g. new_identifier) 823 relies on that. 824 825 XXX This allocator could further be enhanced by assuring that the 826 free list never reduces its size below 1. 827 828*/ 829 830static PyUnicodeObject * 831_PyUnicode_New(Py_ssize_t length) 832{ 833 register PyUnicodeObject *unicode; 834 size_t new_size; 835 836 /* Optimization for empty strings */ 837 if (length == 0 && unicode_empty != NULL) { 838 Py_INCREF(unicode_empty); 839 return (PyUnicodeObject*)unicode_empty; 840 } 841 842 /* Ensure we won't overflow the size. 
*/ 843 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 844 return (PyUnicodeObject *)PyErr_NoMemory(); 845 } 846 if (length < 0) { 847 PyErr_SetString(PyExc_SystemError, 848 "Negative size passed to _PyUnicode_New"); 849 return NULL; 850 } 851 852 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 853 if (unicode == NULL) 854 return NULL; 855 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 856 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 857 if (!_PyUnicode_WSTR(unicode)) { 858 Py_DECREF(unicode); 859 PyErr_NoMemory(); 860 return NULL; 861 } 862 863 /* Initialize the first element to guard against cases where 864 * the caller fails before initializing str -- unicode_resize() 865 * reads str[0], and the Keep-Alive optimization can keep memory 866 * allocated for str alive across a call to unicode_dealloc(unicode). 867 * We don't want unicode_resize to read uninitialized memory in 868 * that case. 869 */ 870 _PyUnicode_WSTR(unicode)[0] = 0; 871 _PyUnicode_WSTR(unicode)[length] = 0; 872 _PyUnicode_WSTR_LENGTH(unicode) = length; 873 _PyUnicode_HASH(unicode) = -1; 874 _PyUnicode_STATE(unicode).interned = 0; 875 _PyUnicode_STATE(unicode).kind = 0; 876 _PyUnicode_STATE(unicode).compact = 0; 877 _PyUnicode_STATE(unicode).ready = 0; 878 _PyUnicode_STATE(unicode).ascii = 0; 879 _PyUnicode_DATA_ANY(unicode) = NULL; 880 _PyUnicode_LENGTH(unicode) = 0; 881 _PyUnicode_UTF8(unicode) = NULL; 882 _PyUnicode_UTF8_LENGTH(unicode) = 0; 883 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 884 return unicode; 885} 886 887static const char* 888unicode_kind_name(PyObject *unicode) 889{ 890 /* don't check consistency: unicode_kind_name() is called from 891 _PyUnicode_Dump() */ 892 if (!PyUnicode_IS_COMPACT(unicode)) 893 { 894 if (!PyUnicode_IS_READY(unicode)) 895 return "wstr"; 896 switch (PyUnicode_KIND(unicode)) 897 { 898 case PyUnicode_1BYTE_KIND: 899 if (PyUnicode_IS_ASCII(unicode)) 900 return "legacy ascii"; 901 else 902 
return "legacy latin1"; 903 case PyUnicode_2BYTE_KIND: 904 return "legacy UCS2"; 905 case PyUnicode_4BYTE_KIND: 906 return "legacy UCS4"; 907 default: 908 return "<legacy invalid kind>"; 909 } 910 } 911 assert(PyUnicode_IS_READY(unicode)); 912 switch (PyUnicode_KIND(unicode)) { 913 case PyUnicode_1BYTE_KIND: 914 if (PyUnicode_IS_ASCII(unicode)) 915 return "ascii"; 916 else 917 return "latin1"; 918 case PyUnicode_2BYTE_KIND: 919 return "UCS2"; 920 case PyUnicode_4BYTE_KIND: 921 return "UCS4"; 922 default: 923 return "<invalid compact kind>"; 924 } 925} 926 927#ifdef Py_DEBUG 928/* Functions wrapping macros for use in debugger */ 929char *_PyUnicode_utf8(void *unicode){ 930 return PyUnicode_UTF8(unicode); 931} 932 933void *_PyUnicode_compact_data(void *unicode) { 934 return _PyUnicode_COMPACT_DATA(unicode); 935} 936void *_PyUnicode_data(void *unicode){ 937 printf("obj %p\n", unicode); 938 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 939 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 940 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 941 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 942 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 943 return PyUnicode_DATA(unicode); 944} 945 946void 947_PyUnicode_Dump(PyObject *op) 948{ 949 PyASCIIObject *ascii = (PyASCIIObject *)op; 950 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 951 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 952 void *data; 953 954 if (ascii->state.compact) 955 { 956 if (ascii->state.ascii) 957 data = (ascii + 1); 958 else 959 data = (compact + 1); 960 } 961 else 962 data = unicode->data.any; 963 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 964 965 if (ascii->wstr == data) 966 printf("shared "); 967 printf("wstr=%p", ascii->wstr); 968 969 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 970 printf(" (%zu), ", compact->wstr_length); 971 if (!ascii->state.compact 
&& compact->utf8 == unicode->data.any) 972 printf("shared "); 973 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 974 } 975 printf(", data=%p\n", data); 976} 977#endif 978 979PyObject * 980PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 981{ 982 PyObject *obj; 983 PyCompactUnicodeObject *unicode; 984 void *data; 985 enum PyUnicode_Kind kind; 986 int is_sharing, is_ascii; 987 Py_ssize_t char_size; 988 Py_ssize_t struct_size; 989 990 /* Optimization for empty strings */ 991 if (size == 0 && unicode_empty != NULL) { 992 Py_INCREF(unicode_empty); 993 return unicode_empty; 994 } 995 996 is_ascii = 0; 997 is_sharing = 0; 998 struct_size = sizeof(PyCompactUnicodeObject); 999 if (maxchar < 128) { 1000 kind = PyUnicode_1BYTE_KIND; 1001 char_size = 1; 1002 is_ascii = 1; 1003 struct_size = sizeof(PyASCIIObject); 1004 } 1005 else if (maxchar < 256) { 1006 kind = PyUnicode_1BYTE_KIND; 1007 char_size = 1; 1008 } 1009 else if (maxchar < 65536) { 1010 kind = PyUnicode_2BYTE_KIND; 1011 char_size = 2; 1012 if (sizeof(wchar_t) == 2) 1013 is_sharing = 1; 1014 } 1015 else { 1016 if (maxchar > MAX_UNICODE) { 1017 PyErr_SetString(PyExc_SystemError, 1018 "invalid maximum character passed to PyUnicode_New"); 1019 return NULL; 1020 } 1021 kind = PyUnicode_4BYTE_KIND; 1022 char_size = 4; 1023 if (sizeof(wchar_t) == 4) 1024 is_sharing = 1; 1025 } 1026 1027 /* Ensure we won't overflow the size. */ 1028 if (size < 0) { 1029 PyErr_SetString(PyExc_SystemError, 1030 "Negative size passed to PyUnicode_New"); 1031 return NULL; 1032 } 1033 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1034 return PyErr_NoMemory(); 1035 1036 /* Duplicated allocation code from _PyObject_New() instead of a call to 1037 * PyObject_New() so we are able to allocate space for the object and 1038 * it's data buffer. 
1039 */ 1040 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1041 if (obj == NULL) 1042 return PyErr_NoMemory(); 1043 obj = PyObject_INIT(obj, &PyUnicode_Type); 1044 if (obj == NULL) 1045 return NULL; 1046 1047 unicode = (PyCompactUnicodeObject *)obj; 1048 if (is_ascii) 1049 data = ((PyASCIIObject*)obj) + 1; 1050 else 1051 data = unicode + 1; 1052 _PyUnicode_LENGTH(unicode) = size; 1053 _PyUnicode_HASH(unicode) = -1; 1054 _PyUnicode_STATE(unicode).interned = 0; 1055 _PyUnicode_STATE(unicode).kind = kind; 1056 _PyUnicode_STATE(unicode).compact = 1; 1057 _PyUnicode_STATE(unicode).ready = 1; 1058 _PyUnicode_STATE(unicode).ascii = is_ascii; 1059 if (is_ascii) { 1060 ((char*)data)[size] = 0; 1061 _PyUnicode_WSTR(unicode) = NULL; 1062 } 1063 else if (kind == PyUnicode_1BYTE_KIND) { 1064 ((char*)data)[size] = 0; 1065 _PyUnicode_WSTR(unicode) = NULL; 1066 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1067 unicode->utf8 = NULL; 1068 unicode->utf8_length = 0; 1069 } 1070 else { 1071 unicode->utf8 = NULL; 1072 unicode->utf8_length = 0; 1073 if (kind == PyUnicode_2BYTE_KIND) 1074 ((Py_UCS2*)data)[size] = 0; 1075 else /* kind == PyUnicode_4BYTE_KIND */ 1076 ((Py_UCS4*)data)[size] = 0; 1077 if (is_sharing) { 1078 _PyUnicode_WSTR_LENGTH(unicode) = size; 1079 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1080 } 1081 else { 1082 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1083 _PyUnicode_WSTR(unicode) = NULL; 1084 } 1085 } 1086#ifdef Py_DEBUG 1087 unicode_fill_invalid((PyObject*)unicode, 0); 1088#endif 1089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1090 return obj; 1091} 1092 1093#if SIZEOF_WCHAR_T == 2 1094/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1095 will decode surrogate pairs, the other conversions are implemented as macros 1096 for efficiency. 1097 1098 This function assumes that unicode can hold one more code point than wstr 1099 characters for a terminating null character. 
*/ 1100static void 1101unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1102 PyObject *unicode) 1103{ 1104 const wchar_t *iter; 1105 Py_UCS4 *ucs4_out; 1106 1107 assert(unicode != NULL); 1108 assert(_PyUnicode_CHECK(unicode)); 1109 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1110 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1111 1112 for (iter = begin; iter < end; ) { 1113 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1114 _PyUnicode_GET_LENGTH(unicode))); 1115 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1116 && (iter+1) < end 1117 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1118 { 1119 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1120 iter += 2; 1121 } 1122 else { 1123 *ucs4_out++ = *iter; 1124 iter++; 1125 } 1126 } 1127 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1128 _PyUnicode_GET_LENGTH(unicode))); 1129 1130} 1131#endif 1132 1133static int 1134unicode_check_modifiable(PyObject *unicode) 1135{ 1136 if (!unicode_modifiable(unicode)) { 1137 PyErr_SetString(PyExc_SystemError, 1138 "Cannot modify a string currently used"); 1139 return -1; 1140 } 1141 return 0; 1142} 1143 1144static int 1145_copy_characters(PyObject *to, Py_ssize_t to_start, 1146 PyObject *from, Py_ssize_t from_start, 1147 Py_ssize_t how_many, int check_maxchar) 1148{ 1149 unsigned int from_kind, to_kind; 1150 void *from_data, *to_data; 1151 1152 assert(0 <= how_many); 1153 assert(0 <= from_start); 1154 assert(0 <= to_start); 1155 assert(PyUnicode_Check(from)); 1156 assert(PyUnicode_IS_READY(from)); 1157 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1158 1159 assert(PyUnicode_Check(to)); 1160 assert(PyUnicode_IS_READY(to)); 1161 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1162 1163 if (how_many == 0) 1164 return 0; 1165 1166 from_kind = PyUnicode_KIND(from); 1167 from_data = PyUnicode_DATA(from); 1168 to_kind = PyUnicode_KIND(to); 1169 to_data = PyUnicode_DATA(to); 1170 1171#ifdef Py_DEBUG 1172 if (!check_maxchar 
1173 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1174 { 1175 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1176 Py_UCS4 ch; 1177 Py_ssize_t i; 1178 for (i=0; i < how_many; i++) { 1179 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1180 assert(ch <= to_maxchar); 1181 } 1182 } 1183#endif 1184 1185 if (from_kind == to_kind) { 1186 if (check_maxchar 1187 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1188 { 1189 /* Writing Latin-1 characters into an ASCII string requires to 1190 check that all written characters are pure ASCII */ 1191 Py_UCS4 max_char; 1192 max_char = ucs1lib_find_max_char(from_data, 1193 (Py_UCS1*)from_data + how_many); 1194 if (max_char >= 128) 1195 return -1; 1196 } 1197 Py_MEMCPY((char*)to_data + to_kind * to_start, 1198 (char*)from_data + from_kind * from_start, 1199 to_kind * how_many); 1200 } 1201 else if (from_kind == PyUnicode_1BYTE_KIND 1202 && to_kind == PyUnicode_2BYTE_KIND) 1203 { 1204 _PyUnicode_CONVERT_BYTES( 1205 Py_UCS1, Py_UCS2, 1206 PyUnicode_1BYTE_DATA(from) + from_start, 1207 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1208 PyUnicode_2BYTE_DATA(to) + to_start 1209 ); 1210 } 1211 else if (from_kind == PyUnicode_1BYTE_KIND 1212 && to_kind == PyUnicode_4BYTE_KIND) 1213 { 1214 _PyUnicode_CONVERT_BYTES( 1215 Py_UCS1, Py_UCS4, 1216 PyUnicode_1BYTE_DATA(from) + from_start, 1217 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1218 PyUnicode_4BYTE_DATA(to) + to_start 1219 ); 1220 } 1221 else if (from_kind == PyUnicode_2BYTE_KIND 1222 && to_kind == PyUnicode_4BYTE_KIND) 1223 { 1224 _PyUnicode_CONVERT_BYTES( 1225 Py_UCS2, Py_UCS4, 1226 PyUnicode_2BYTE_DATA(from) + from_start, 1227 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1228 PyUnicode_4BYTE_DATA(to) + to_start 1229 ); 1230 } 1231 else { 1232 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1233 1234 if (!check_maxchar) { 1235 if (from_kind == PyUnicode_2BYTE_KIND 1236 && to_kind == 
PyUnicode_1BYTE_KIND) 1237 { 1238 _PyUnicode_CONVERT_BYTES( 1239 Py_UCS2, Py_UCS1, 1240 PyUnicode_2BYTE_DATA(from) + from_start, 1241 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1242 PyUnicode_1BYTE_DATA(to) + to_start 1243 ); 1244 } 1245 else if (from_kind == PyUnicode_4BYTE_KIND 1246 && to_kind == PyUnicode_1BYTE_KIND) 1247 { 1248 _PyUnicode_CONVERT_BYTES( 1249 Py_UCS4, Py_UCS1, 1250 PyUnicode_4BYTE_DATA(from) + from_start, 1251 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1252 PyUnicode_1BYTE_DATA(to) + to_start 1253 ); 1254 } 1255 else if (from_kind == PyUnicode_4BYTE_KIND 1256 && to_kind == PyUnicode_2BYTE_KIND) 1257 { 1258 _PyUnicode_CONVERT_BYTES( 1259 Py_UCS4, Py_UCS2, 1260 PyUnicode_4BYTE_DATA(from) + from_start, 1261 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1262 PyUnicode_2BYTE_DATA(to) + to_start 1263 ); 1264 } 1265 else { 1266 assert(0); 1267 return -1; 1268 } 1269 } 1270 else { 1271 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1272 Py_UCS4 ch; 1273 Py_ssize_t i; 1274 1275 for (i=0; i < how_many; i++) { 1276 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1277 if (ch > to_maxchar) 1278 return -1; 1279 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1280 } 1281 } 1282 } 1283 return 0; 1284} 1285 1286void 1287_PyUnicode_FastCopyCharacters( 1288 PyObject *to, Py_ssize_t to_start, 1289 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1290{ 1291 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1292} 1293 1294Py_ssize_t 1295PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1296 PyObject *from, Py_ssize_t from_start, 1297 Py_ssize_t how_many) 1298{ 1299 int err; 1300 1301 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1302 PyErr_BadInternalCall(); 1303 return -1; 1304 } 1305 1306 if (PyUnicode_READY(from) == -1) 1307 return -1; 1308 if (PyUnicode_READY(to) == -1) 1309 return -1; 1310 1311 if (from_start < 0) { 1312 PyErr_SetString(PyExc_IndexError, "string 
index out of range"); 1313 return -1; 1314 } 1315 if (to_start < 0) { 1316 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1317 return -1; 1318 } 1319 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1320 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1321 PyErr_Format(PyExc_SystemError, 1322 "Cannot write %zi characters at %zi " 1323 "in a string of %zi characters", 1324 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1325 return -1; 1326 } 1327 1328 if (how_many == 0) 1329 return 0; 1330 1331 if (unicode_check_modifiable(to)) 1332 return -1; 1333 1334 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1335 if (err) { 1336 PyErr_Format(PyExc_SystemError, 1337 "Cannot copy %s characters " 1338 "into a string of %s characters", 1339 unicode_kind_name(from), 1340 unicode_kind_name(to)); 1341 return -1; 1342 } 1343 return how_many; 1344} 1345 1346/* Find the maximum code point and count the number of surrogate pairs so a 1347 correct string length can be computed before converting a string to UCS4. 1348 This function counts single surrogates as a character and not as a pair. 1349 1350 Return 0 on success, or -1 on error. 
*/ 1351static int 1352find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1353 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1354{ 1355 const wchar_t *iter; 1356 Py_UCS4 ch; 1357 1358 assert(num_surrogates != NULL && maxchar != NULL); 1359 *num_surrogates = 0; 1360 *maxchar = 0; 1361 1362 for (iter = begin; iter < end; ) { 1363#if SIZEOF_WCHAR_T == 2 1364 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1365 && (iter+1) < end 1366 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1367 { 1368 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1369 ++(*num_surrogates); 1370 iter += 2; 1371 } 1372 else 1373#endif 1374 { 1375 ch = *iter; 1376 iter++; 1377 } 1378 if (ch > *maxchar) { 1379 *maxchar = ch; 1380 if (*maxchar > MAX_UNICODE) { 1381 PyErr_Format(PyExc_ValueError, 1382 "character U+%x is not in range [U+0000; U+10ffff]", 1383 ch); 1384 return -1; 1385 } 1386 } 1387 } 1388 return 0; 1389} 1390 1391int 1392_PyUnicode_Ready(PyObject *unicode) 1393{ 1394 wchar_t *end; 1395 Py_UCS4 maxchar = 0; 1396 Py_ssize_t num_surrogates; 1397#if SIZEOF_WCHAR_T == 2 1398 Py_ssize_t length_wo_surrogates; 1399#endif 1400 1401 /* _PyUnicode_Ready() is only intended for old-style API usage where 1402 strings were created using _PyObject_New() and where no canonical 1403 representation (the str field) has been set yet aka strings 1404 which are not yet ready. 
*/ 1405 assert(_PyUnicode_CHECK(unicode)); 1406 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1407 assert(_PyUnicode_WSTR(unicode) != NULL); 1408 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1409 assert(_PyUnicode_UTF8(unicode) == NULL); 1410 /* Actually, it should neither be interned nor be anything else: */ 1411 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1412 1413 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1414 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1415 &maxchar, &num_surrogates) == -1) 1416 return -1; 1417 1418 if (maxchar < 256) { 1419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1420 if (!_PyUnicode_DATA_ANY(unicode)) { 1421 PyErr_NoMemory(); 1422 return -1; 1423 } 1424 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1425 _PyUnicode_WSTR(unicode), end, 1426 PyUnicode_1BYTE_DATA(unicode)); 1427 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1428 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1429 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1430 if (maxchar < 128) { 1431 _PyUnicode_STATE(unicode).ascii = 1; 1432 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1433 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1434 } 1435 else { 1436 _PyUnicode_STATE(unicode).ascii = 0; 1437 _PyUnicode_UTF8(unicode) = NULL; 1438 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1439 } 1440 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1441 _PyUnicode_WSTR(unicode) = NULL; 1442 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1443 } 1444 /* In this case we might have to convert down from 4-byte native 1445 wchar_t to 2-byte unicode. */ 1446 else if (maxchar < 65536) { 1447 assert(num_surrogates == 0 && 1448 "FindMaxCharAndNumSurrogatePairs() messed up"); 1449 1450#if SIZEOF_WCHAR_T == 2 1451 /* We can share representations and are done. 
*/ 1452 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1453 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1454 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1455 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1456 _PyUnicode_UTF8(unicode) = NULL; 1457 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1458#else 1459 /* sizeof(wchar_t) == 4 */ 1460 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1461 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1462 if (!_PyUnicode_DATA_ANY(unicode)) { 1463 PyErr_NoMemory(); 1464 return -1; 1465 } 1466 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1467 _PyUnicode_WSTR(unicode), end, 1468 PyUnicode_2BYTE_DATA(unicode)); 1469 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1470 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1471 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1472 _PyUnicode_UTF8(unicode) = NULL; 1473 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1474 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1475 _PyUnicode_WSTR(unicode) = NULL; 1476 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1477#endif 1478 } 1479 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1480 else { 1481#if SIZEOF_WCHAR_T == 2 1482 /* in case the native representation is 2-bytes, we need to allocate a 1483 new normalized 4-byte version. 
*/ 1484 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1485 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1486 if (!_PyUnicode_DATA_ANY(unicode)) { 1487 PyErr_NoMemory(); 1488 return -1; 1489 } 1490 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1491 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1492 _PyUnicode_UTF8(unicode) = NULL; 1493 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1494 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1495 _PyUnicode_STATE(unicode).ready = 1; 1496 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1497 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1498 _PyUnicode_WSTR(unicode) = NULL; 1499 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1500#else 1501 assert(num_surrogates == 0); 1502 1503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1504 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1505 _PyUnicode_UTF8(unicode) = NULL; 1506 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1507 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1508#endif 1509 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1510 } 1511 _PyUnicode_STATE(unicode).ready = 1; 1512 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1513 return 0; 1514} 1515 1516static void 1517unicode_dealloc(register PyObject *unicode) 1518{ 1519 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1520 case SSTATE_NOT_INTERNED: 1521 break; 1522 1523 case SSTATE_INTERNED_MORTAL: 1524 /* revive dead object temporarily for DelItem */ 1525 Py_REFCNT(unicode) = 3; 1526 if (PyDict_DelItem(interned, unicode) != 0) 1527 Py_FatalError( 1528 "deletion of interned string failed"); 1529 break; 1530 1531 case SSTATE_INTERNED_IMMORTAL: 1532 Py_FatalError("Immortal interned string died."); 1533 1534 default: 1535 Py_FatalError("Inconsistent interned string state."); 1536 } 1537 1538 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1539 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1540 if 
(_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1541 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1542 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1543 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1544 1545 Py_TYPE(unicode)->tp_free(unicode); 1546} 1547 1548#ifdef Py_DEBUG 1549static int 1550unicode_is_singleton(PyObject *unicode) 1551{ 1552 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1553 if (unicode == unicode_empty) 1554 return 1; 1555 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1556 { 1557 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1558 if (ch < 256 && unicode_latin1[ch] == unicode) 1559 return 1; 1560 } 1561 return 0; 1562} 1563#endif 1564 1565static int 1566unicode_modifiable(PyObject *unicode) 1567{ 1568 assert(_PyUnicode_CHECK(unicode)); 1569 if (Py_REFCNT(unicode) != 1) 1570 return 0; 1571 if (_PyUnicode_HASH(unicode) != -1) 1572 return 0; 1573 if (PyUnicode_CHECK_INTERNED(unicode)) 1574 return 0; 1575 if (!PyUnicode_CheckExact(unicode)) 1576 return 0; 1577#ifdef Py_DEBUG 1578 /* singleton refcount is greater than 1 */ 1579 assert(!unicode_is_singleton(unicode)); 1580#endif 1581 return 1; 1582} 1583 1584static int 1585unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1586{ 1587 PyObject *unicode; 1588 Py_ssize_t old_length; 1589 1590 assert(p_unicode != NULL); 1591 unicode = *p_unicode; 1592 1593 assert(unicode != NULL); 1594 assert(PyUnicode_Check(unicode)); 1595 assert(0 <= length); 1596 1597 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1598 old_length = PyUnicode_WSTR_LENGTH(unicode); 1599 else 1600 old_length = PyUnicode_GET_LENGTH(unicode); 1601 if (old_length == length) 1602 return 0; 1603 1604 if (length == 0) { 1605 Py_DECREF(*p_unicode); 1606 *p_unicode = unicode_empty; 1607 Py_INCREF(*p_unicode); 1608 return 0; 1609 } 1610 1611 if (!unicode_modifiable(unicode)) { 1612 PyObject *copy = resize_copy(unicode, length); 1613 if (copy == NULL) 1614 return -1; 1615 Py_DECREF(*p_unicode); 1616 
*p_unicode = copy; 1617 return 0; 1618 } 1619 1620 if (PyUnicode_IS_COMPACT(unicode)) { 1621 PyObject *new_unicode = resize_compact(unicode, length); 1622 if (new_unicode == NULL) 1623 return -1; 1624 *p_unicode = new_unicode; 1625 return 0; 1626 } 1627 return resize_inplace(unicode, length); 1628} 1629 1630int 1631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1632{ 1633 PyObject *unicode; 1634 if (p_unicode == NULL) { 1635 PyErr_BadInternalCall(); 1636 return -1; 1637 } 1638 unicode = *p_unicode; 1639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1640 { 1641 PyErr_BadInternalCall(); 1642 return -1; 1643 } 1644 return unicode_resize(p_unicode, length); 1645} 1646 1647static int 1648unicode_widen(PyObject **p_unicode, Py_ssize_t length, 1649 unsigned int maxchar) 1650{ 1651 PyObject *result; 1652 assert(PyUnicode_IS_READY(*p_unicode)); 1653 assert(length <= PyUnicode_GET_LENGTH(*p_unicode)); 1654 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode)) 1655 return 0; 1656 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode), 1657 maxchar); 1658 if (result == NULL) 1659 return -1; 1660 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length); 1661 Py_DECREF(*p_unicode); 1662 *p_unicode = result; 1663 return 0; 1664} 1665 1666static int 1667unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos, 1668 Py_UCS4 ch) 1669{ 1670 assert(ch <= MAX_UNICODE); 1671 if (unicode_widen(p_unicode, *pos, ch) < 0) 1672 return -1; 1673 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode), 1674 PyUnicode_DATA(*p_unicode), 1675 (*pos)++, ch); 1676 return 0; 1677} 1678 1679/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1680 1681 WARNING: The function doesn't copy the terminating null character and 1682 doesn't check the maximum character (may write a latin1 character in an 1683 ASCII string). 
*/ 1684static void 1685unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1686 const char *str, Py_ssize_t len) 1687{ 1688 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1689 void *data = PyUnicode_DATA(unicode); 1690 const char *end = str + len; 1691 1692 switch (kind) { 1693 case PyUnicode_1BYTE_KIND: { 1694 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1695#ifdef Py_DEBUG 1696 if (PyUnicode_IS_ASCII(unicode)) { 1697 Py_UCS4 maxchar = ucs1lib_find_max_char( 1698 (const Py_UCS1*)str, 1699 (const Py_UCS1*)str + len); 1700 assert(maxchar < 128); 1701 } 1702#endif 1703 memcpy((char *) data + index, str, len); 1704 break; 1705 } 1706 case PyUnicode_2BYTE_KIND: { 1707 Py_UCS2 *start = (Py_UCS2 *)data + index; 1708 Py_UCS2 *ucs2 = start; 1709 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1710 1711 for (; str < end; ++ucs2, ++str) 1712 *ucs2 = (Py_UCS2)*str; 1713 1714 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1715 break; 1716 } 1717 default: { 1718 Py_UCS4 *start = (Py_UCS4 *)data + index; 1719 Py_UCS4 *ucs4 = start; 1720 assert(kind == PyUnicode_4BYTE_KIND); 1721 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1722 1723 for (; str < end; ++ucs4, ++str) 1724 *ucs4 = (Py_UCS4)*str; 1725 1726 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1727 } 1728 } 1729} 1730 1731 1732static PyObject* 1733get_latin1_char(unsigned char ch) 1734{ 1735 PyObject *unicode = unicode_latin1[ch]; 1736 if (!unicode) { 1737 unicode = PyUnicode_New(1, ch); 1738 if (!unicode) 1739 return NULL; 1740 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1741 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1742 unicode_latin1[ch] = unicode; 1743 } 1744 Py_INCREF(unicode); 1745 return unicode; 1746} 1747 1748PyObject * 1749PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1750{ 1751 PyObject *unicode; 1752 Py_UCS4 maxchar = 0; 1753 Py_ssize_t num_surrogates; 1754 1755 if (u == NULL) 1756 return (PyObject*)_PyUnicode_New(size); 1757 1758 /* If the Unicode data 
is known at construction time, we can apply 1759 some optimizations which share commonly used objects. */ 1760 1761 /* Optimization for empty strings */ 1762 if (size == 0 && unicode_empty != NULL) { 1763 Py_INCREF(unicode_empty); 1764 return unicode_empty; 1765 } 1766 1767 /* Single character Unicode objects in the Latin-1 range are 1768 shared when using this constructor */ 1769 if (size == 1 && *u < 256) 1770 return get_latin1_char((unsigned char)*u); 1771 1772 /* If not empty and not single character, copy the Unicode data 1773 into the new object */ 1774 if (find_maxchar_surrogates(u, u + size, 1775 &maxchar, &num_surrogates) == -1) 1776 return NULL; 1777 1778 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1779 if (!unicode) 1780 return NULL; 1781 1782 switch (PyUnicode_KIND(unicode)) { 1783 case PyUnicode_1BYTE_KIND: 1784 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1785 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1786 break; 1787 case PyUnicode_2BYTE_KIND: 1788#if Py_UNICODE_SIZE == 2 1789 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1790#else 1791 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1792 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1793#endif 1794 break; 1795 case PyUnicode_4BYTE_KIND: 1796#if SIZEOF_WCHAR_T == 2 1797 /* This is the only case which has to process surrogates, thus 1798 a simple copy loop is not enough and we need a function. 
*/ 1799 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1800#else 1801 assert(num_surrogates == 0); 1802 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1803#endif 1804 break; 1805 default: 1806 assert(0 && "Impossible state"); 1807 } 1808 1809 return unicode_result(unicode); 1810} 1811 1812PyObject * 1813PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1814{ 1815 if (size < 0) { 1816 PyErr_SetString(PyExc_SystemError, 1817 "Negative size passed to PyUnicode_FromStringAndSize"); 1818 return NULL; 1819 } 1820 if (u != NULL) 1821 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1822 else 1823 return (PyObject *)_PyUnicode_New(size); 1824} 1825 1826PyObject * 1827PyUnicode_FromString(const char *u) 1828{ 1829 size_t size = strlen(u); 1830 if (size > PY_SSIZE_T_MAX) { 1831 PyErr_SetString(PyExc_OverflowError, "input too long"); 1832 return NULL; 1833 } 1834 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1835} 1836 1837PyObject * 1838_PyUnicode_FromId(_Py_Identifier *id) 1839{ 1840 if (!id->object) { 1841 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1842 strlen(id->string), 1843 NULL, NULL); 1844 if (!id->object) 1845 return NULL; 1846 PyUnicode_InternInPlace(&id->object); 1847 assert(!id->next); 1848 id->next = static_strings; 1849 static_strings = id; 1850 } 1851 return id->object; 1852} 1853 1854void 1855_PyUnicode_ClearStaticStrings() 1856{ 1857 _Py_Identifier *i; 1858 for (i = static_strings; i; i = i->next) { 1859 Py_DECREF(i->object); 1860 i->object = NULL; 1861 i->next = NULL; 1862 } 1863} 1864 1865/* Internal function, doesn't check maximum character */ 1866 1867PyObject* 1868_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1869{ 1870 const unsigned char *s = (const unsigned char *)buffer; 1871 PyObject *unicode; 1872 if (size == 1) { 1873#ifdef Py_DEBUG 1874 assert(s[0] < 128); 1875#endif 1876 return get_latin1_char(s[0]); 1877 } 1878 unicode = PyUnicode_New(size, 127); 1879 if (!unicode) 
1880 return NULL; 1881 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1882 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1883 return unicode; 1884} 1885 1886static Py_UCS4 1887kind_maxchar_limit(unsigned int kind) 1888{ 1889 switch (kind) { 1890 case PyUnicode_1BYTE_KIND: 1891 return 0x80; 1892 case PyUnicode_2BYTE_KIND: 1893 return 0x100; 1894 case PyUnicode_4BYTE_KIND: 1895 return 0x10000; 1896 default: 1897 assert(0 && "invalid kind"); 1898 return MAX_UNICODE; 1899 } 1900} 1901 1902Py_LOCAL_INLINE(Py_UCS4) 1903align_maxchar(Py_UCS4 maxchar) 1904{ 1905 if (maxchar <= 127) 1906 return 127; 1907 else if (maxchar <= 255) 1908 return 255; 1909 else if (maxchar <= 65535) 1910 return 65535; 1911 else 1912 return MAX_UNICODE; 1913} 1914 1915static PyObject* 1916_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1917{ 1918 PyObject *res; 1919 unsigned char max_char; 1920 1921 if (size == 0) { 1922 Py_INCREF(unicode_empty); 1923 return unicode_empty; 1924 } 1925 assert(size > 0); 1926 if (size == 1) 1927 return get_latin1_char(u[0]); 1928 1929 max_char = ucs1lib_find_max_char(u, u + size); 1930 res = PyUnicode_New(size, max_char); 1931 if (!res) 1932 return NULL; 1933 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1934 assert(_PyUnicode_CheckConsistency(res, 1)); 1935 return res; 1936} 1937 1938static PyObject* 1939_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1940{ 1941 PyObject *res; 1942 Py_UCS2 max_char; 1943 1944 if (size == 0) { 1945 Py_INCREF(unicode_empty); 1946 return unicode_empty; 1947 } 1948 assert(size > 0); 1949 if (size == 1) { 1950 Py_UCS4 ch = u[0]; 1951 if (ch < 256) 1952 return get_latin1_char((unsigned char)ch); 1953 1954 res = PyUnicode_New(1, ch); 1955 if (res == NULL) 1956 return NULL; 1957 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1958 assert(_PyUnicode_CheckConsistency(res, 1)); 1959 return res; 1960 } 1961 1962 max_char = ucs2lib_find_max_char(u, u + size); 1963 res = PyUnicode_New(size, max_char); 1964 
if (!res) 1965 return NULL; 1966 if (max_char >= 256) 1967 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1968 else { 1969 _PyUnicode_CONVERT_BYTES( 1970 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1971 } 1972 assert(_PyUnicode_CheckConsistency(res, 1)); 1973 return res; 1974} 1975 1976static PyObject* 1977_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1978{ 1979 PyObject *res; 1980 Py_UCS4 max_char; 1981 1982 if (size == 0) { 1983 Py_INCREF(unicode_empty); 1984 return unicode_empty; 1985 } 1986 assert(size > 0); 1987 if (size == 1) { 1988 Py_UCS4 ch = u[0]; 1989 if (ch < 256) 1990 return get_latin1_char((unsigned char)ch); 1991 1992 res = PyUnicode_New(1, ch); 1993 if (res == NULL) 1994 return NULL; 1995 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1996 assert(_PyUnicode_CheckConsistency(res, 1)); 1997 return res; 1998 } 1999 2000 max_char = ucs4lib_find_max_char(u, u + size); 2001 res = PyUnicode_New(size, max_char); 2002 if (!res) 2003 return NULL; 2004 if (max_char < 256) 2005 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2006 PyUnicode_1BYTE_DATA(res)); 2007 else if (max_char < 0x10000) 2008 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2009 PyUnicode_2BYTE_DATA(res)); 2010 else 2011 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2012 assert(_PyUnicode_CheckConsistency(res, 1)); 2013 return res; 2014} 2015 2016PyObject* 2017PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2018{ 2019 if (size < 0) { 2020 PyErr_SetString(PyExc_ValueError, "size must be positive"); 2021 return NULL; 2022 } 2023 switch (kind) { 2024 case PyUnicode_1BYTE_KIND: 2025 return _PyUnicode_FromUCS1(buffer, size); 2026 case PyUnicode_2BYTE_KIND: 2027 return _PyUnicode_FromUCS2(buffer, size); 2028 case PyUnicode_4BYTE_KIND: 2029 return _PyUnicode_FromUCS4(buffer, size); 2030 default: 2031 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2032 return NULL; 2033 } 2034} 2035 
2036Py_UCS4 2037_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2038{ 2039 enum PyUnicode_Kind kind; 2040 void *startptr, *endptr; 2041 2042 assert(PyUnicode_IS_READY(unicode)); 2043 assert(0 <= start); 2044 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2045 assert(start <= end); 2046 2047 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2048 return PyUnicode_MAX_CHAR_VALUE(unicode); 2049 2050 if (start == end) 2051 return 127; 2052 2053 if (PyUnicode_IS_ASCII(unicode)) 2054 return 127; 2055 2056 kind = PyUnicode_KIND(unicode); 2057 startptr = PyUnicode_DATA(unicode); 2058 endptr = (char *)startptr + end * kind; 2059 startptr = (char *)startptr + start * kind; 2060 switch(kind) { 2061 case PyUnicode_1BYTE_KIND: 2062 return ucs1lib_find_max_char(startptr, endptr); 2063 case PyUnicode_2BYTE_KIND: 2064 return ucs2lib_find_max_char(startptr, endptr); 2065 case PyUnicode_4BYTE_KIND: 2066 return ucs4lib_find_max_char(startptr, endptr); 2067 default: 2068 assert(0); 2069 return 0; 2070 } 2071} 2072 2073/* Ensure that a string uses the most efficient storage, if it is not the 2074 case: create a new string with of the right kind. Write NULL into *p_unicode 2075 on error. 
*/ 2076static void 2077unicode_adjust_maxchar(PyObject **p_unicode) 2078{ 2079 PyObject *unicode, *copy; 2080 Py_UCS4 max_char; 2081 Py_ssize_t len; 2082 unsigned int kind; 2083 2084 assert(p_unicode != NULL); 2085 unicode = *p_unicode; 2086 assert(PyUnicode_IS_READY(unicode)); 2087 if (PyUnicode_IS_ASCII(unicode)) 2088 return; 2089 2090 len = PyUnicode_GET_LENGTH(unicode); 2091 kind = PyUnicode_KIND(unicode); 2092 if (kind == PyUnicode_1BYTE_KIND) { 2093 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2094 max_char = ucs1lib_find_max_char(u, u + len); 2095 if (max_char >= 128) 2096 return; 2097 } 2098 else if (kind == PyUnicode_2BYTE_KIND) { 2099 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2100 max_char = ucs2lib_find_max_char(u, u + len); 2101 if (max_char >= 256) 2102 return; 2103 } 2104 else { 2105 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2106 assert(kind == PyUnicode_4BYTE_KIND); 2107 max_char = ucs4lib_find_max_char(u, u + len); 2108 if (max_char >= 0x10000) 2109 return; 2110 } 2111 copy = PyUnicode_New(len, max_char); 2112 if (copy != NULL) 2113 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2114 Py_DECREF(unicode); 2115 *p_unicode = copy; 2116} 2117 2118PyObject* 2119_PyUnicode_Copy(PyObject *unicode) 2120{ 2121 Py_ssize_t length; 2122 PyObject *copy; 2123 2124 if (!PyUnicode_Check(unicode)) { 2125 PyErr_BadInternalCall(); 2126 return NULL; 2127 } 2128 if (PyUnicode_READY(unicode) == -1) 2129 return NULL; 2130 2131 length = PyUnicode_GET_LENGTH(unicode); 2132 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2133 if (!copy) 2134 return NULL; 2135 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2136 2137 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2138 length * PyUnicode_KIND(unicode)); 2139 assert(_PyUnicode_CheckConsistency(copy, 1)); 2140 return copy; 2141} 2142 2143 2144/* Widen Unicode objects to larger buffers. Don't write terminating null 2145 character. Return NULL on error. 
*/ 2146 2147void* 2148_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2149{ 2150 Py_ssize_t len; 2151 void *result; 2152 unsigned int skind; 2153 2154 if (PyUnicode_READY(s) == -1) 2155 return NULL; 2156 2157 len = PyUnicode_GET_LENGTH(s); 2158 skind = PyUnicode_KIND(s); 2159 if (skind >= kind) { 2160 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2161 return NULL; 2162 } 2163 switch (kind) { 2164 case PyUnicode_2BYTE_KIND: 2165 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 2166 if (!result) 2167 return PyErr_NoMemory(); 2168 assert(skind == PyUnicode_1BYTE_KIND); 2169 _PyUnicode_CONVERT_BYTES( 2170 Py_UCS1, Py_UCS2, 2171 PyUnicode_1BYTE_DATA(s), 2172 PyUnicode_1BYTE_DATA(s) + len, 2173 result); 2174 return result; 2175 case PyUnicode_4BYTE_KIND: 2176 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 2177 if (!result) 2178 return PyErr_NoMemory(); 2179 if (skind == PyUnicode_2BYTE_KIND) { 2180 _PyUnicode_CONVERT_BYTES( 2181 Py_UCS2, Py_UCS4, 2182 PyUnicode_2BYTE_DATA(s), 2183 PyUnicode_2BYTE_DATA(s) + len, 2184 result); 2185 } 2186 else { 2187 assert(skind == PyUnicode_1BYTE_KIND); 2188 _PyUnicode_CONVERT_BYTES( 2189 Py_UCS1, Py_UCS4, 2190 PyUnicode_1BYTE_DATA(s), 2191 PyUnicode_1BYTE_DATA(s) + len, 2192 result); 2193 } 2194 return result; 2195 default: 2196 break; 2197 } 2198 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2199 return NULL; 2200} 2201 2202static Py_UCS4* 2203as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2204 int copy_null) 2205{ 2206 int kind; 2207 void *data; 2208 Py_ssize_t len, targetlen; 2209 if (PyUnicode_READY(string) == -1) 2210 return NULL; 2211 kind = PyUnicode_KIND(string); 2212 data = PyUnicode_DATA(string); 2213 len = PyUnicode_GET_LENGTH(string); 2214 targetlen = len; 2215 if (copy_null) 2216 targetlen++; 2217 if (!target) { 2218 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 2219 PyErr_NoMemory(); 2220 return NULL; 2221 } 2222 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 2223 if 
(!target) { 2224 PyErr_NoMemory(); 2225 return NULL; 2226 } 2227 } 2228 else { 2229 if (targetsize < targetlen) { 2230 PyErr_Format(PyExc_SystemError, 2231 "string is longer than the buffer"); 2232 if (copy_null && 0 < targetsize) 2233 target[0] = 0; 2234 return NULL; 2235 } 2236 } 2237 if (kind == PyUnicode_1BYTE_KIND) { 2238 Py_UCS1 *start = (Py_UCS1 *) data; 2239 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2240 } 2241 else if (kind == PyUnicode_2BYTE_KIND) { 2242 Py_UCS2 *start = (Py_UCS2 *) data; 2243 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2244 } 2245 else { 2246 assert(kind == PyUnicode_4BYTE_KIND); 2247 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2248 } 2249 if (copy_null) 2250 target[len] = 0; 2251 return target; 2252} 2253 2254Py_UCS4* 2255PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2256 int copy_null) 2257{ 2258 if (target == NULL || targetsize < 0) { 2259 PyErr_BadInternalCall(); 2260 return NULL; 2261 } 2262 return as_ucs4(string, target, targetsize, copy_null); 2263} 2264 2265Py_UCS4* 2266PyUnicode_AsUCS4Copy(PyObject *string) 2267{ 2268 return as_ucs4(string, NULL, 0, 1); 2269} 2270 2271#ifdef HAVE_WCHAR_H 2272 2273PyObject * 2274PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2275{ 2276 if (w == NULL) { 2277 if (size == 0) { 2278 Py_INCREF(unicode_empty); 2279 return unicode_empty; 2280 } 2281 PyErr_BadInternalCall(); 2282 return NULL; 2283 } 2284 2285 if (size == -1) { 2286 size = wcslen(w); 2287 } 2288 2289 return PyUnicode_FromUnicode(w, size); 2290} 2291 2292#endif /* HAVE_WCHAR_H */ 2293 2294static void 2295makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2296 char c) 2297{ 2298 *fmt++ = '%'; 2299 if (longflag) 2300 *fmt++ = 'l'; 2301 else if (longlongflag) { 2302 /* longlongflag should only ever be nonzero on machines with 2303 HAVE_LONG_LONG defined */ 2304#ifdef HAVE_LONG_LONG 2305 char *f = PY_FORMAT_LONG_LONG; 
2306 while (*f) 2307 *fmt++ = *f++; 2308#else 2309 /* we shouldn't ever get here */ 2310 assert(0); 2311 *fmt++ = 'l'; 2312#endif 2313 } 2314 else if (size_tflag) { 2315 char *f = PY_FORMAT_SIZE_T; 2316 while (*f) 2317 *fmt++ = *f++; 2318 } 2319 *fmt++ = c; 2320 *fmt = '\0'; 2321} 2322 2323/* maximum number of characters required for output of %lld or %p. 2324 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2325 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2326#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2327 2328static const char* 2329unicode_fromformat_arg(_PyUnicodeWriter *writer, 2330 const char *f, va_list *vargs) 2331{ 2332 const char *p; 2333 Py_ssize_t len; 2334 int zeropad; 2335 int width; 2336 int precision; 2337 int longflag; 2338 int longlongflag; 2339 int size_tflag; 2340 int fill; 2341 2342 p = f; 2343 f++; 2344 zeropad = 0; 2345 if (*f == '0') { 2346 zeropad = 1; 2347 f++; 2348 } 2349 2350 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2351 width = 0; 2352 while (Py_ISDIGIT((unsigned)*f)) { 2353 if (width > (INT_MAX - ((int)*f - '0')) / 10) { 2354 PyErr_SetString(PyExc_ValueError, 2355 "width too big"); 2356 return NULL; 2357 } 2358 width = (width*10) + (*f - '0'); 2359 f++; 2360 } 2361 precision = 0; 2362 if (*f == '.') { 2363 f++; 2364 while (Py_ISDIGIT((unsigned)*f)) { 2365 if (precision > (INT_MAX - ((int)*f - '0')) / 10) { 2366 PyErr_SetString(PyExc_ValueError, 2367 "precision too big"); 2368 return NULL; 2369 } 2370 precision = (precision*10) + (*f - '0'); 2371 f++; 2372 } 2373 if (*f == '%') { 2374 /* "%.3%s" => f points to "3" */ 2375 f--; 2376 } 2377 } 2378 if (*f == '\0') { 2379 /* bogus format "%.123" => go backward, f points to "3" */ 2380 f--; 2381 } 2382 2383 /* Handle %ld, %lu, %lld and %llu. 
*/ 2384 longflag = 0; 2385 longlongflag = 0; 2386 size_tflag = 0; 2387 if (*f == 'l') { 2388 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2389 longflag = 1; 2390 ++f; 2391 } 2392#ifdef HAVE_LONG_LONG 2393 else if (f[1] == 'l' && 2394 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2395 longlongflag = 1; 2396 f += 2; 2397 } 2398#endif 2399 } 2400 /* handle the size_t flag. */ 2401 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2402 size_tflag = 1; 2403 ++f; 2404 } 2405 2406 if (f[1] == '\0') 2407 writer->overallocate = 0; 2408 2409 switch (*f) { 2410 case 'c': 2411 { 2412 int ordinal = va_arg(*vargs, int); 2413 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2414 PyErr_SetString(PyExc_ValueError, 2415 "character argument not in range(0x110000)"); 2416 return NULL; 2417 } 2418 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1) 2419 return NULL; 2420 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal); 2421 writer->pos++; 2422 break; 2423 } 2424 2425 case 'i': 2426 case 'd': 2427 case 'u': 2428 case 'x': 2429 { 2430 /* used by sprintf */ 2431 char fmt[10]; /* should be enough for "%0lld\0" */ 2432 char buffer[MAX_LONG_LONG_CHARS]; 2433 2434 if (*f == 'u') { 2435 makefmt(fmt, longflag, longlongflag, size_tflag, *f); 2436 2437 if (longflag) 2438 len = sprintf(buffer, fmt, 2439 va_arg(*vargs, unsigned long)); 2440#ifdef HAVE_LONG_LONG 2441 else if (longlongflag) 2442 len = sprintf(buffer, fmt, 2443 va_arg(*vargs, unsigned PY_LONG_LONG)); 2444#endif 2445 else if (size_tflag) 2446 len = sprintf(buffer, fmt, 2447 va_arg(*vargs, size_t)); 2448 else 2449 len = sprintf(buffer, fmt, 2450 va_arg(*vargs, unsigned int)); 2451 } 2452 else if (*f == 'x') { 2453 makefmt(fmt, 0, 0, 0, 'x'); 2454 len = sprintf(buffer, fmt, va_arg(*vargs, int)); 2455 } 2456 else { 2457 makefmt(fmt, longflag, longlongflag, size_tflag, *f); 2458 2459 if (longflag) 2460 len = sprintf(buffer, fmt, 2461 va_arg(*vargs, long)); 2462#ifdef HAVE_LONG_LONG 2463 else if 
(longlongflag) 2464 len = sprintf(buffer, fmt, 2465 va_arg(*vargs, PY_LONG_LONG)); 2466#endif 2467 else if (size_tflag) 2468 len = sprintf(buffer, fmt, 2469 va_arg(*vargs, Py_ssize_t)); 2470 else 2471 len = sprintf(buffer, fmt, 2472 va_arg(*vargs, int)); 2473 } 2474 assert(len >= 0); 2475 2476 if (precision < len) 2477 precision = len; 2478 if (width > precision) { 2479 Py_UCS4 fillchar; 2480 fill = width - precision; 2481 fillchar = zeropad?'0':' '; 2482 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1) 2483 return NULL; 2484 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) 2485 return NULL; 2486 writer->pos += fill; 2487 } 2488 if (precision > len) { 2489 fill = precision - len; 2490 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1) 2491 return NULL; 2492 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) 2493 return NULL; 2494 writer->pos += fill; 2495 } 2496 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1) 2497 return NULL; 2498 break; 2499 } 2500 2501 case 'p': 2502 { 2503 char number[MAX_LONG_LONG_CHARS]; 2504 2505 len = sprintf(number, "%p", va_arg(*vargs, void*)); 2506 assert(len >= 0); 2507 2508 /* %p is ill-defined: ensure leading 0x. 
*/ 2509 if (number[1] == 'X') 2510 number[1] = 'x'; 2511 else if (number[1] != 'x') { 2512 memmove(number + 2, number, 2513 strlen(number) + 1); 2514 number[0] = '0'; 2515 number[1] = 'x'; 2516 len += 2; 2517 } 2518 2519 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1) 2520 return NULL; 2521 break; 2522 } 2523 2524 case 's': 2525 { 2526 /* UTF-8 */ 2527 const char *s = va_arg(*vargs, const char*); 2528 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); 2529 if (!str) 2530 return NULL; 2531 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { 2532 Py_DECREF(str); 2533 return NULL; 2534 } 2535 Py_DECREF(str); 2536 break; 2537 } 2538 2539 case 'U': 2540 { 2541 PyObject *obj = va_arg(*vargs, PyObject *); 2542 assert(obj && _PyUnicode_CHECK(obj)); 2543 2544 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) 2545 return NULL; 2546 break; 2547 } 2548 2549 case 'V': 2550 { 2551 PyObject *obj = va_arg(*vargs, PyObject *); 2552 const char *str = va_arg(*vargs, const char *); 2553 PyObject *str_obj; 2554 assert(obj || str); 2555 if (obj) { 2556 assert(_PyUnicode_CHECK(obj)); 2557 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) 2558 return NULL; 2559 } 2560 else { 2561 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); 2562 if (!str_obj) 2563 return NULL; 2564 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) { 2565 Py_DECREF(str_obj); 2566 return NULL; 2567 } 2568 Py_DECREF(str_obj); 2569 } 2570 break; 2571 } 2572 2573 case 'S': 2574 { 2575 PyObject *obj = va_arg(*vargs, PyObject *); 2576 PyObject *str; 2577 assert(obj); 2578 str = PyObject_Str(obj); 2579 if (!str) 2580 return NULL; 2581 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { 2582 Py_DECREF(str); 2583 return NULL; 2584 } 2585 Py_DECREF(str); 2586 break; 2587 } 2588 2589 case 'R': 2590 { 2591 PyObject *obj = va_arg(*vargs, PyObject *); 2592 PyObject *repr; 2593 assert(obj); 2594 repr = PyObject_Repr(obj); 2595 if (!repr) 2596 return NULL; 2597 if 
(_PyUnicodeWriter_WriteStr(writer, repr) == -1) { 2598 Py_DECREF(repr); 2599 return NULL; 2600 } 2601 Py_DECREF(repr); 2602 break; 2603 } 2604 2605 case 'A': 2606 { 2607 PyObject *obj = va_arg(*vargs, PyObject *); 2608 PyObject *ascii; 2609 assert(obj); 2610 ascii = PyObject_ASCII(obj); 2611 if (!ascii) 2612 return NULL; 2613 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) { 2614 Py_DECREF(ascii); 2615 return NULL; 2616 } 2617 Py_DECREF(ascii); 2618 break; 2619 } 2620 2621 case '%': 2622 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1) 2623 return NULL; 2624 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%'); 2625 writer->pos++; 2626 break; 2627 2628 default: 2629 /* if we stumble upon an unknown formatting code, copy the rest 2630 of the format string to the output string. (we cannot just 2631 skip the code, since there's no way to know what's in the 2632 argument list) */ 2633 len = strlen(p); 2634 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1) 2635 return NULL; 2636 f = p+len; 2637 return f; 2638 } 2639 2640 f++; 2641 return f; 2642} 2643 2644PyObject * 2645PyUnicode_FromFormatV(const char *format, va_list vargs) 2646{ 2647 va_list vargs2; 2648 const char *f; 2649 _PyUnicodeWriter writer; 2650 2651 _PyUnicodeWriter_Init(&writer, strlen(format) + 100); 2652 2653 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). 2654 Copy it to be able to pass a reference to a subfunction. 
*/ 2655 Py_VA_COPY(vargs2, vargs); 2656 2657 for (f = format; *f; ) { 2658 if (*f == '%') { 2659 f = unicode_fromformat_arg(&writer, f, &vargs2); 2660 if (f == NULL) 2661 goto fail; 2662 } 2663 else { 2664 const char *p; 2665 Py_ssize_t len; 2666 2667 p = f; 2668 do 2669 { 2670 if ((unsigned char)*p > 127) { 2671 PyErr_Format(PyExc_ValueError, 2672 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2673 "string, got a non-ASCII byte: 0x%02x", 2674 (unsigned char)*p); 2675 return NULL; 2676 } 2677 p++; 2678 } 2679 while (*p != '\0' && *p != '%'); 2680 len = p - f; 2681 2682 if (*p == '\0') 2683 writer.overallocate = 0; 2684 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) 2685 goto fail; 2686 unicode_write_cstr(writer.buffer, writer.pos, f, len); 2687 writer.pos += len; 2688 2689 f = p; 2690 } 2691 } 2692 return _PyUnicodeWriter_Finish(&writer); 2693 2694 fail: 2695 _PyUnicodeWriter_Dealloc(&writer); 2696 return NULL; 2697} 2698 2699PyObject * 2700PyUnicode_FromFormat(const char *format, ...) 2701{ 2702 PyObject* ret; 2703 va_list vargs; 2704 2705#ifdef HAVE_STDARG_PROTOTYPES 2706 va_start(vargs, format); 2707#else 2708 va_start(vargs); 2709#endif 2710 ret = PyUnicode_FromFormatV(format, vargs); 2711 va_end(vargs); 2712 return ret; 2713} 2714 2715#ifdef HAVE_WCHAR_H 2716 2717/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2718 convert a Unicode object to a wide character string. 2719 2720 - If w is NULL: return the number of wide characters (including the null 2721 character) required to convert the unicode object. Ignore size argument. 2722 2723 - Otherwise: return the number of wide characters (excluding the null 2724 character) written into w. Write at most size wide characters (including 2725 the null character). 
*/ 2726static Py_ssize_t 2727unicode_aswidechar(PyObject *unicode, 2728 wchar_t *w, 2729 Py_ssize_t size) 2730{ 2731 Py_ssize_t res; 2732 const wchar_t *wstr; 2733 2734 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2735 if (wstr == NULL) 2736 return -1; 2737 2738 if (w != NULL) { 2739 if (size > res) 2740 size = res + 1; 2741 else 2742 res = size; 2743 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2744 return res; 2745 } 2746 else 2747 return res + 1; 2748} 2749 2750Py_ssize_t 2751PyUnicode_AsWideChar(PyObject *unicode, 2752 wchar_t *w, 2753 Py_ssize_t size) 2754{ 2755 if (unicode == NULL) { 2756 PyErr_BadInternalCall(); 2757 return -1; 2758 } 2759 return unicode_aswidechar(unicode, w, size); 2760} 2761 2762wchar_t* 2763PyUnicode_AsWideCharString(PyObject *unicode, 2764 Py_ssize_t *size) 2765{ 2766 wchar_t* buffer; 2767 Py_ssize_t buflen; 2768 2769 if (unicode == NULL) { 2770 PyErr_BadInternalCall(); 2771 return NULL; 2772 } 2773 2774 buflen = unicode_aswidechar(unicode, NULL, 0); 2775 if (buflen == -1) 2776 return NULL; 2777 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2778 PyErr_NoMemory(); 2779 return NULL; 2780 } 2781 2782 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2783 if (buffer == NULL) { 2784 PyErr_NoMemory(); 2785 return NULL; 2786 } 2787 buflen = unicode_aswidechar(unicode, buffer, buflen); 2788 if (buflen == -1) { 2789 PyMem_FREE(buffer); 2790 return NULL; 2791 } 2792 if (size != NULL) 2793 *size = buflen; 2794 return buffer; 2795} 2796 2797#endif /* HAVE_WCHAR_H */ 2798 2799PyObject * 2800PyUnicode_FromOrdinal(int ordinal) 2801{ 2802 PyObject *v; 2803 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2804 PyErr_SetString(PyExc_ValueError, 2805 "chr() arg not in range(0x110000)"); 2806 return NULL; 2807 } 2808 2809 if (ordinal < 256) 2810 return get_latin1_char(ordinal); 2811 2812 v = PyUnicode_New(1, ordinal); 2813 if (v == NULL) 2814 return NULL; 2815 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2816 
assert(_PyUnicode_CheckConsistency(v, 1)); 2817 return v; 2818} 2819 2820PyObject * 2821PyUnicode_FromObject(register PyObject *obj) 2822{ 2823 /* XXX Perhaps we should make this API an alias of 2824 PyObject_Str() instead ?! */ 2825 if (PyUnicode_CheckExact(obj)) { 2826 if (PyUnicode_READY(obj) == -1) 2827 return NULL; 2828 Py_INCREF(obj); 2829 return obj; 2830 } 2831 if (PyUnicode_Check(obj)) { 2832 /* For a Unicode subtype that's not a Unicode object, 2833 return a true Unicode object with the same data. */ 2834 return _PyUnicode_Copy(obj); 2835 } 2836 PyErr_Format(PyExc_TypeError, 2837 "Can't convert '%.100s' object to str implicitly", 2838 Py_TYPE(obj)->tp_name); 2839 return NULL; 2840} 2841 2842PyObject * 2843PyUnicode_FromEncodedObject(register PyObject *obj, 2844 const char *encoding, 2845 const char *errors) 2846{ 2847 Py_buffer buffer; 2848 PyObject *v; 2849 2850 if (obj == NULL) { 2851 PyErr_BadInternalCall(); 2852 return NULL; 2853 } 2854 2855 /* Decoding bytes objects is the most common case and should be fast */ 2856 if (PyBytes_Check(obj)) { 2857 if (PyBytes_GET_SIZE(obj) == 0) { 2858 Py_INCREF(unicode_empty); 2859 v = unicode_empty; 2860 } 2861 else { 2862 v = PyUnicode_Decode( 2863 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2864 encoding, errors); 2865 } 2866 return v; 2867 } 2868 2869 if (PyUnicode_Check(obj)) { 2870 PyErr_SetString(PyExc_TypeError, 2871 "decoding str is not supported"); 2872 return NULL; 2873 } 2874 2875 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2876 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2877 PyErr_Format(PyExc_TypeError, 2878 "coercing to str: need bytes, bytearray " 2879 "or buffer-like object, %.80s found", 2880 Py_TYPE(obj)->tp_name); 2881 return NULL; 2882 } 2883 2884 if (buffer.len == 0) { 2885 Py_INCREF(unicode_empty); 2886 v = unicode_empty; 2887 } 2888 else 2889 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2890 2891 
PyBuffer_Release(&buffer); 2892 return v; 2893} 2894 2895/* Convert encoding to lower case and replace '_' with '-' in order to 2896 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2897 1 on success. */ 2898static int 2899normalize_encoding(const char *encoding, 2900 char *lower, 2901 size_t lower_len) 2902{ 2903 const char *e; 2904 char *l; 2905 char *l_end; 2906 2907 if (encoding == NULL) { 2908 strcpy(lower, "utf-8"); 2909 return 1; 2910 } 2911 e = encoding; 2912 l = lower; 2913 l_end = &lower[lower_len - 1]; 2914 while (*e) { 2915 if (l == l_end) 2916 return 0; 2917 if (Py_ISUPPER(*e)) { 2918 *l++ = Py_TOLOWER(*e++); 2919 } 2920 else if (*e == '_') { 2921 *l++ = '-'; 2922 e++; 2923 } 2924 else { 2925 *l++ = *e++; 2926 } 2927 } 2928 *l = '\0'; 2929 return 1; 2930} 2931 2932PyObject * 2933PyUnicode_Decode(const char *s, 2934 Py_ssize_t size, 2935 const char *encoding, 2936 const char *errors) 2937{ 2938 PyObject *buffer = NULL, *unicode; 2939 Py_buffer info; 2940 char lower[11]; /* Enough for any encoding shortcut */ 2941 2942 /* Shortcuts for common default encodings */ 2943 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2944 if ((strcmp(lower, "utf-8") == 0) || 2945 (strcmp(lower, "utf8") == 0)) 2946 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2947 else if ((strcmp(lower, "latin-1") == 0) || 2948 (strcmp(lower, "latin1") == 0) || 2949 (strcmp(lower, "iso-8859-1") == 0)) 2950 return PyUnicode_DecodeLatin1(s, size, errors); 2951#ifdef HAVE_MBCS 2952 else if (strcmp(lower, "mbcs") == 0) 2953 return PyUnicode_DecodeMBCS(s, size, errors); 2954#endif 2955 else if (strcmp(lower, "ascii") == 0) 2956 return PyUnicode_DecodeASCII(s, size, errors); 2957 else if (strcmp(lower, "utf-16") == 0) 2958 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2959 else if (strcmp(lower, "utf-32") == 0) 2960 return PyUnicode_DecodeUTF32(s, size, errors, 0); 2961 } 2962 2963 /* Decode via the codec registry */ 2964 buffer = NULL; 2965 
if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2966 goto onError; 2967 buffer = PyMemoryView_FromBuffer(&info); 2968 if (buffer == NULL) 2969 goto onError; 2970 unicode = PyCodec_Decode(buffer, encoding, errors); 2971 if (unicode == NULL) 2972 goto onError; 2973 if (!PyUnicode_Check(unicode)) { 2974 PyErr_Format(PyExc_TypeError, 2975 "decoder did not return a str object (type=%.400s)", 2976 Py_TYPE(unicode)->tp_name); 2977 Py_DECREF(unicode); 2978 goto onError; 2979 } 2980 Py_DECREF(buffer); 2981 return unicode_result(unicode); 2982 2983 onError: 2984 Py_XDECREF(buffer); 2985 return NULL; 2986} 2987 2988PyObject * 2989PyUnicode_AsDecodedObject(PyObject *unicode, 2990 const char *encoding, 2991 const char *errors) 2992{ 2993 PyObject *v; 2994 2995 if (!PyUnicode_Check(unicode)) { 2996 PyErr_BadArgument(); 2997 goto onError; 2998 } 2999 3000 if (encoding == NULL) 3001 encoding = PyUnicode_GetDefaultEncoding(); 3002 3003 /* Decode via the codec registry */ 3004 v = PyCodec_Decode(unicode, encoding, errors); 3005 if (v == NULL) 3006 goto onError; 3007 return unicode_result(v); 3008 3009 onError: 3010 return NULL; 3011} 3012 3013PyObject * 3014PyUnicode_AsDecodedUnicode(PyObject *unicode, 3015 const char *encoding, 3016 const char *errors) 3017{ 3018 PyObject *v; 3019 3020 if (!PyUnicode_Check(unicode)) { 3021 PyErr_BadArgument(); 3022 goto onError; 3023 } 3024 3025 if (encoding == NULL) 3026 encoding = PyUnicode_GetDefaultEncoding(); 3027 3028 /* Decode via the codec registry */ 3029 v = PyCodec_Decode(unicode, encoding, errors); 3030 if (v == NULL) 3031 goto onError; 3032 if (!PyUnicode_Check(v)) { 3033 PyErr_Format(PyExc_TypeError, 3034 "decoder did not return a str object (type=%.400s)", 3035 Py_TYPE(v)->tp_name); 3036 Py_DECREF(v); 3037 goto onError; 3038 } 3039 return unicode_result(v); 3040 3041 onError: 3042 return NULL; 3043} 3044 3045PyObject * 3046PyUnicode_Encode(const Py_UNICODE *s, 3047 Py_ssize_t size, 3048 const char 
*encoding, 3049 const char *errors) 3050{ 3051 PyObject *v, *unicode; 3052 3053 unicode = PyUnicode_FromUnicode(s, size); 3054 if (unicode == NULL) 3055 return NULL; 3056 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3057 Py_DECREF(unicode); 3058 return v; 3059} 3060 3061PyObject * 3062PyUnicode_AsEncodedObject(PyObject *unicode, 3063 const char *encoding, 3064 const char *errors) 3065{ 3066 PyObject *v; 3067 3068 if (!PyUnicode_Check(unicode)) { 3069 PyErr_BadArgument(); 3070 goto onError; 3071 } 3072 3073 if (encoding == NULL) 3074 encoding = PyUnicode_GetDefaultEncoding(); 3075 3076 /* Encode via the codec registry */ 3077 v = PyCodec_Encode(unicode, encoding, errors); 3078 if (v == NULL) 3079 goto onError; 3080 return v; 3081 3082 onError: 3083 return NULL; 3084} 3085 3086static size_t 3087wcstombs_errorpos(const wchar_t *wstr) 3088{ 3089 size_t len; 3090#if SIZEOF_WCHAR_T == 2 3091 wchar_t buf[3]; 3092#else 3093 wchar_t buf[2]; 3094#endif 3095 char outbuf[MB_LEN_MAX]; 3096 const wchar_t *start, *previous; 3097 3098#if SIZEOF_WCHAR_T == 2 3099 buf[2] = 0; 3100#else 3101 buf[1] = 0; 3102#endif 3103 start = wstr; 3104 while (*wstr != L'\0') 3105 { 3106 previous = wstr; 3107#if SIZEOF_WCHAR_T == 2 3108 if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) 3109 && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) 3110 { 3111 buf[0] = wstr[0]; 3112 buf[1] = wstr[1]; 3113 wstr += 2; 3114 } 3115 else { 3116 buf[0] = *wstr; 3117 buf[1] = 0; 3118 wstr++; 3119 } 3120#else 3121 buf[0] = *wstr; 3122 wstr++; 3123#endif 3124 len = wcstombs(outbuf, buf, sizeof(outbuf)); 3125 if (len == (size_t)-1) 3126 return previous - start; 3127 } 3128 3129 /* failed to find the unencodable character */ 3130 return 0; 3131} 3132 3133static int 3134locale_error_handler(const char *errors, int *surrogateescape) 3135{ 3136 if (errors == NULL) { 3137 *surrogateescape = 0; 3138 return 0; 3139 } 3140 3141 if (strcmp(errors, "strict") == 0) { 3142 *surrogateescape = 0; 3143 return 0; 3144 } 3145 if 
(strcmp(errors, "surrogateescape") == 0) { 3146 *surrogateescape = 1; 3147 return 0; 3148 } 3149 PyErr_Format(PyExc_ValueError, 3150 "only 'strict' and 'surrogateescape' error handlers " 3151 "are supported, not '%s'", 3152 errors); 3153 return -1; 3154} 3155 3156PyObject * 3157PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3158{ 3159 Py_ssize_t wlen, wlen2; 3160 wchar_t *wstr; 3161 PyObject *bytes = NULL; 3162 char *errmsg; 3163 PyObject *reason; 3164 PyObject *exc; 3165 size_t error_pos; 3166 int surrogateescape; 3167 3168 if (locale_error_handler(errors, &surrogateescape) < 0) 3169 return NULL; 3170 3171 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3172 if (wstr == NULL) 3173 return NULL; 3174 3175 wlen2 = wcslen(wstr); 3176 if (wlen2 != wlen) { 3177 PyMem_Free(wstr); 3178 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3179 return NULL; 3180 } 3181 3182 if (surrogateescape) { 3183 /* locale encoding with surrogateescape */ 3184 char *str; 3185 3186 str = _Py_wchar2char(wstr, &error_pos); 3187 if (str == NULL) { 3188 if (error_pos == (size_t)-1) { 3189 PyErr_NoMemory(); 3190 PyMem_Free(wstr); 3191 return NULL; 3192 } 3193 else { 3194 goto encode_error; 3195 } 3196 } 3197 PyMem_Free(wstr); 3198 3199 bytes = PyBytes_FromString(str); 3200 PyMem_Free(str); 3201 } 3202 else { 3203 size_t len, len2; 3204 3205 len = wcstombs(NULL, wstr, 0); 3206 if (len == (size_t)-1) { 3207 error_pos = (size_t)-1; 3208 goto encode_error; 3209 } 3210 3211 bytes = PyBytes_FromStringAndSize(NULL, len); 3212 if (bytes == NULL) { 3213 PyMem_Free(wstr); 3214 return NULL; 3215 } 3216 3217 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3218 if (len2 == (size_t)-1 || len2 > len) { 3219 error_pos = (size_t)-1; 3220 goto encode_error; 3221 } 3222 PyMem_Free(wstr); 3223 } 3224 return bytes; 3225 3226encode_error: 3227 errmsg = strerror(errno); 3228 assert(errmsg != NULL); 3229 3230 if (error_pos == (size_t)-1) 3231 error_pos = wcstombs_errorpos(wstr); 
3232 3233 PyMem_Free(wstr); 3234 Py_XDECREF(bytes); 3235 3236 if (errmsg != NULL) { 3237 size_t errlen; 3238 wstr = _Py_char2wchar(errmsg, &errlen); 3239 if (wstr != NULL) { 3240 reason = PyUnicode_FromWideChar(wstr, errlen); 3241 PyMem_Free(wstr); 3242 } else 3243 errmsg = NULL; 3244 } 3245 if (errmsg == NULL) 3246 reason = PyUnicode_FromString( 3247 "wcstombs() encountered an unencodable " 3248 "wide character"); 3249 if (reason == NULL) 3250 return NULL; 3251 3252 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3253 "locale", unicode, 3254 (Py_ssize_t)error_pos, 3255 (Py_ssize_t)(error_pos+1), 3256 reason); 3257 Py_DECREF(reason); 3258 if (exc != NULL) { 3259 PyCodec_StrictErrors(exc); 3260 Py_XDECREF(exc); 3261 } 3262 return NULL; 3263} 3264 3265PyObject * 3266PyUnicode_EncodeFSDefault(PyObject *unicode) 3267{ 3268#ifdef HAVE_MBCS 3269 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 3270#elif defined(__APPLE__) 3271 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3272#else 3273 PyInterpreterState *interp = PyThreadState_GET()->interp; 3274 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3275 cannot use it to encode and decode filenames before it is loaded. Load 3276 the Python codec requires to encode at least its own filename. Use the C 3277 version of the locale codec until the codec registry is initialized and 3278 the Python codec is loaded. 3279 3280 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3281 cannot only rely on it: check also interp->fscodec_initialized for 3282 subinterpreters. 
*/ 3283 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3284 return PyUnicode_AsEncodedString(unicode, 3285 Py_FileSystemDefaultEncoding, 3286 "surrogateescape"); 3287 } 3288 else { 3289 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3290 } 3291#endif 3292} 3293 3294PyObject * 3295PyUnicode_AsEncodedString(PyObject *unicode, 3296 const char *encoding, 3297 const char *errors) 3298{ 3299 PyObject *v; 3300 char lower[11]; /* Enough for any encoding shortcut */ 3301 3302 if (!PyUnicode_Check(unicode)) { 3303 PyErr_BadArgument(); 3304 return NULL; 3305 } 3306 3307 /* Shortcuts for common default encodings */ 3308 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3309 if ((strcmp(lower, "utf-8") == 0) || 3310 (strcmp(lower, "utf8") == 0)) 3311 { 3312 if (errors == NULL || strcmp(errors, "strict") == 0) 3313 return _PyUnicode_AsUTF8String(unicode, NULL); 3314 else 3315 return _PyUnicode_AsUTF8String(unicode, errors); 3316 } 3317 else if ((strcmp(lower, "latin-1") == 0) || 3318 (strcmp(lower, "latin1") == 0) || 3319 (strcmp(lower, "iso-8859-1") == 0)) 3320 return _PyUnicode_AsLatin1String(unicode, errors); 3321#ifdef HAVE_MBCS 3322 else if (strcmp(lower, "mbcs") == 0) 3323 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3324#endif 3325 else if (strcmp(lower, "ascii") == 0) 3326 return _PyUnicode_AsASCIIString(unicode, errors); 3327 } 3328 3329 /* Encode via the codec registry */ 3330 v = PyCodec_Encode(unicode, encoding, errors); 3331 if (v == NULL) 3332 return NULL; 3333 3334 /* The normal path */ 3335 if (PyBytes_Check(v)) 3336 return v; 3337 3338 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3339 if (PyByteArray_Check(v)) { 3340 int error; 3341 PyObject *b; 3342 3343 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3344 "encoder %s returned bytearray instead of bytes", 3345 encoding); 3346 if (error) { 3347 Py_DECREF(v); 3348 return NULL; 3349 } 3350 3351 b = 
PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3352 Py_DECREF(v); 3353 return b; 3354 } 3355 3356 PyErr_Format(PyExc_TypeError, 3357 "encoder did not return a bytes object (type=%.400s)", 3358 Py_TYPE(v)->tp_name); 3359 Py_DECREF(v); 3360 return NULL; 3361} 3362 3363PyObject * 3364PyUnicode_AsEncodedUnicode(PyObject *unicode, 3365 const char *encoding, 3366 const char *errors) 3367{ 3368 PyObject *v; 3369 3370 if (!PyUnicode_Check(unicode)) { 3371 PyErr_BadArgument(); 3372 goto onError; 3373 } 3374 3375 if (encoding == NULL) 3376 encoding = PyUnicode_GetDefaultEncoding(); 3377 3378 /* Encode via the codec registry */ 3379 v = PyCodec_Encode(unicode, encoding, errors); 3380 if (v == NULL) 3381 goto onError; 3382 if (!PyUnicode_Check(v)) { 3383 PyErr_Format(PyExc_TypeError, 3384 "encoder did not return an str object (type=%.400s)", 3385 Py_TYPE(v)->tp_name); 3386 Py_DECREF(v); 3387 goto onError; 3388 } 3389 return v; 3390 3391 onError: 3392 return NULL; 3393} 3394 3395static size_t 3396mbstowcs_errorpos(const char *str, size_t len) 3397{ 3398#ifdef HAVE_MBRTOWC 3399 const char *start = str; 3400 mbstate_t mbs; 3401 size_t converted; 3402 wchar_t ch; 3403 3404 memset(&mbs, 0, sizeof mbs); 3405 while (len) 3406 { 3407 converted = mbrtowc(&ch, (char*)str, len, &mbs); 3408 if (converted == 0) 3409 /* Reached end of string */ 3410 break; 3411 if (converted == (size_t)-1 || converted == (size_t)-2) { 3412 /* Conversion error or incomplete character */ 3413 return str - start; 3414 } 3415 else { 3416 str += converted; 3417 len -= converted; 3418 } 3419 } 3420 /* failed to find the undecodable byte sequence */ 3421 return 0; 3422#endif 3423 return 0; 3424} 3425 3426PyObject* 3427PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3428 const char *errors) 3429{ 3430 wchar_t smallbuf[256]; 3431 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3432 wchar_t *wstr; 3433 size_t wlen, wlen2; 3434 PyObject *unicode; 3435 int surrogateescape; 3436 size_t 
error_pos; 3437 char *errmsg; 3438 PyObject *reason, *exc; 3439 3440 if (locale_error_handler(errors, &surrogateescape) < 0) 3441 return NULL; 3442 3443 if (str[len] != '\0' || len != strlen(str)) { 3444 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3445 return NULL; 3446 } 3447 3448 if (surrogateescape) 3449 { 3450 wstr = _Py_char2wchar(str, &wlen); 3451 if (wstr == NULL) { 3452 if (wlen == (size_t)-1) 3453 PyErr_NoMemory(); 3454 else 3455 PyErr_SetFromErrno(PyExc_OSError); 3456 return NULL; 3457 } 3458 3459 unicode = PyUnicode_FromWideChar(wstr, wlen); 3460 PyMem_Free(wstr); 3461 } 3462 else { 3463#ifndef HAVE_BROKEN_MBSTOWCS 3464 wlen = mbstowcs(NULL, str, 0); 3465#else 3466 wlen = len; 3467#endif 3468 if (wlen == (size_t)-1) 3469 goto decode_error; 3470 if (wlen+1 <= smallbuf_len) { 3471 wstr = smallbuf; 3472 } 3473 else { 3474 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) 3475 return PyErr_NoMemory(); 3476 3477 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); 3478 if (!wstr) 3479 return PyErr_NoMemory(); 3480 } 3481 3482 /* This shouldn't fail now */ 3483 wlen2 = mbstowcs(wstr, str, wlen+1); 3484 if (wlen2 == (size_t)-1) { 3485 if (wstr != smallbuf) 3486 PyMem_Free(wstr); 3487 goto decode_error; 3488 } 3489#ifdef HAVE_BROKEN_MBSTOWCS 3490 assert(wlen2 == wlen); 3491#endif 3492 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3493 if (wstr != smallbuf) 3494 PyMem_Free(wstr); 3495 } 3496 return unicode; 3497 3498decode_error: 3499 errmsg = strerror(errno); 3500 assert(errmsg != NULL); 3501 3502 error_pos = mbstowcs_errorpos(str, len); 3503 if (errmsg != NULL) { 3504 size_t errlen; 3505 wstr = _Py_char2wchar(errmsg, &errlen); 3506 if (wstr != NULL) { 3507 reason = PyUnicode_FromWideChar(wstr, errlen); 3508 PyMem_Free(wstr); 3509 } else 3510 errmsg = NULL; 3511 } 3512 if (errmsg == NULL) 3513 reason = PyUnicode_FromString( 3514 "mbstowcs() encountered an invalid multibyte sequence"); 3515 if (reason == NULL) 3516 return NULL; 3517 3518 exc = 
PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3519 "locale", str, len, 3520 (Py_ssize_t)error_pos, 3521 (Py_ssize_t)(error_pos+1), 3522 reason); 3523 Py_DECREF(reason); 3524 if (exc != NULL) { 3525 PyCodec_StrictErrors(exc); 3526 Py_XDECREF(exc); 3527 } 3528 return NULL; 3529} 3530 3531PyObject* 3532PyUnicode_DecodeLocale(const char *str, const char *errors) 3533{ 3534 Py_ssize_t size = (Py_ssize_t)strlen(str); 3535 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3536} 3537 3538 3539PyObject* 3540PyUnicode_DecodeFSDefault(const char *s) { 3541 Py_ssize_t size = (Py_ssize_t)strlen(s); 3542 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3543} 3544 3545PyObject* 3546PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3547{ 3548#ifdef HAVE_MBCS 3549 return PyUnicode_DecodeMBCS(s, size, NULL); 3550#elif defined(__APPLE__) 3551 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3552#else 3553 PyInterpreterState *interp = PyThreadState_GET()->interp; 3554 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3555 cannot use it to encode and decode filenames before it is loaded. Load 3556 the Python codec requires to encode at least its own filename. Use the C 3557 version of the locale codec until the codec registry is initialized and 3558 the Python codec is loaded. 3559 3560 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3561 cannot only rely on it: check also interp->fscodec_initialized for 3562 subinterpreters. 
*/ 3563 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3564 return PyUnicode_Decode(s, size, 3565 Py_FileSystemDefaultEncoding, 3566 "surrogateescape"); 3567 } 3568 else { 3569 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3570 } 3571#endif 3572} 3573 3574 3575int 3576_PyUnicode_HasNULChars(PyObject* str) 3577{ 3578 Py_ssize_t pos; 3579 3580 if (PyUnicode_READY(str) == -1) 3581 return -1; 3582 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str), 3583 PyUnicode_GET_LENGTH(str), '\0', 1); 3584 if (pos == -1) 3585 return 0; 3586 else 3587 return 1; 3588} 3589 3590int 3591PyUnicode_FSConverter(PyObject* arg, void* addr) 3592{ 3593 PyObject *output = NULL; 3594 Py_ssize_t size; 3595 void *data; 3596 if (arg == NULL) { 3597 Py_DECREF(*(PyObject**)addr); 3598 return 1; 3599 } 3600 if (PyBytes_Check(arg)) { 3601 output = arg; 3602 Py_INCREF(output); 3603 } 3604 else { 3605 arg = PyUnicode_FromObject(arg); 3606 if (!arg) 3607 return 0; 3608 output = PyUnicode_EncodeFSDefault(arg); 3609 Py_DECREF(arg); 3610 if (!output) 3611 return 0; 3612 if (!PyBytes_Check(output)) { 3613 Py_DECREF(output); 3614 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3615 return 0; 3616 } 3617 } 3618 size = PyBytes_GET_SIZE(output); 3619 data = PyBytes_AS_STRING(output); 3620 if (size != strlen(data)) { 3621 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3622 Py_DECREF(output); 3623 return 0; 3624 } 3625 *(PyObject**)addr = output; 3626 return Py_CLEANUP_SUPPORTED; 3627} 3628 3629 3630int 3631PyUnicode_FSDecoder(PyObject* arg, void* addr) 3632{ 3633 PyObject *output = NULL; 3634 if (arg == NULL) { 3635 Py_DECREF(*(PyObject**)addr); 3636 return 1; 3637 } 3638 if (PyUnicode_Check(arg)) { 3639 if (PyUnicode_READY(arg) == -1) 3640 return 0; 3641 output = arg; 3642 Py_INCREF(output); 3643 } 3644 else { 3645 arg = PyBytes_FromObject(arg); 3646 if (!arg) 3647 return 0; 3648 output = 
PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3649 PyBytes_GET_SIZE(arg)); 3650 Py_DECREF(arg); 3651 if (!output) 3652 return 0; 3653 if (!PyUnicode_Check(output)) { 3654 Py_DECREF(output); 3655 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3656 return 0; 3657 } 3658 } 3659 if (PyUnicode_READY(output) == -1) { 3660 Py_DECREF(output); 3661 return 0; 3662 } 3663 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3664 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3665 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3666 Py_DECREF(output); 3667 return 0; 3668 } 3669 *(PyObject**)addr = output; 3670 return Py_CLEANUP_SUPPORTED; 3671} 3672 3673 3674char* 3675PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3676{ 3677 PyObject *bytes; 3678 3679 if (!PyUnicode_Check(unicode)) { 3680 PyErr_BadArgument(); 3681 return NULL; 3682 } 3683 if (PyUnicode_READY(unicode) == -1) 3684 return NULL; 3685 3686 if (PyUnicode_UTF8(unicode) == NULL) { 3687 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3688 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3689 if (bytes == NULL) 3690 return NULL; 3691 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3692 if (_PyUnicode_UTF8(unicode) == NULL) { 3693 Py_DECREF(bytes); 3694 return NULL; 3695 } 3696 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3697 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3698 PyBytes_AS_STRING(bytes), 3699 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3700 Py_DECREF(bytes); 3701 } 3702 3703 if (psize) 3704 *psize = PyUnicode_UTF8_LENGTH(unicode); 3705 return PyUnicode_UTF8(unicode); 3706} 3707 3708char* 3709PyUnicode_AsUTF8(PyObject *unicode) 3710{ 3711 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3712} 3713 3714Py_UNICODE * 3715PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3716{ 3717 const unsigned char *one_byte; 3718#if SIZEOF_WCHAR_T == 4 3719 const Py_UCS2 *two_bytes; 3720#else 3721 const Py_UCS4 *four_bytes; 
3722 const Py_UCS4 *ucs4_end; 3723 Py_ssize_t num_surrogates; 3724#endif 3725 wchar_t *w; 3726 wchar_t *wchar_end; 3727 3728 if (!PyUnicode_Check(unicode)) { 3729 PyErr_BadArgument(); 3730 return NULL; 3731 } 3732 if (_PyUnicode_WSTR(unicode) == NULL) { 3733 /* Non-ASCII compact unicode object */ 3734 assert(_PyUnicode_KIND(unicode) != 0); 3735 assert(PyUnicode_IS_READY(unicode)); 3736 3737 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3738#if SIZEOF_WCHAR_T == 2 3739 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3740 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3741 num_surrogates = 0; 3742 3743 for (; four_bytes < ucs4_end; ++four_bytes) { 3744 if (*four_bytes > 0xFFFF) 3745 ++num_surrogates; 3746 } 3747 3748 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3749 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3750 if (!_PyUnicode_WSTR(unicode)) { 3751 PyErr_NoMemory(); 3752 return NULL; 3753 } 3754 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3755 3756 w = _PyUnicode_WSTR(unicode); 3757 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3758 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3759 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3760 if (*four_bytes > 0xFFFF) { 3761 assert(*four_bytes <= MAX_UNICODE); 3762 /* encode surrogate pair in this case */ 3763 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3764 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3765 } 3766 else 3767 *w = *four_bytes; 3768 3769 if (w > wchar_end) { 3770 assert(0 && "Miscalculated string end"); 3771 } 3772 } 3773 *w = 0; 3774#else 3775 /* sizeof(wchar_t) == 4 */ 3776 Py_FatalError("Impossible unicode object state, wstr and str " 3777 "should share memory already."); 3778 return NULL; 3779#endif 3780 } 3781 else { 3782 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3783 (_PyUnicode_LENGTH(unicode) + 1)); 3784 if (!_PyUnicode_WSTR(unicode)) { 3785 PyErr_NoMemory(); 3786 return NULL; 3787 } 
3788 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3789 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3790 w = _PyUnicode_WSTR(unicode); 3791 wchar_end = w + _PyUnicode_LENGTH(unicode); 3792 3793 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3794 one_byte = PyUnicode_1BYTE_DATA(unicode); 3795 for (; w < wchar_end; ++one_byte, ++w) 3796 *w = *one_byte; 3797 /* null-terminate the wstr */ 3798 *w = 0; 3799 } 3800 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3801#if SIZEOF_WCHAR_T == 4 3802 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3803 for (; w < wchar_end; ++two_bytes, ++w) 3804 *w = *two_bytes; 3805 /* null-terminate the wstr */ 3806 *w = 0; 3807#else 3808 /* sizeof(wchar_t) == 2 */ 3809 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3810 _PyUnicode_WSTR(unicode) = NULL; 3811 Py_FatalError("Impossible unicode object state, wstr " 3812 "and str should share memory already."); 3813 return NULL; 3814#endif 3815 } 3816 else { 3817 assert(0 && "This should never happen."); 3818 } 3819 } 3820 } 3821 if (size != NULL) 3822 *size = PyUnicode_WSTR_LENGTH(unicode); 3823 return _PyUnicode_WSTR(unicode); 3824} 3825 3826Py_UNICODE * 3827PyUnicode_AsUnicode(PyObject *unicode) 3828{ 3829 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3830} 3831 3832 3833Py_ssize_t 3834PyUnicode_GetSize(PyObject *unicode) 3835{ 3836 if (!PyUnicode_Check(unicode)) { 3837 PyErr_BadArgument(); 3838 goto onError; 3839 } 3840 return PyUnicode_GET_SIZE(unicode); 3841 3842 onError: 3843 return -1; 3844} 3845 3846Py_ssize_t 3847PyUnicode_GetLength(PyObject *unicode) 3848{ 3849 if (!PyUnicode_Check(unicode)) { 3850 PyErr_BadArgument(); 3851 return -1; 3852 } 3853 if (PyUnicode_READY(unicode) == -1) 3854 return -1; 3855 return PyUnicode_GET_LENGTH(unicode); 3856} 3857 3858Py_UCS4 3859PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3860{ 3861 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3862 PyErr_BadArgument(); 3863 return (Py_UCS4)-1; 3864 } 
3865 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3866 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3867 return (Py_UCS4)-1; 3868 } 3869 return PyUnicode_READ_CHAR(unicode, index); 3870} 3871 3872int 3873PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3874{ 3875 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3876 PyErr_BadArgument(); 3877 return -1; 3878 } 3879 assert(PyUnicode_IS_READY(unicode)); 3880 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3881 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3882 return -1; 3883 } 3884 if (unicode_check_modifiable(unicode)) 3885 return -1; 3886 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 3887 PyErr_SetString(PyExc_ValueError, "character out of range"); 3888 return -1; 3889 } 3890 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3891 index, ch); 3892 return 0; 3893} 3894 3895const char * 3896PyUnicode_GetDefaultEncoding(void) 3897{ 3898 return "utf-8"; 3899} 3900 3901/* create or adjust a UnicodeDecodeError */ 3902static void 3903make_decode_exception(PyObject **exceptionObject, 3904 const char *encoding, 3905 const char *input, Py_ssize_t length, 3906 Py_ssize_t startpos, Py_ssize_t endpos, 3907 const char *reason) 3908{ 3909 if (*exceptionObject == NULL) { 3910 *exceptionObject = PyUnicodeDecodeError_Create( 3911 encoding, input, length, startpos, endpos, reason); 3912 } 3913 else { 3914 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3915 goto onError; 3916 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3917 goto onError; 3918 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3919 goto onError; 3920 } 3921 return; 3922 3923onError: 3924 Py_DECREF(*exceptionObject); 3925 *exceptionObject = NULL; 3926} 3927 3928/* error handling callback helper: 3929 build arguments, call the callback and check the arguments, 3930 if no exception occurred, copy the replacement to 
the output 3931 and adjust various state variables. 3932 return 0 on success, -1 on error 3933*/ 3934 3935static int 3936unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3937 const char *encoding, const char *reason, 3938 const char **input, const char **inend, Py_ssize_t *startinpos, 3939 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3940 PyObject **output, Py_ssize_t *outpos) 3941{ 3942 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3943 3944 PyObject *restuple = NULL; 3945 PyObject *repunicode = NULL; 3946 Py_ssize_t outsize; 3947 Py_ssize_t insize; 3948 Py_ssize_t requiredsize; 3949 Py_ssize_t newpos; 3950 PyObject *inputobj = NULL; 3951 int res = -1; 3952 3953 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) 3954 outsize = PyUnicode_GET_LENGTH(*output); 3955 else 3956 outsize = _PyUnicode_WSTR_LENGTH(*output); 3957 3958 if (*errorHandler == NULL) { 3959 *errorHandler = PyCodec_LookupError(errors); 3960 if (*errorHandler == NULL) 3961 goto onError; 3962 } 3963 3964 make_decode_exception(exceptionObject, 3965 encoding, 3966 *input, *inend - *input, 3967 *startinpos, *endinpos, 3968 reason); 3969 if (*exceptionObject == NULL) 3970 goto onError; 3971 3972 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3973 if (restuple == NULL) 3974 goto onError; 3975 if (!PyTuple_Check(restuple)) { 3976 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3977 goto onError; 3978 } 3979 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3980 goto onError; 3981 if (PyUnicode_READY(repunicode) == -1) 3982 goto onError; 3983 3984 /* Copy back the bytes variables, which might have been modified by the 3985 callback */ 3986 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3987 if (!inputobj) 3988 goto onError; 3989 if (!PyBytes_Check(inputobj)) { 3990 PyErr_Format(PyExc_TypeError, "exception attribute object must be 
bytes"); 3991 } 3992 *input = PyBytes_AS_STRING(inputobj); 3993 insize = PyBytes_GET_SIZE(inputobj); 3994 *inend = *input + insize; 3995 /* we can DECREF safely, as the exception has another reference, 3996 so the object won't go away. */ 3997 Py_DECREF(inputobj); 3998 3999 if (newpos<0) 4000 newpos = insize+newpos; 4001 if (newpos<0 || newpos>insize) { 4002 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4003 goto onError; 4004 } 4005 4006 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) { 4007 /* need more space? (at least enough for what we 4008 have+the replacement+the rest of the string (starting 4009 at the new input position), so we won't have to check space 4010 when there are no errors in the rest of the string) */ 4011 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode); 4012 requiredsize = *outpos + replen + insize-newpos; 4013 if (requiredsize > outsize) { 4014 if (requiredsize<2*outsize) 4015 requiredsize = 2*outsize; 4016 if (unicode_resize(output, requiredsize) < 0) 4017 goto onError; 4018 } 4019 if (unicode_widen(output, *outpos, 4020 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0) 4021 goto onError; 4022 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen); 4023 *outpos += replen; 4024 } 4025 else { 4026 wchar_t *repwstr; 4027 Py_ssize_t repwlen; 4028 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4029 if (repwstr == NULL) 4030 goto onError; 4031 /* need more space? 
(at least enough for what we 4032 have+the replacement+the rest of the string (starting 4033 at the new input position), so we won't have to check space 4034 when there are no errors in the rest of the string) */ 4035 requiredsize = *outpos + repwlen + insize-newpos; 4036 if (requiredsize > outsize) { 4037 if (requiredsize < 2*outsize) 4038 requiredsize = 2*outsize; 4039 if (unicode_resize(output, requiredsize) < 0) 4040 goto onError; 4041 } 4042 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4043 *outpos += repwlen; 4044 } 4045 *endinpos = newpos; 4046 *inptr = *input + newpos; 4047 4048 /* we made it! */ 4049 res = 0; 4050 4051 onError: 4052 Py_XDECREF(restuple); 4053 return res; 4054} 4055 4056/* --- UTF-7 Codec -------------------------------------------------------- */ 4057 4058/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4059 4060/* Three simple macros defining base-64. */ 4061 4062/* Is c a base-64 character? */ 4063 4064#define IS_BASE64(c) \ 4065 (((c) >= 'A' && (c) <= 'Z') || \ 4066 ((c) >= 'a' && (c) <= 'z') || \ 4067 ((c) >= '0' && (c) <= '9') || \ 4068 (c) == '+' || (c) == '/') 4069 4070/* given that c is a base-64 character, what is its base-64 value? */ 4071 4072#define FROM_BASE64(c) \ 4073 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4074 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4075 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4076 (c) == '+' ? 62 : 63) 4077 4078/* What is the base-64 character of the bottom 6 bits of n? */ 4079 4080#define TO_BASE64(n) \ 4081 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4082 4083/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4084 * decoded as itself. We are permissive on decoding; the only ASCII 4085 * byte not decoding to itself is the + which begins a base64 4086 * string. 
*/ 4087 4088#define DECODE_DIRECT(c) \ 4089 ((c) <= 127 && (c) != '+') 4090 4091/* The UTF-7 encoder treats ASCII characters differently according to 4092 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4093 * the above). See RFC2152. This array identifies these different 4094 * sets: 4095 * 0 : "Set D" 4096 * alphanumeric and '(),-./:? 4097 * 1 : "Set O" 4098 * !"#$%&*;<=>@[]^_`{|} 4099 * 2 : "whitespace" 4100 * ht nl cr sp 4101 * 3 : special (must be base64 encoded) 4102 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 4103 */ 4104 4105static 4106char utf7_category[128] = { 4107/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 4108 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4109/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 4110 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4111/* sp ! " # $ % & ' ( ) * + , - . / */ 4112 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 4113/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 4114 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4115/* @ A B C D E F G H I J K L M N O */ 4116 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4117/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4119/* ` a b c d e f g h i j k l m n o */ 4120 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4121/* p q r s t u v w x y z { | } ~ del */ 4122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4123}; 4124 4125/* ENCODE_DIRECT: this character should be encoded as itself. The 4126 * answer depends on whether we are encoding set O as itself, and also 4127 * on whether we are encoding whitespace as itself. RFC2152 makes it 4128 * clear that the answers to these questions vary between 4129 * applications, so this code needs to be flexible. 
*/ 4130 4131#define ENCODE_DIRECT(c, directO, directWS) \ 4132 ((c) < 128 && (c) > 0 && \ 4133 ((utf7_category[(c)] == 0) || \ 4134 (directWS && (utf7_category[(c)] == 2)) || \ 4135 (directO && (utf7_category[(c)] == 1)))) 4136 4137PyObject * 4138PyUnicode_DecodeUTF7(const char *s, 4139 Py_ssize_t size, 4140 const char *errors) 4141{ 4142 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 4143} 4144 4145/* The decoder. The only state we preserve is our read position, 4146 * i.e. how many characters we have consumed. So if we end in the 4147 * middle of a shift sequence we have to back off the read position 4148 * and the output to the beginning of the sequence, otherwise we lose 4149 * all the shift state (seen bits, number of bits seen, high 4150 * surrogate). */ 4151 4152PyObject * 4153PyUnicode_DecodeUTF7Stateful(const char *s, 4154 Py_ssize_t size, 4155 const char *errors, 4156 Py_ssize_t *consumed) 4157{ 4158 const char *starts = s; 4159 Py_ssize_t startinpos; 4160 Py_ssize_t endinpos; 4161 Py_ssize_t outpos; 4162 const char *e; 4163 PyObject *unicode; 4164 const char *errmsg = ""; 4165 int inShift = 0; 4166 Py_ssize_t shiftOutStart; 4167 unsigned int base64bits = 0; 4168 unsigned long base64buffer = 0; 4169 Py_UCS4 surrogate = 0; 4170 PyObject *errorHandler = NULL; 4171 PyObject *exc = NULL; 4172 4173 /* Start off assuming it's all ASCII. Widen later as necessary. 
*/ 4174 unicode = PyUnicode_New(size, 127); 4175 if (!unicode) 4176 return NULL; 4177 if (size == 0) { 4178 if (consumed) 4179 *consumed = 0; 4180 return unicode; 4181 } 4182 4183 shiftOutStart = outpos = 0; 4184 e = s + size; 4185 4186 while (s < e) { 4187 Py_UCS4 ch; 4188 restart: 4189 ch = (unsigned char) *s; 4190 4191 if (inShift) { /* in a base-64 section */ 4192 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4193 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4194 base64bits += 6; 4195 s++; 4196 if (base64bits >= 16) { 4197 /* we have enough bits for a UTF-16 value */ 4198 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4199 base64bits -= 16; 4200 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4201 if (surrogate) { 4202 /* expecting a second surrogate */ 4203 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4204 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4205 if (unicode_putchar(&unicode, &outpos, ch2) < 0) 4206 goto onError; 4207 surrogate = 0; 4208 continue; 4209 } 4210 else { 4211 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4212 goto onError; 4213 surrogate = 0; 4214 } 4215 } 4216 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4217 /* first surrogate */ 4218 surrogate = outCh; 4219 } 4220 else { 4221 if (unicode_putchar(&unicode, &outpos, outCh) < 0) 4222 goto onError; 4223 } 4224 } 4225 } 4226 else { /* now leaving a base-64 section */ 4227 inShift = 0; 4228 s++; 4229 if (surrogate) { 4230 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4231 goto onError; 4232 surrogate = 0; 4233 } 4234 if (base64bits > 0) { /* left-over bits */ 4235 if (base64bits >= 6) { 4236 /* We've seen at least one base-64 character */ 4237 errmsg = "partial character in shift sequence"; 4238 goto utf7Error; 4239 } 4240 else { 4241 /* Some bits remain; they should be zero */ 4242 if (base64buffer != 0) { 4243 errmsg = "non-zero padding bits in shift sequence"; 4244 goto utf7Error; 4245 } 4246 } 4247 } 4248 if (ch != 
'-') { 4249 /* '-' is absorbed; other terminating 4250 characters are preserved */ 4251 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4252 goto onError; 4253 } 4254 } 4255 } 4256 else if ( ch == '+' ) { 4257 startinpos = s-starts; 4258 s++; /* consume '+' */ 4259 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4260 s++; 4261 if (unicode_putchar(&unicode, &outpos, '+') < 0) 4262 goto onError; 4263 } 4264 else { /* begin base64-encoded section */ 4265 inShift = 1; 4266 shiftOutStart = outpos; 4267 base64bits = 0; 4268 } 4269 } 4270 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4271 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4272 goto onError; 4273 s++; 4274 } 4275 else { 4276 startinpos = s-starts; 4277 s++; 4278 errmsg = "unexpected special character"; 4279 goto utf7Error; 4280 } 4281 continue; 4282utf7Error: 4283 endinpos = s-starts; 4284 if (unicode_decode_call_errorhandler( 4285 errors, &errorHandler, 4286 "utf7", errmsg, 4287 &starts, &e, &startinpos, &endinpos, &exc, &s, 4288 &unicode, &outpos)) 4289 goto onError; 4290 } 4291 4292 /* end of string */ 4293 4294 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4295 /* if we're in an inconsistent state, that's an error */ 4296 if (surrogate || 4297 (base64bits >= 6) || 4298 (base64bits > 0 && base64buffer != 0)) { 4299 endinpos = size; 4300 if (unicode_decode_call_errorhandler( 4301 errors, &errorHandler, 4302 "utf7", "unterminated shift sequence", 4303 &starts, &e, &startinpos, &endinpos, &exc, &s, 4304 &unicode, &outpos)) 4305 goto onError; 4306 if (s < e) 4307 goto restart; 4308 } 4309 } 4310 4311 /* return state */ 4312 if (consumed) { 4313 if (inShift) { 4314 outpos = shiftOutStart; /* back off output */ 4315 *consumed = startinpos; 4316 } 4317 else { 4318 *consumed = s-starts; 4319 } 4320 } 4321 4322 if (unicode_resize(&unicode, outpos) < 0) 4323 goto onError; 4324 4325 Py_XDECREF(errorHandler); 4326 Py_XDECREF(exc); 4327 return unicode_result(unicode); 4328 
4329 onError: 4330 Py_XDECREF(errorHandler); 4331 Py_XDECREF(exc); 4332 Py_DECREF(unicode); 4333 return NULL; 4334} 4335 4336 4337PyObject * 4338_PyUnicode_EncodeUTF7(PyObject *str, 4339 int base64SetO, 4340 int base64WhiteSpace, 4341 const char *errors) 4342{ 4343 int kind; 4344 void *data; 4345 Py_ssize_t len; 4346 PyObject *v; 4347 int inShift = 0; 4348 Py_ssize_t i; 4349 unsigned int base64bits = 0; 4350 unsigned long base64buffer = 0; 4351 char * out; 4352 char * start; 4353 4354 if (PyUnicode_READY(str) == -1) 4355 return NULL; 4356 kind = PyUnicode_KIND(str); 4357 data = PyUnicode_DATA(str); 4358 len = PyUnicode_GET_LENGTH(str); 4359 4360 if (len == 0) 4361 return PyBytes_FromStringAndSize(NULL, 0); 4362 4363 /* It might be possible to tighten this worst case */ 4364 if (len > PY_SSIZE_T_MAX / 8) 4365 return PyErr_NoMemory(); 4366 v = PyBytes_FromStringAndSize(NULL, len * 8); 4367 if (v == NULL) 4368 return NULL; 4369 4370 start = out = PyBytes_AS_STRING(v); 4371 for (i = 0; i < len; ++i) { 4372 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4373 4374 if (inShift) { 4375 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4376 /* shifting out */ 4377 if (base64bits) { /* output remaining bits */ 4378 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4379 base64buffer = 0; 4380 base64bits = 0; 4381 } 4382 inShift = 0; 4383 /* Characters not in the BASE64 set implicitly unshift the sequence 4384 so no '-' is required, except if the character is itself a '-' */ 4385 if (IS_BASE64(ch) || ch == '-') { 4386 *out++ = '-'; 4387 } 4388 *out++ = (char) ch; 4389 } 4390 else { 4391 goto encode_char; 4392 } 4393 } 4394 else { /* not in a shift sequence */ 4395 if (ch == '+') { 4396 *out++ = '+'; 4397 *out++ = '-'; 4398 } 4399 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4400 *out++ = (char) ch; 4401 } 4402 else { 4403 *out++ = '+'; 4404 inShift = 1; 4405 goto encode_char; 4406 } 4407 } 4408 continue; 4409encode_char: 4410 if (ch >= 0x10000) { 4411 
assert(ch <= MAX_UNICODE); 4412 4413 /* code first surrogate */ 4414 base64bits += 16; 4415 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4416 while (base64bits >= 6) { 4417 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4418 base64bits -= 6; 4419 } 4420 /* prepare second surrogate */ 4421 ch = Py_UNICODE_LOW_SURROGATE(ch); 4422 } 4423 base64bits += 16; 4424 base64buffer = (base64buffer << 16) | ch; 4425 while (base64bits >= 6) { 4426 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4427 base64bits -= 6; 4428 } 4429 } 4430 if (base64bits) 4431 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4432 if (inShift) 4433 *out++ = '-'; 4434 if (_PyBytes_Resize(&v, out - start) < 0) 4435 return NULL; 4436 return v; 4437} 4438PyObject * 4439PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4440 Py_ssize_t size, 4441 int base64SetO, 4442 int base64WhiteSpace, 4443 const char *errors) 4444{ 4445 PyObject *result; 4446 PyObject *tmp = PyUnicode_FromUnicode(s, size); 4447 if (tmp == NULL) 4448 return NULL; 4449 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4450 base64WhiteSpace, errors); 4451 Py_DECREF(tmp); 4452 return result; 4453} 4454 4455#undef IS_BASE64 4456#undef FROM_BASE64 4457#undef TO_BASE64 4458#undef DECODE_DIRECT 4459#undef ENCODE_DIRECT 4460 4461/* --- UTF-8 Codec -------------------------------------------------------- */ 4462 4463PyObject * 4464PyUnicode_DecodeUTF8(const char *s, 4465 Py_ssize_t size, 4466 const char *errors) 4467{ 4468 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4469} 4470 4471#include "stringlib/asciilib.h" 4472#include "stringlib/codecs.h" 4473#include "stringlib/undef.h" 4474 4475#include "stringlib/ucs1lib.h" 4476#include "stringlib/codecs.h" 4477#include "stringlib/undef.h" 4478 4479#include "stringlib/ucs2lib.h" 4480#include "stringlib/codecs.h" 4481#include "stringlib/undef.h" 4482 4483#include "stringlib/ucs4lib.h" 4484#include "stringlib/codecs.h" 4485#include "stringlib/undef.h" 4486 4487/* 
Mask to quickly check whether a C 'long' contains a 4488 non-ASCII, UTF8-encoded char. */ 4489#if (SIZEOF_LONG == 8) 4490# define ASCII_CHAR_MASK 0x8080808080808080UL 4491#elif (SIZEOF_LONG == 4) 4492# define ASCII_CHAR_MASK 0x80808080UL 4493#else 4494# error C 'long' size should be either 4 or 8! 4495#endif 4496 4497static Py_ssize_t 4498ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 4499{ 4500 const char *p = start; 4501 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); 4502 4503#if SIZEOF_LONG <= SIZEOF_VOID_P 4504 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); 4505 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4506 /* Fast path, see in STRINGLIB(utf8_decode) for 4507 an explanation. */ 4508 /* Help register allocation */ 4509 register const char *_p = p; 4510 register Py_UCS1 * q = dest; 4511 while (_p < aligned_end) { 4512 unsigned long value = *(const unsigned long *) _p; 4513 if (value & ASCII_CHAR_MASK) 4514 break; 4515 *((unsigned long *)q) = value; 4516 _p += SIZEOF_LONG; 4517 q += SIZEOF_LONG; 4518 } 4519 p = _p; 4520 while (p < end) { 4521 if ((unsigned char)*p & 0x80) 4522 break; 4523 *q++ = *p++; 4524 } 4525 return p - start; 4526 } 4527#endif 4528 while (p < end) { 4529 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4530 for an explanation. 
*/ 4531 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4532 /* Help register allocation */ 4533 register const char *_p = p; 4534 while (_p < aligned_end) { 4535 unsigned long value = *(unsigned long *) _p; 4536 if (value & ASCII_CHAR_MASK) 4537 break; 4538 _p += SIZEOF_LONG; 4539 } 4540 p = _p; 4541 if (_p == end) 4542 break; 4543 } 4544 if ((unsigned char)*p & 0x80) 4545 break; 4546 ++p; 4547 } 4548 memcpy(dest, start, p - start); 4549 return p - start; 4550} 4551 4552PyObject * 4553PyUnicode_DecodeUTF8Stateful(const char *s, 4554 Py_ssize_t size, 4555 const char *errors, 4556 Py_ssize_t *consumed) 4557{ 4558 PyObject *unicode; 4559 const char *starts = s; 4560 const char *end = s + size; 4561 Py_ssize_t outpos; 4562 4563 Py_ssize_t startinpos; 4564 Py_ssize_t endinpos; 4565 const char *errmsg = ""; 4566 PyObject *errorHandler = NULL; 4567 PyObject *exc = NULL; 4568 4569 if (size == 0) { 4570 if (consumed) 4571 *consumed = 0; 4572 Py_INCREF(unicode_empty); 4573 return unicode_empty; 4574 } 4575 4576 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4577 if (size == 1 && (unsigned char)s[0] < 128) { 4578 if (consumed) 4579 *consumed = 1; 4580 return get_latin1_char((unsigned char)s[0]); 4581 } 4582 4583 unicode = PyUnicode_New(size, 127); 4584 if (!unicode) 4585 return NULL; 4586 4587 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode)); 4588 s += outpos; 4589 while (s < end) { 4590 Py_UCS4 ch; 4591 int kind = PyUnicode_KIND(unicode); 4592 if (kind == PyUnicode_1BYTE_KIND) { 4593 if (PyUnicode_IS_ASCII(unicode)) 4594 ch = asciilib_utf8_decode(&s, end, 4595 PyUnicode_1BYTE_DATA(unicode), &outpos); 4596 else 4597 ch = ucs1lib_utf8_decode(&s, end, 4598 PyUnicode_1BYTE_DATA(unicode), &outpos); 4599 } else if (kind == PyUnicode_2BYTE_KIND) { 4600 ch = ucs2lib_utf8_decode(&s, end, 4601 PyUnicode_2BYTE_DATA(unicode), &outpos); 4602 } else { 4603 assert(kind == PyUnicode_4BYTE_KIND); 4604 ch = ucs4lib_utf8_decode(&s, end, 4605 PyUnicode_4BYTE_DATA(unicode), &outpos); 4606 } 4607 4608 switch (ch) { 4609 case 0: 4610 if (s == end || consumed) 4611 goto End; 4612 errmsg = "unexpected end of data"; 4613 startinpos = s - starts; 4614 endinpos = startinpos + 1; 4615 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80) 4616 endinpos++; 4617 break; 4618 case 1: 4619 errmsg = "invalid start byte"; 4620 startinpos = s - starts; 4621 endinpos = startinpos + 1; 4622 break; 4623 case 2: 4624 errmsg = "invalid continuation byte"; 4625 startinpos = s - starts; 4626 endinpos = startinpos + 1; 4627 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80) 4628 endinpos++; 4629 break; 4630 default: 4631 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4632 goto onError; 4633 continue; 4634 } 4635 4636 if (unicode_decode_call_errorhandler( 4637 errors, &errorHandler, 4638 "utf-8", errmsg, 4639 &starts, &end, &startinpos, &endinpos, &exc, &s, 4640 &unicode, &outpos)) 4641 goto onError; 4642 } 4643 4644End: 4645 if (unicode_resize(&unicode, outpos) < 0) 4646 goto onError; 4647 4648 if (consumed) 4649 *consumed = s - 
starts; 4650 4651 Py_XDECREF(errorHandler); 4652 Py_XDECREF(exc); 4653 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4654 return unicode; 4655 4656onError: 4657 Py_XDECREF(errorHandler); 4658 Py_XDECREF(exc); 4659 Py_XDECREF(unicode); 4660 return NULL; 4661} 4662 4663#ifdef __APPLE__ 4664 4665/* Simplified UTF-8 decoder using surrogateescape error handler, 4666 used to decode the command line arguments on Mac OS X. */ 4667 4668wchar_t* 4669_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4670{ 4671 const char *e; 4672 wchar_t *unicode; 4673 Py_ssize_t outpos; 4674 4675 /* Note: size will always be longer than the resulting Unicode 4676 character count */ 4677 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4678 PyErr_NoMemory(); 4679 return NULL; 4680 } 4681 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4682 if (!unicode) 4683 return NULL; 4684 4685 /* Unpack UTF-8 encoded data */ 4686 e = s + size; 4687 outpos = 0; 4688 while (s < e) { 4689 Py_UCS4 ch; 4690#if SIZEOF_WCHAR_T == 4 4691 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4692#else 4693 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 4694#endif 4695 if (ch > 0xFF) { 4696#if SIZEOF_WCHAR_T == 4 4697 assert(0); 4698#else 4699 assert(Py_UNICODE_IS_SURROGATE(ch)); 4700 /* compute and append the two surrogates: */ 4701 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 4702 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 4703#endif 4704 } 4705 else { 4706 if (!ch && s == e) 4707 break; 4708 /* surrogateescape */ 4709 unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 4710 } 4711 } 4712 unicode[outpos] = L'\0'; 4713 return unicode; 4714} 4715 4716#endif /* __APPLE__ */ 4717 4718/* Primary internal function which creates utf8 encoded bytes objects. 4719 4720 Allocation strategy: if the string is short, convert into a stack buffer 4721 and allocate exactly as much space needed at the end. 
Else allocate the 4722 maximum possible needed (4 result bytes per Unicode character), and return 4723 the excess memory at the end. 4724*/ 4725PyObject * 4726_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4727{ 4728 enum PyUnicode_Kind kind; 4729 void *data; 4730 Py_ssize_t size; 4731 4732 if (!PyUnicode_Check(unicode)) { 4733 PyErr_BadArgument(); 4734 return NULL; 4735 } 4736 4737 if (PyUnicode_READY(unicode) == -1) 4738 return NULL; 4739 4740 if (PyUnicode_UTF8(unicode)) 4741 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4742 PyUnicode_UTF8_LENGTH(unicode)); 4743 4744 kind = PyUnicode_KIND(unicode); 4745 data = PyUnicode_DATA(unicode); 4746 size = PyUnicode_GET_LENGTH(unicode); 4747 4748 switch (kind) { 4749 default: 4750 assert(0); 4751 case PyUnicode_1BYTE_KIND: 4752 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4753 assert(!PyUnicode_IS_ASCII(unicode)); 4754 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4755 case PyUnicode_2BYTE_KIND: 4756 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4757 case PyUnicode_4BYTE_KIND: 4758 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4759 } 4760} 4761 4762PyObject * 4763PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4764 Py_ssize_t size, 4765 const char *errors) 4766{ 4767 PyObject *v, *unicode; 4768 4769 unicode = PyUnicode_FromUnicode(s, size); 4770 if (unicode == NULL) 4771 return NULL; 4772 v = _PyUnicode_AsUTF8String(unicode, errors); 4773 Py_DECREF(unicode); 4774 return v; 4775} 4776 4777PyObject * 4778PyUnicode_AsUTF8String(PyObject *unicode) 4779{ 4780 return _PyUnicode_AsUTF8String(unicode, NULL); 4781} 4782 4783/* --- UTF-32 Codec ------------------------------------------------------- */ 4784 4785PyObject * 4786PyUnicode_DecodeUTF32(const char *s, 4787 Py_ssize_t size, 4788 const char *errors, 4789 int *byteorder) 4790{ 4791 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4792} 4793 4794PyObject * 
4795PyUnicode_DecodeUTF32Stateful(const char *s, 4796 Py_ssize_t size, 4797 const char *errors, 4798 int *byteorder, 4799 Py_ssize_t *consumed) 4800{ 4801 const char *starts = s; 4802 Py_ssize_t startinpos; 4803 Py_ssize_t endinpos; 4804 Py_ssize_t outpos; 4805 PyObject *unicode; 4806 const unsigned char *q, *e; 4807 int bo = 0; /* assume native ordering by default */ 4808 const char *errmsg = ""; 4809 /* Offsets from q for retrieving bytes in the right order. */ 4810#if PY_LITTLE_ENDIAN 4811 int iorder[] = {0, 1, 2, 3}; 4812#else 4813 int iorder[] = {3, 2, 1, 0}; 4814#endif 4815 PyObject *errorHandler = NULL; 4816 PyObject *exc = NULL; 4817 4818 q = (unsigned char *)s; 4819 e = q + size; 4820 4821 if (byteorder) 4822 bo = *byteorder; 4823 4824 /* Check for BOM marks (U+FEFF) in the input and adjust current 4825 byte order setting accordingly. In native mode, the leading BOM 4826 mark is skipped, in all other modes, it is copied to the output 4827 stream as-is (giving a ZWNBSP character). 
*/ 4828 if (bo == 0) { 4829 if (size >= 4) { 4830 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4831 (q[iorder[1]] << 8) | q[iorder[0]]; 4832#if PY_LITTLE_ENDIAN 4833 if (bom == 0x0000FEFF) { 4834 q += 4; 4835 bo = -1; 4836 } 4837 else if (bom == 0xFFFE0000) { 4838 q += 4; 4839 bo = 1; 4840 } 4841#else 4842 if (bom == 0x0000FEFF) { 4843 q += 4; 4844 bo = 1; 4845 } 4846 else if (bom == 0xFFFE0000) { 4847 q += 4; 4848 bo = -1; 4849 } 4850#endif 4851 } 4852 } 4853 4854 if (bo == -1) { 4855 /* force LE */ 4856 iorder[0] = 0; 4857 iorder[1] = 1; 4858 iorder[2] = 2; 4859 iorder[3] = 3; 4860 } 4861 else if (bo == 1) { 4862 /* force BE */ 4863 iorder[0] = 3; 4864 iorder[1] = 2; 4865 iorder[2] = 1; 4866 iorder[3] = 0; 4867 } 4868 4869 /* This might be one to much, because of a BOM */ 4870 unicode = PyUnicode_New((size+3)/4, 127); 4871 if (!unicode) 4872 return NULL; 4873 if (size == 0) 4874 return unicode; 4875 outpos = 0; 4876 4877 while (q < e) { 4878 Py_UCS4 ch; 4879 /* remaining bytes at the end? 
(size should be divisible by 4) */ 4880 if (e-q<4) { 4881 if (consumed) 4882 break; 4883 errmsg = "truncated data"; 4884 startinpos = ((const char *)q)-starts; 4885 endinpos = ((const char *)e)-starts; 4886 goto utf32Error; 4887 /* The remaining input chars are ignored if the callback 4888 chooses to skip the input */ 4889 } 4890 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4891 (q[iorder[1]] << 8) | q[iorder[0]]; 4892 4893 if (ch >= 0x110000) 4894 { 4895 errmsg = "codepoint not in range(0x110000)"; 4896 startinpos = ((const char *)q)-starts; 4897 endinpos = startinpos+4; 4898 goto utf32Error; 4899 } 4900 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4901 goto onError; 4902 q += 4; 4903 continue; 4904 utf32Error: 4905 if (unicode_decode_call_errorhandler( 4906 errors, &errorHandler, 4907 "utf32", errmsg, 4908 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 4909 &unicode, &outpos)) 4910 goto onError; 4911 } 4912 4913 if (byteorder) 4914 *byteorder = bo; 4915 4916 if (consumed) 4917 *consumed = (const char *)q-starts; 4918 4919 /* Adjust length */ 4920 if (unicode_resize(&unicode, outpos) < 0) 4921 goto onError; 4922 4923 Py_XDECREF(errorHandler); 4924 Py_XDECREF(exc); 4925 return unicode_result(unicode); 4926 4927 onError: 4928 Py_DECREF(unicode); 4929 Py_XDECREF(errorHandler); 4930 Py_XDECREF(exc); 4931 return NULL; 4932} 4933 4934PyObject * 4935_PyUnicode_EncodeUTF32(PyObject *str, 4936 const char *errors, 4937 int byteorder) 4938{ 4939 int kind; 4940 void *data; 4941 Py_ssize_t len; 4942 PyObject *v; 4943 unsigned char *p; 4944 Py_ssize_t nsize, i; 4945 /* Offsets from p for storing byte pairs in the right order. 
*/ 4946#if PY_LITTLE_ENDIAN 4947 int iorder[] = {0, 1, 2, 3}; 4948#else 4949 int iorder[] = {3, 2, 1, 0}; 4950#endif 4951 4952#define STORECHAR(CH) \ 4953 do { \ 4954 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 4955 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 4956 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 4957 p[iorder[0]] = (CH) & 0xff; \ 4958 p += 4; \ 4959 } while(0) 4960 4961 if (!PyUnicode_Check(str)) { 4962 PyErr_BadArgument(); 4963 return NULL; 4964 } 4965 if (PyUnicode_READY(str) == -1) 4966 return NULL; 4967 kind = PyUnicode_KIND(str); 4968 data = PyUnicode_DATA(str); 4969 len = PyUnicode_GET_LENGTH(str); 4970 4971 nsize = len + (byteorder == 0); 4972 if (nsize > PY_SSIZE_T_MAX / 4) 4973 return PyErr_NoMemory(); 4974 v = PyBytes_FromStringAndSize(NULL, nsize * 4); 4975 if (v == NULL) 4976 return NULL; 4977 4978 p = (unsigned char *)PyBytes_AS_STRING(v); 4979 if (byteorder == 0) 4980 STORECHAR(0xFEFF); 4981 if (len == 0) 4982 goto done; 4983 4984 if (byteorder == -1) { 4985 /* force LE */ 4986 iorder[0] = 0; 4987 iorder[1] = 1; 4988 iorder[2] = 2; 4989 iorder[3] = 3; 4990 } 4991 else if (byteorder == 1) { 4992 /* force BE */ 4993 iorder[0] = 3; 4994 iorder[1] = 2; 4995 iorder[2] = 1; 4996 iorder[3] = 0; 4997 } 4998 4999 for (i = 0; i < len; i++) 5000 STORECHAR(PyUnicode_READ(kind, data, i)); 5001 5002 done: 5003 return v; 5004#undef STORECHAR 5005} 5006 5007PyObject * 5008PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5009 Py_ssize_t size, 5010 const char *errors, 5011 int byteorder) 5012{ 5013 PyObject *result; 5014 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5015 if (tmp == NULL) 5016 return NULL; 5017 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5018 Py_DECREF(tmp); 5019 return result; 5020} 5021 5022PyObject * 5023PyUnicode_AsUTF32String(PyObject *unicode) 5024{ 5025 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5026} 5027 5028/* --- UTF-16 Codec ------------------------------------------------------- */ 5029 5030PyObject * 5031PyUnicode_DecodeUTF16(const 
/* Decode `size` bytes of UTF-16 data starting at `s`.

   errors:    name of the error handler passed on to
              unicode_decode_call_errorhandler().
   byteorder: optional in/out parameter.  A value <0 / >0 on input selects
              little / big endian (see native_ordering below); 0 enables BOM
              detection.  It is only written back from inside the BOM
              detection branch.
   consumed:  optional output.  When non-NULL, decoding stops quietly at the
              point where the low-level decoder reports that it needs more
              data, and the number of bytes used is stored here.

   Returns a new str object (the shared empty string when no payload is
   left after the BOM), or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyObject *unicode;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* the first two bytes are always combined as little endian here,
           so 0xFEFF means LE data and 0xFFFE means BE data */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* nothing (left) to decode: hand out the shared empty string */
    if (q == e) {
        if (consumed)
            *consumed = size;
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* bo == 0 (no BOM found) counts as native on either platform */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
#else
    native_ordering = bo >= 0;
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = PyUnicode_New((e - q + 1) / 2, 127);
    if (!unicode)
        return NULL;

    outpos = 0;
    while (1) {
        Py_UCS4 ch = 0;
        /* With fewer than two bytes left no decoder is called and ch stays
           0, which routes into the "case 0" handling below. */
        if (e - q >= 2) {
            /* pick the decoder matching the current storage kind of the
               output; the kind is re-read on every pass */
            int kind = PyUnicode_KIND(unicode);
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(unicode))
                    ch = asciilib_utf16_decode(&q, e,
                            PyUnicode_1BYTE_DATA(unicode), &outpos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            PyUnicode_1BYTE_DATA(unicode), &outpos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        PyUnicode_2BYTE_DATA(unicode), &outpos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        PyUnicode_4BYTE_DATA(unicode), &outpos,
                        native_ordering);
            }
        }

        /* Values 0..3 are treated as status codes; any other value is
           stored as a character through unicode_putchar().
           NOTE(review): the exact contract of the *_utf16_decode helpers
           is defined elsewhere -- confirm against stringlib. */
        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            errmsg = "unexpected end of data";
            /* the error range starts two bytes before q */
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            errmsg = "illegal UTF-16 surrogate";
            /* the error range starts four bytes before q and covers the
               first two of them */
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            continue;
        }

        /* only the error cases above reach this call */
        if (unicode_decode_call_errorhandler(
                errors,
                &errorHandler,
                "utf16", errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &unicode,
                &outpos))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    /* Adjust length */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_DECREF(unicode);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
NULL; 5215 } 5216 if (PyUnicode_READY(str) == -1) 5217 return NULL; 5218 kind = PyUnicode_KIND(str); 5219 data = PyUnicode_DATA(str); 5220 len = PyUnicode_GET_LENGTH(str); 5221 5222 pairs = 0; 5223 if (kind == PyUnicode_4BYTE_KIND) { 5224 const Py_UCS4 *in = (const Py_UCS4 *)data; 5225 const Py_UCS4 *end = in + len; 5226 while (in < end) 5227 if (*in++ >= 0x10000) 5228 pairs++; 5229 } 5230 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5231 return PyErr_NoMemory(); 5232 bytesize = (len + pairs + (byteorder == 0)) * 2; 5233 v = PyBytes_FromStringAndSize(NULL, bytesize); 5234 if (v == NULL) 5235 return NULL; 5236 5237 /* output buffer is 2-bytes aligned */ 5238 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5239 out = (unsigned short *)PyBytes_AS_STRING(v); 5240 if (byteorder == 0) 5241 *out++ = 0xFEFF; 5242 if (len == 0) 5243 goto done; 5244 5245 switch (kind) { 5246 case PyUnicode_1BYTE_KIND: { 5247 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); 5248 break; 5249 } 5250 case PyUnicode_2BYTE_KIND: { 5251 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); 5252 break; 5253 } 5254 case PyUnicode_4BYTE_KIND: { 5255 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); 5256 break; 5257 } 5258 default: 5259 assert(0); 5260 } 5261 5262 done: 5263 return v; 5264} 5265 5266PyObject * 5267PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5268 Py_ssize_t size, 5269 const char *errors, 5270 int byteorder) 5271{ 5272 PyObject *result; 5273 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5274 if (tmp == NULL) 5275 return NULL; 5276 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5277 Py_DECREF(tmp); 5278 return result; 5279} 5280 5281PyObject * 5282PyUnicode_AsUTF16String(PyObject *unicode) 5283{ 5284 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5285} 5286 5287/* --- Unicode Escape Codec ----------------------------------------------- */ 5288 5289/* Helper function for PyUnicode_DecodeUnicodeEscape, 
/* --- Unicode Escape Codec ----------------------------------------------- */

/* Helper function for PyUnicode_DecodeUnicodeEscape: scans the escaped
   input and decides whether the decoded result is guaranteed to be pure
   ASCII.

   Returns the number of characters the decoded string will contain, or -1
   when that cannot be promised: negative size, a non-ASCII byte, a
   backslash at the very end, or an octal/\x/\u/\U/\N escape (these do
   not guarantee ASCII characters).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)s;
    Py_ssize_t pos = 0;
    Py_ssize_t count = 0;

    if (size < 0)
        return -1;

    while (pos < size) {
        unsigned char c = bytes[pos++];

        if (c > 127)
            return -1;              /* Non-ASCII */
        if (c != '\\') {
            count++;                /* Normal character */
            continue;
        }

        /* Backslash-escape: the character that follows decides.  Give up
           if there is none or if it is not ASCII. */
        if (pos >= size)
            return -1;
        c = bytes[pos++];
        if (c > 127)
            return -1;

        switch (c) {
        case '\n':
            /* backslash + \n result in zero characters */
            break;
        case '\\': case '\'': case '\"':
        case 'a': case 'b': case 'f':
        case 'n': case 'r': case 't': case 'v':
            /* two input bytes collapse into one character */
            count += 1;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* these do not guarantee ASCII characters */
            return -1;
        default:
            /* unknown escape: the backslash and the character both stay */
            count += 2;
            break;
        }
    }
    return count;
}
length_of_escaped_ascii_string(s, size); 5365 5366 /* After length_of_escaped_ascii_string() there are two alternatives, 5367 either the string is pure ASCII with named escapes like \n, etc. 5368 and we determined it's exact size (common case) 5369 or it contains \x, \u, ... escape sequences. then we create a 5370 legacy wchar string and resize it at the end of this function. */ 5371 if (len >= 0) { 5372 v = PyUnicode_New(len, 127); 5373 if (!v) 5374 goto onError; 5375 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5376 } 5377 else { 5378 /* Escaped strings will always be longer than the resulting 5379 Unicode string, so we start with size here and then reduce the 5380 length after conversion to the true value. 5381 (but if the error callback returns a long replacement string 5382 we'll have to allocate more space) */ 5383 v = PyUnicode_New(size, 127); 5384 if (!v) 5385 goto onError; 5386 len = size; 5387 } 5388 5389 if (size == 0) 5390 return v; 5391 i = 0; 5392 end = s + size; 5393 5394 while (s < end) { 5395 unsigned char c; 5396 Py_UCS4 x; 5397 int digits; 5398 5399 /* The only case in which i == ascii_length is a backslash 5400 followed by a newline. */ 5401 assert(i <= len); 5402 5403 /* Non-escape characters are interpreted as Unicode ordinals */ 5404 if (*s != '\\') { 5405 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0) 5406 goto onError; 5407 continue; 5408 } 5409 5410 startinpos = s-starts; 5411 /* \ - Escapes */ 5412 s++; 5413 c = *s++; 5414 if (s > end) 5415 c = '\0'; /* Invalid after \ */ 5416 5417 /* The only case in which i == ascii_length is a backslash 5418 followed by a newline. 
*/ 5419 assert(i < len || (i == len && c == '\n')); 5420 5421 switch (c) { 5422 5423 /* \x escapes */ 5424#define WRITECHAR(ch) \ 5425 do { \ 5426 if (unicode_putchar(&v, &i, ch) < 0) \ 5427 goto onError; \ 5428 }while(0) 5429 5430 case '\n': break; 5431 case '\\': WRITECHAR('\\'); break; 5432 case '\'': WRITECHAR('\''); break; 5433 case '\"': WRITECHAR('\"'); break; 5434 case 'b': WRITECHAR('\b'); break; 5435 /* FF */ 5436 case 'f': WRITECHAR('\014'); break; 5437 case 't': WRITECHAR('\t'); break; 5438 case 'n': WRITECHAR('\n'); break; 5439 case 'r': WRITECHAR('\r'); break; 5440 /* VT */ 5441 case 'v': WRITECHAR('\013'); break; 5442 /* BEL, not classic C */ 5443 case 'a': WRITECHAR('\007'); break; 5444 5445 /* \OOO (octal) escapes */ 5446 case '0': case '1': case '2': case '3': 5447 case '4': case '5': case '6': case '7': 5448 x = s[-1] - '0'; 5449 if (s < end && '0' <= *s && *s <= '7') { 5450 x = (x<<3) + *s++ - '0'; 5451 if (s < end && '0' <= *s && *s <= '7') 5452 x = (x<<3) + *s++ - '0'; 5453 } 5454 WRITECHAR(x); 5455 break; 5456 5457 /* hex escapes */ 5458 /* \xXX */ 5459 case 'x': 5460 digits = 2; 5461 message = "truncated \\xXX escape"; 5462 goto hexescape; 5463 5464 /* \uXXXX */ 5465 case 'u': 5466 digits = 4; 5467 message = "truncated \\uXXXX escape"; 5468 goto hexescape; 5469 5470 /* \UXXXXXXXX */ 5471 case 'U': 5472 digits = 8; 5473 message = "truncated \\UXXXXXXXX escape"; 5474 hexescape: 5475 chr = 0; 5476 if (s+digits>end) { 5477 endinpos = size; 5478 if (unicode_decode_call_errorhandler( 5479 errors, &errorHandler, 5480 "unicodeescape", "end of string in escape sequence", 5481 &starts, &end, &startinpos, &endinpos, &exc, &s, 5482 &v, &i)) 5483 goto onError; 5484 goto nextByte; 5485 } 5486 for (j = 0; j < digits; ++j) { 5487 c = (unsigned char) s[j]; 5488 if (!Py_ISXDIGIT(c)) { 5489 endinpos = (s+j+1)-starts; 5490 if (unicode_decode_call_errorhandler( 5491 errors, &errorHandler, 5492 "unicodeescape", message, 5493 &starts, &end, &startinpos, &endinpos, 
&exc, &s, 5494 &v, &i)) 5495 goto onError; 5496 len = PyUnicode_GET_LENGTH(v); 5497 goto nextByte; 5498 } 5499 chr = (chr<<4) & ~0xF; 5500 if (c >= '0' && c <= '9') 5501 chr += c - '0'; 5502 else if (c >= 'a' && c <= 'f') 5503 chr += 10 + c - 'a'; 5504 else 5505 chr += 10 + c - 'A'; 5506 } 5507 s += j; 5508 if (chr == 0xffffffff && PyErr_Occurred()) 5509 /* _decoding_error will have already written into the 5510 target buffer. */ 5511 break; 5512 store: 5513 /* when we get here, chr is a 32-bit unicode character */ 5514 if (chr <= MAX_UNICODE) { 5515 WRITECHAR(chr); 5516 } else { 5517 endinpos = s-starts; 5518 if (unicode_decode_call_errorhandler( 5519 errors, &errorHandler, 5520 "unicodeescape", "illegal Unicode character", 5521 &starts, &end, &startinpos, &endinpos, &exc, &s, 5522 &v, &i)) 5523 goto onError; 5524 } 5525 break; 5526 5527 /* \N{name} */ 5528 case 'N': 5529 message = "malformed \\N character escape"; 5530 if (ucnhash_CAPI == NULL) { 5531 /* load the unicode data module */ 5532 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5533 PyUnicodeData_CAPSULE_NAME, 1); 5534 if (ucnhash_CAPI == NULL) 5535 goto ucnhashError; 5536 } 5537 if (*s == '{') { 5538 const char *start = s+1; 5539 /* look for the closing brace */ 5540 while (*s != '}' && s < end) 5541 s++; 5542 if (s > start && s < end && *s == '}') { 5543 /* found a name. 
look it up in the unicode database */ 5544 message = "unknown Unicode character name"; 5545 s++; 5546 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5547 &chr, 0)) 5548 goto store; 5549 } 5550 } 5551 endinpos = s-starts; 5552 if (unicode_decode_call_errorhandler( 5553 errors, &errorHandler, 5554 "unicodeescape", message, 5555 &starts, &end, &startinpos, &endinpos, &exc, &s, 5556 &v, &i)) 5557 goto onError; 5558 break; 5559 5560 default: 5561 if (s > end) { 5562 message = "\\ at end of string"; 5563 s--; 5564 endinpos = s-starts; 5565 if (unicode_decode_call_errorhandler( 5566 errors, &errorHandler, 5567 "unicodeescape", message, 5568 &starts, &end, &startinpos, &endinpos, &exc, &s, 5569 &v, &i)) 5570 goto onError; 5571 } 5572 else { 5573 WRITECHAR('\\'); 5574 WRITECHAR(s[-1]); 5575 } 5576 break; 5577 } 5578 nextByte: 5579 ; 5580 } 5581#undef WRITECHAR 5582 5583 if (unicode_resize(&v, i) < 0) 5584 goto onError; 5585 Py_XDECREF(errorHandler); 5586 Py_XDECREF(exc); 5587 return unicode_result(v); 5588 5589 ucnhashError: 5590 PyErr_SetString( 5591 PyExc_UnicodeError, 5592 "\\N escapes not supported (can't load unicodedata module)" 5593 ); 5594 Py_XDECREF(v); 5595 Py_XDECREF(errorHandler); 5596 Py_XDECREF(exc); 5597 return NULL; 5598 5599 onError: 5600 Py_XDECREF(v); 5601 Py_XDECREF(errorHandler); 5602 Py_XDECREF(exc); 5603 return NULL; 5604} 5605 5606/* Return a Unicode-Escape string version of the Unicode object. 5607 5608 If quotes is true, the string is enclosed in u"" or u'' quotes as 5609 appropriate. 5610 5611*/ 5612 5613PyObject * 5614PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5615{ 5616 Py_ssize_t i, len; 5617 PyObject *repr; 5618 char *p; 5619 int kind; 5620 void *data; 5621 Py_ssize_t expandsize = 0; 5622 5623 /* Initial allocation is based on the longest-possible character 5624 escape. 5625 5626 For UCS1 strings it's '\xxx', 4 bytes per source character. 5627 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 
5628 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 5629 */ 5630 5631 if (!PyUnicode_Check(unicode)) { 5632 PyErr_BadArgument(); 5633 return NULL; 5634 } 5635 if (PyUnicode_READY(unicode) == -1) 5636 return NULL; 5637 len = PyUnicode_GET_LENGTH(unicode); 5638 kind = PyUnicode_KIND(unicode); 5639 data = PyUnicode_DATA(unicode); 5640 switch (kind) { 5641 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5642 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5643 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5644 } 5645 5646 if (len == 0) 5647 return PyBytes_FromStringAndSize(NULL, 0); 5648 5649 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5650 return PyErr_NoMemory(); 5651 5652 repr = PyBytes_FromStringAndSize(NULL, 5653 2 5654 + expandsize*len 5655 + 1); 5656 if (repr == NULL) 5657 return NULL; 5658 5659 p = PyBytes_AS_STRING(repr); 5660 5661 for (i = 0; i < len; i++) { 5662 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5663 5664 /* Escape backslashes */ 5665 if (ch == '\\') { 5666 *p++ = '\\'; 5667 *p++ = (char) ch; 5668 continue; 5669 } 5670 5671 /* Map 21-bit characters to '\U00xxxxxx' */ 5672 else if (ch >= 0x10000) { 5673 assert(ch <= MAX_UNICODE); 5674 *p++ = '\\'; 5675 *p++ = 'U'; 5676 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5677 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5678 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5679 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5680 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5681 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5682 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5683 *p++ = Py_hexdigits[ch & 0x0000000F]; 5684 continue; 5685 } 5686 5687 /* Map 16-bit characters to '\uxxxx' */ 5688 if (ch >= 256) { 5689 *p++ = '\\'; 5690 *p++ = 'u'; 5691 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5692 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5693 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5694 *p++ = Py_hexdigits[ch & 0x000F]; 5695 } 5696 5697 /* Map special whitespace to '\t', \n', '\r' 
*/ 5698 else if (ch == '\t') { 5699 *p++ = '\\'; 5700 *p++ = 't'; 5701 } 5702 else if (ch == '\n') { 5703 *p++ = '\\'; 5704 *p++ = 'n'; 5705 } 5706 else if (ch == '\r') { 5707 *p++ = '\\'; 5708 *p++ = 'r'; 5709 } 5710 5711 /* Map non-printable US ASCII to '\xhh' */ 5712 else if (ch < ' ' || ch >= 0x7F) { 5713 *p++ = '\\'; 5714 *p++ = 'x'; 5715 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5716 *p++ = Py_hexdigits[ch & 0x000F]; 5717 } 5718 5719 /* Copy everything else as-is */ 5720 else 5721 *p++ = (char) ch; 5722 } 5723 5724 assert(p - PyBytes_AS_STRING(repr) > 0); 5725 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5726 return NULL; 5727 return repr; 5728} 5729 5730PyObject * 5731PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5732 Py_ssize_t size) 5733{ 5734 PyObject *result; 5735 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5736 if (tmp == NULL) 5737 return NULL; 5738 result = PyUnicode_AsUnicodeEscapeString(tmp); 5739 Py_DECREF(tmp); 5740 return result; 5741} 5742 5743/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5744 5745PyObject * 5746PyUnicode_DecodeRawUnicodeEscape(const char *s, 5747 Py_ssize_t size, 5748 const char *errors) 5749{ 5750 const char *starts = s; 5751 Py_ssize_t startinpos; 5752 Py_ssize_t endinpos; 5753 Py_ssize_t outpos; 5754 PyObject *v; 5755 const char *end; 5756 const char *bs; 5757 PyObject *errorHandler = NULL; 5758 PyObject *exc = NULL; 5759 5760 /* Escaped strings will always be longer than the resulting 5761 Unicode string, so we start with size here and then reduce the 5762 length after conversion to the true value. 
(But decoding error 5763 handler might have to resize the string) */ 5764 v = PyUnicode_New(size, 127); 5765 if (v == NULL) 5766 goto onError; 5767 if (size == 0) 5768 return v; 5769 outpos = 0; 5770 end = s + size; 5771 while (s < end) { 5772 unsigned char c; 5773 Py_UCS4 x; 5774 int i; 5775 int count; 5776 5777 /* Non-escape characters are interpreted as Unicode ordinals */ 5778 if (*s != '\\') { 5779 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 5780 goto onError; 5781 continue; 5782 } 5783 startinpos = s-starts; 5784 5785 /* \u-escapes are only interpreted iff the number of leading 5786 backslashes if odd */ 5787 bs = s; 5788 for (;s < end;) { 5789 if (*s != '\\') 5790 break; 5791 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 5792 goto onError; 5793 } 5794 if (((s - bs) & 1) == 0 || 5795 s >= end || 5796 (*s != 'u' && *s != 'U')) { 5797 continue; 5798 } 5799 outpos--; 5800 count = *s=='u' ? 4 : 8; 5801 s++; 5802 5803 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5804 for (x = 0, i = 0; i < count; ++i, ++s) { 5805 c = (unsigned char)*s; 5806 if (!Py_ISXDIGIT(c)) { 5807 endinpos = s-starts; 5808 if (unicode_decode_call_errorhandler( 5809 errors, &errorHandler, 5810 "rawunicodeescape", "truncated \\uXXXX", 5811 &starts, &end, &startinpos, &endinpos, &exc, &s, 5812 &v, &outpos)) 5813 goto onError; 5814 goto nextByte; 5815 } 5816 x = (x<<4) & ~0xF; 5817 if (c >= '0' && c <= '9') 5818 x += c - '0'; 5819 else if (c >= 'a' && c <= 'f') 5820 x += 10 + c - 'a'; 5821 else 5822 x += 10 + c - 'A'; 5823 } 5824 if (x <= MAX_UNICODE) { 5825 if (unicode_putchar(&v, &outpos, x) < 0) 5826 goto onError; 5827 } else { 5828 endinpos = s-starts; 5829 if (unicode_decode_call_errorhandler( 5830 errors, &errorHandler, 5831 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5832 &starts, &end, &startinpos, &endinpos, &exc, &s, 5833 &v, &outpos)) 5834 goto onError; 5835 } 5836 nextByte: 5837 ; 5838 } 5839 if (unicode_resize(&v, outpos) < 0) 5840 goto onError; 
5841 Py_XDECREF(errorHandler); 5842 Py_XDECREF(exc); 5843 return unicode_result(v); 5844 5845 onError: 5846 Py_XDECREF(v); 5847 Py_XDECREF(errorHandler); 5848 Py_XDECREF(exc); 5849 return NULL; 5850} 5851 5852 5853PyObject * 5854PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 5855{ 5856 PyObject *repr; 5857 char *p; 5858 char *q; 5859 Py_ssize_t expandsize, pos; 5860 int kind; 5861 void *data; 5862 Py_ssize_t len; 5863 5864 if (!PyUnicode_Check(unicode)) { 5865 PyErr_BadArgument(); 5866 return NULL; 5867 } 5868 if (PyUnicode_READY(unicode) == -1) 5869 return NULL; 5870 kind = PyUnicode_KIND(unicode); 5871 data = PyUnicode_DATA(unicode); 5872 len = PyUnicode_GET_LENGTH(unicode); 5873 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 5874 bytes, and 1 byte characters 4. */ 5875 expandsize = kind * 2 + 2; 5876 5877 if (len > PY_SSIZE_T_MAX / expandsize) 5878 return PyErr_NoMemory(); 5879 5880 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 5881 if (repr == NULL) 5882 return NULL; 5883 if (len == 0) 5884 return repr; 5885 5886 p = q = PyBytes_AS_STRING(repr); 5887 for (pos = 0; pos < len; pos++) { 5888 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 5889 /* Map 32-bit characters to '\Uxxxxxxxx' */ 5890 if (ch >= 0x10000) { 5891 assert(ch <= MAX_UNICODE); 5892 *p++ = '\\'; 5893 *p++ = 'U'; 5894 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 5895 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 5896 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 5897 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 5898 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 5899 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 5900 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 5901 *p++ = Py_hexdigits[ch & 15]; 5902 } 5903 /* Map 16-bit characters to '\uxxxx' */ 5904 else if (ch >= 256) { 5905 *p++ = '\\'; 5906 *p++ = 'u'; 5907 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 5908 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 5909 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 5910 *p++ = Py_hexdigits[ch & 15]; 5911 } 5912 /* Copy everything 
else as-is */ 5913 else 5914 *p++ = (char) ch; 5915 } 5916 5917 assert(p > q); 5918 if (_PyBytes_Resize(&repr, p - q) < 0) 5919 return NULL; 5920 return repr; 5921} 5922 5923PyObject * 5924PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 5925 Py_ssize_t size) 5926{ 5927 PyObject *result; 5928 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5929 if (tmp == NULL) 5930 return NULL; 5931 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 5932 Py_DECREF(tmp); 5933 return result; 5934} 5935 5936/* --- Unicode Internal Codec ------------------------------------------- */ 5937 5938PyObject * 5939_PyUnicode_DecodeUnicodeInternal(const char *s, 5940 Py_ssize_t size, 5941 const char *errors) 5942{ 5943 const char *starts = s; 5944 Py_ssize_t startinpos; 5945 Py_ssize_t endinpos; 5946 Py_ssize_t outpos; 5947 PyObject *v; 5948 const char *end; 5949 const char *reason; 5950 PyObject *errorHandler = NULL; 5951 PyObject *exc = NULL; 5952 5953 if (PyErr_WarnEx(PyExc_DeprecationWarning, 5954 "unicode_internal codec has been deprecated", 5955 1)) 5956 return NULL; 5957 5958 /* XXX overflow detection missing */ 5959 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127); 5960 if (v == NULL) 5961 goto onError; 5962 if (PyUnicode_GET_LENGTH(v) == 0) 5963 return v; 5964 outpos = 0; 5965 end = s + size; 5966 5967 while (s < end) { 5968 Py_UNICODE uch; 5969 Py_UCS4 ch; 5970 /* We copy the raw representation one byte at a time because the 5971 pointer may be unaligned (see test_codeccallbacks). */ 5972 ((char *) &uch)[0] = s[0]; 5973 ((char *) &uch)[1] = s[1]; 5974#ifdef Py_UNICODE_WIDE 5975 ((char *) &uch)[2] = s[2]; 5976 ((char *) &uch)[3] = s[3]; 5977#endif 5978 ch = uch; 5979 5980 /* We have to sanity check the raw data, otherwise doom looms for 5981 some malformed UCS-4 data. 
*/ 5982 if ( 5983#ifdef Py_UNICODE_WIDE 5984 ch > 0x10ffff || 5985#endif 5986 end-s < Py_UNICODE_SIZE 5987 ) 5988 { 5989 startinpos = s - starts; 5990 if (end-s < Py_UNICODE_SIZE) { 5991 endinpos = end-starts; 5992 reason = "truncated input"; 5993 } 5994 else { 5995 endinpos = s - starts + Py_UNICODE_SIZE; 5996 reason = "illegal code point (> 0x10FFFF)"; 5997 } 5998 if (unicode_decode_call_errorhandler( 5999 errors, &errorHandler, 6000 "unicode_internal", reason, 6001 &starts, &end, &startinpos, &endinpos, &exc, &s, 6002 &v, &outpos)) 6003 goto onError; 6004 continue; 6005 } 6006 6007 s += Py_UNICODE_SIZE; 6008#ifndef Py_UNICODE_WIDE 6009 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end) 6010 { 6011 Py_UNICODE uch2; 6012 ((char *) &uch2)[0] = s[0]; 6013 ((char *) &uch2)[1] = s[1]; 6014 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6015 { 6016 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6017 s += Py_UNICODE_SIZE; 6018 } 6019 } 6020#endif 6021 6022 if (unicode_putchar(&v, &outpos, ch) < 0) 6023 goto onError; 6024 } 6025 6026 if (unicode_resize(&v, outpos) < 0) 6027 goto onError; 6028 Py_XDECREF(errorHandler); 6029 Py_XDECREF(exc); 6030 return unicode_result(v); 6031 6032 onError: 6033 Py_XDECREF(v); 6034 Py_XDECREF(errorHandler); 6035 Py_XDECREF(exc); 6036 return NULL; 6037} 6038 6039/* --- Latin-1 Codec ------------------------------------------------------ */ 6040 6041PyObject * 6042PyUnicode_DecodeLatin1(const char *s, 6043 Py_ssize_t size, 6044 const char *errors) 6045{ 6046 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6047 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6048} 6049 6050/* create or adjust a UnicodeEncodeError */ 6051static void 6052make_encode_exception(PyObject **exceptionObject, 6053 const char *encoding, 6054 PyObject *unicode, 6055 Py_ssize_t startpos, Py_ssize_t endpos, 6056 const char *reason) 6057{ 6058 if (*exceptionObject == NULL) { 6059 *exceptionObject = PyObject_CallFunction( 6060 PyExc_UnicodeEncodeError, "sOnns", 6061 encoding, unicode, startpos, endpos, reason); 6062 } 6063 else { 6064 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6065 goto onError; 6066 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6067 goto onError; 6068 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6069 goto onError; 6070 return; 6071 onError: 6072 Py_DECREF(*exceptionObject); 6073 *exceptionObject = NULL; 6074 } 6075} 6076 6077/* raises a UnicodeEncodeError */ 6078static void 6079raise_encode_exception(PyObject **exceptionObject, 6080 const char *encoding, 6081 PyObject *unicode, 6082 Py_ssize_t startpos, Py_ssize_t endpos, 6083 const char *reason) 6084{ 6085 make_encode_exception(exceptionObject, 6086 encoding, unicode, startpos, endpos, reason); 6087 if (*exceptionObject != NULL) 6088 PyCodec_StrictErrors(*exceptionObject); 6089} 6090 6091/* error handling callback helper: 6092 build arguments, call the callback and check the arguments, 6093 put the result into newpos and return the replacement string, which 6094 has to be freed by the caller */ 6095static PyObject * 6096unicode_encode_call_errorhandler(const char *errors, 6097 PyObject **errorHandler, 6098 const char *encoding, const char *reason, 6099 PyObject *unicode, PyObject **exceptionObject, 6100 Py_ssize_t startpos, Py_ssize_t endpos, 6101 Py_ssize_t *newpos) 6102{ 6103 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6104 Py_ssize_t len; 6105 PyObject *restuple; 6106 PyObject *resunicode; 6107 6108 if (*errorHandler == NULL) 
{ 6109 *errorHandler = PyCodec_LookupError(errors); 6110 if (*errorHandler == NULL) 6111 return NULL; 6112 } 6113 6114 if (PyUnicode_READY(unicode) == -1) 6115 return NULL; 6116 len = PyUnicode_GET_LENGTH(unicode); 6117 6118 make_encode_exception(exceptionObject, 6119 encoding, unicode, startpos, endpos, reason); 6120 if (*exceptionObject == NULL) 6121 return NULL; 6122 6123 restuple = PyObject_CallFunctionObjArgs( 6124 *errorHandler, *exceptionObject, NULL); 6125 if (restuple == NULL) 6126 return NULL; 6127 if (!PyTuple_Check(restuple)) { 6128 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6129 Py_DECREF(restuple); 6130 return NULL; 6131 } 6132 if (!PyArg_ParseTuple(restuple, argparse, 6133 &resunicode, newpos)) { 6134 Py_DECREF(restuple); 6135 return NULL; 6136 } 6137 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6138 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6139 Py_DECREF(restuple); 6140 return NULL; 6141 } 6142 if (*newpos<0) 6143 *newpos = len + *newpos; 6144 if (*newpos<0 || *newpos>len) { 6145 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6146 Py_DECREF(restuple); 6147 return NULL; 6148 } 6149 Py_INCREF(resunicode); 6150 Py_DECREF(restuple); 6151 return resunicode; 6152} 6153 6154static PyObject * 6155unicode_encode_ucs1(PyObject *unicode, 6156 const char *errors, 6157 unsigned int limit) 6158{ 6159 /* input state */ 6160 Py_ssize_t pos=0, size; 6161 int kind; 6162 void *data; 6163 /* output object */ 6164 PyObject *res; 6165 /* pointer into the output */ 6166 char *str; 6167 /* current output position */ 6168 Py_ssize_t ressize; 6169 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6170 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6171 PyObject *errorHandler = NULL; 6172 PyObject *exc = NULL; 6173 /* the following variable is used for caching string comparisons 6174 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6175 int known_errorHandler = -1; 6176 6177 if (PyUnicode_READY(unicode) == -1) 6178 return NULL; 6179 size = PyUnicode_GET_LENGTH(unicode); 6180 kind = PyUnicode_KIND(unicode); 6181 data = PyUnicode_DATA(unicode); 6182 /* allocate enough for a simple encoding without 6183 replacements, if we need more, we'll resize */ 6184 if (size == 0) 6185 return PyBytes_FromStringAndSize(NULL, 0); 6186 res = PyBytes_FromStringAndSize(NULL, size); 6187 if (res == NULL) 6188 return NULL; 6189 str = PyBytes_AS_STRING(res); 6190 ressize = size; 6191 6192 while (pos < size) { 6193 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6194 6195 /* can we encode this? */ 6196 if (c<limit) { 6197 /* no overflow check, because we know that the space is enough */ 6198 *str++ = (char)c; 6199 ++pos; 6200 } 6201 else { 6202 Py_ssize_t requiredsize; 6203 PyObject *repunicode; 6204 Py_ssize_t repsize, newpos, respos, i; 6205 /* startpos for collecting unencodable chars */ 6206 Py_ssize_t collstart = pos; 6207 Py_ssize_t collend = pos; 6208 /* find all unecodable characters */ 6209 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) 6210 ++collend; 6211 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6212 if (known_errorHandler==-1) { 6213 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6214 known_errorHandler = 1; 6215 else if (!strcmp(errors, "replace")) 6216 known_errorHandler = 2; 6217 else if (!strcmp(errors, "ignore")) 6218 known_errorHandler = 3; 6219 else if (!strcmp(errors, "xmlcharrefreplace")) 6220 known_errorHandler = 4; 6221 else 6222 known_errorHandler = 0; 6223 } 6224 switch (known_errorHandler) { 6225 case 1: /* strict */ 6226 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6227 goto onError; 6228 case 2: /* replace */ 6229 while (collstart++<collend) 6230 *str++ = '?'; /* fall through */ 6231 case 3: /* ignore */ 6232 pos = collend; 6233 break; 6234 case 4: /* xmlcharrefreplace */ 6235 respos = str - PyBytes_AS_STRING(res); 6236 /* determine replacement size */ 6237 for (i = collstart, repsize = 0; i < collend; ++i) { 6238 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6239 if (ch < 10) 6240 repsize += 2+1+1; 6241 else if (ch < 100) 6242 repsize += 2+2+1; 6243 else if (ch < 1000) 6244 repsize += 2+3+1; 6245 else if (ch < 10000) 6246 repsize += 2+4+1; 6247 else if (ch < 100000) 6248 repsize += 2+5+1; 6249 else if (ch < 1000000) 6250 repsize += 2+6+1; 6251 else { 6252 assert(ch <= MAX_UNICODE); 6253 repsize += 2+7+1; 6254 } 6255 } 6256 requiredsize = respos+repsize+(size-collend); 6257 if (requiredsize > ressize) { 6258 if (requiredsize<2*ressize) 6259 requiredsize = 2*ressize; 6260 if (_PyBytes_Resize(&res, requiredsize)) 6261 goto onError; 6262 str = PyBytes_AS_STRING(res) + respos; 6263 ressize = requiredsize; 6264 } 6265 /* generate replacement */ 6266 for (i = collstart; i < collend; ++i) { 6267 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6268 } 6269 pos = collend; 6270 break; 6271 default: 6272 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6273 encoding, reason, unicode, &exc, 6274 collstart, collend, &newpos); 6275 if (repunicode == NULL || 
(PyUnicode_Check(repunicode) && 6276 PyUnicode_READY(repunicode) == -1)) 6277 goto onError; 6278 if (PyBytes_Check(repunicode)) { 6279 /* Directly copy bytes result to output. */ 6280 repsize = PyBytes_Size(repunicode); 6281 if (repsize > 1) { 6282 /* Make room for all additional bytes. */ 6283 respos = str - PyBytes_AS_STRING(res); 6284 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6285 Py_DECREF(repunicode); 6286 goto onError; 6287 } 6288 str = PyBytes_AS_STRING(res) + respos; 6289 ressize += repsize-1; 6290 } 6291 memcpy(str, PyBytes_AsString(repunicode), repsize); 6292 str += repsize; 6293 pos = newpos; 6294 Py_DECREF(repunicode); 6295 break; 6296 } 6297 /* need more space? (at least enough for what we 6298 have+the replacement+the rest of the string, so 6299 we won't have to check space for encodable characters) */ 6300 respos = str - PyBytes_AS_STRING(res); 6301 repsize = PyUnicode_GET_LENGTH(repunicode); 6302 requiredsize = respos+repsize+(size-collend); 6303 if (requiredsize > ressize) { 6304 if (requiredsize<2*ressize) 6305 requiredsize = 2*ressize; 6306 if (_PyBytes_Resize(&res, requiredsize)) { 6307 Py_DECREF(repunicode); 6308 goto onError; 6309 } 6310 str = PyBytes_AS_STRING(res) + respos; 6311 ressize = requiredsize; 6312 } 6313 /* check if there is anything unencodable in the replacement 6314 and copy it to the output */ 6315 for (i = 0; repsize-->0; ++i, ++str) { 6316 c = PyUnicode_READ_CHAR(repunicode, i); 6317 if (c >= limit) { 6318 raise_encode_exception(&exc, encoding, unicode, 6319 pos, pos+1, reason); 6320 Py_DECREF(repunicode); 6321 goto onError; 6322 } 6323 *str = (char)c; 6324 } 6325 pos = newpos; 6326 Py_DECREF(repunicode); 6327 } 6328 } 6329 } 6330 /* Resize if we allocated to much */ 6331 size = str - PyBytes_AS_STRING(res); 6332 if (size < ressize) { /* If this falls res will be NULL */ 6333 assert(size >= 0); 6334 if (_PyBytes_Resize(&res, size) < 0) 6335 goto onError; 6336 } 6337 6338 Py_XDECREF(errorHandler); 6339 Py_XDECREF(exc); 
6340 return res; 6341 6342 onError: 6343 Py_XDECREF(res); 6344 Py_XDECREF(errorHandler); 6345 Py_XDECREF(exc); 6346 return NULL; 6347} 6348 6349/* Deprecated */ 6350PyObject * 6351PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6352 Py_ssize_t size, 6353 const char *errors) 6354{ 6355 PyObject *result; 6356 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6357 if (unicode == NULL) 6358 return NULL; 6359 result = unicode_encode_ucs1(unicode, errors, 256); 6360 Py_DECREF(unicode); 6361 return result; 6362} 6363 6364PyObject * 6365_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6366{ 6367 if (!PyUnicode_Check(unicode)) { 6368 PyErr_BadArgument(); 6369 return NULL; 6370 } 6371 if (PyUnicode_READY(unicode) == -1) 6372 return NULL; 6373 /* Fast path: if it is a one-byte string, construct 6374 bytes object directly. */ 6375 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6376 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6377 PyUnicode_GET_LENGTH(unicode)); 6378 /* Non-Latin-1 characters present. Defer to above function to 6379 raise the exception. */ 6380 return unicode_encode_ucs1(unicode, errors, 256); 6381} 6382 6383PyObject* 6384PyUnicode_AsLatin1String(PyObject *unicode) 6385{ 6386 return _PyUnicode_AsLatin1String(unicode, NULL); 6387} 6388 6389/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6390 6391PyObject * 6392PyUnicode_DecodeASCII(const char *s, 6393 Py_ssize_t size, 6394 const char *errors) 6395{ 6396 const char *starts = s; 6397 PyObject *unicode; 6398 int kind; 6399 void *data; 6400 Py_ssize_t startinpos; 6401 Py_ssize_t endinpos; 6402 Py_ssize_t outpos; 6403 const char *e; 6404 PyObject *errorHandler = NULL; 6405 PyObject *exc = NULL; 6406 6407 if (size == 0) { 6408 Py_INCREF(unicode_empty); 6409 return unicode_empty; 6410 } 6411 6412 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 6413 if (size == 1 && (unsigned char)s[0] < 128) 6414 return get_latin1_char((unsigned char)s[0]); 6415 6416 unicode = PyUnicode_New(size, 127); 6417 if (unicode == NULL) 6418 goto onError; 6419 6420 e = s + size; 6421 data = PyUnicode_1BYTE_DATA(unicode); 6422 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6423 if (outpos == size) 6424 return unicode; 6425 6426 s += outpos; 6427 kind = PyUnicode_1BYTE_KIND; 6428 while (s < e) { 6429 register unsigned char c = (unsigned char)*s; 6430 if (c < 128) { 6431 PyUnicode_WRITE(kind, data, outpos++, c); 6432 ++s; 6433 } 6434 else { 6435 startinpos = s-starts; 6436 endinpos = startinpos + 1; 6437 if (unicode_decode_call_errorhandler( 6438 errors, &errorHandler, 6439 "ascii", "ordinal not in range(128)", 6440 &starts, &e, &startinpos, &endinpos, &exc, &s, 6441 &unicode, &outpos)) 6442 goto onError; 6443 kind = PyUnicode_KIND(unicode); 6444 data = PyUnicode_DATA(unicode); 6445 } 6446 } 6447 if (unicode_resize(&unicode, outpos) < 0) 6448 goto onError; 6449 Py_XDECREF(errorHandler); 6450 Py_XDECREF(exc); 6451 assert(_PyUnicode_CheckConsistency(unicode, 1)); 6452 return unicode; 6453 6454 onError: 6455 Py_XDECREF(unicode); 6456 Py_XDECREF(errorHandler); 6457 Py_XDECREF(exc); 6458 return NULL; 6459} 6460 6461/* Deprecated */ 6462PyObject * 6463PyUnicode_EncodeASCII(const Py_UNICODE *p, 6464 Py_ssize_t size, 6465 const char *errors) 6466{ 6467 PyObject *result; 6468 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6469 if (unicode == NULL) 6470 return NULL; 6471 result = unicode_encode_ucs1(unicode, errors, 128); 6472 Py_DECREF(unicode); 6473 return result; 6474} 6475 6476PyObject * 6477_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6478{ 6479 if (!PyUnicode_Check(unicode)) { 6480 PyErr_BadArgument(); 6481 return NULL; 6482 } 6483 if (PyUnicode_READY(unicode) == -1) 6484 return NULL; 6485 /* Fast path: if it is an ASCII-only string, construct bytes object 6486 directly. 
Else defer to above function to raise the exception. */ 6487 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6488 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6489 PyUnicode_GET_LENGTH(unicode)); 6490 return unicode_encode_ucs1(unicode, errors, 128); 6491} 6492 6493PyObject * 6494PyUnicode_AsASCIIString(PyObject *unicode) 6495{ 6496 return _PyUnicode_AsASCIIString(unicode, NULL); 6497} 6498 6499#ifdef HAVE_MBCS 6500 6501/* --- MBCS codecs for Windows -------------------------------------------- */ 6502 6503#if SIZEOF_INT < SIZEOF_SIZE_T 6504#define NEED_RETRY 6505#endif 6506 6507#ifndef WC_ERR_INVALID_CHARS 6508# define WC_ERR_INVALID_CHARS 0x0080 6509#endif 6510 6511static char* 6512code_page_name(UINT code_page, PyObject **obj) 6513{ 6514 *obj = NULL; 6515 if (code_page == CP_ACP) 6516 return "mbcs"; 6517 if (code_page == CP_UTF7) 6518 return "CP_UTF7"; 6519 if (code_page == CP_UTF8) 6520 return "CP_UTF8"; 6521 6522 *obj = PyBytes_FromFormat("cp%u", code_page); 6523 if (*obj == NULL) 6524 return NULL; 6525 return PyBytes_AS_STRING(*obj); 6526} 6527 6528static int 6529is_dbcs_lead_byte(UINT code_page, const char *s, int offset) 6530{ 6531 const char *curr = s + offset; 6532 const char *prev; 6533 6534 if (!IsDBCSLeadByteEx(code_page, *curr)) 6535 return 0; 6536 6537 prev = CharPrevExA(code_page, s, curr, 0); 6538 if (prev == curr) 6539 return 1; 6540 /* FIXME: This code is limited to "true" double-byte encodings, 6541 as it assumes an incomplete character consists of a single 6542 byte. */ 6543 if (curr - prev == 2) 6544 return 1; 6545 if (!IsDBCSLeadByteEx(code_page, *prev)) 6546 return 1; 6547 return 0; 6548} 6549 6550static DWORD 6551decode_code_page_flags(UINT code_page) 6552{ 6553 if (code_page == CP_UTF7) { 6554 /* The CP_UTF7 decoder only supports flags=0 */ 6555 return 0; 6556 } 6557 else 6558 return MB_ERR_INVALID_CHARS; 6559} 6560 6561/* 6562 * Decode a byte string from a Windows code page into unicode object in strict 6563 * mode. 
6564 * 6565 * Returns consumed size if succeed, returns -2 on decode error, or raise a 6566 * WindowsError and returns -1 on other error. 6567 */ 6568static int 6569decode_code_page_strict(UINT code_page, 6570 PyObject **v, 6571 const char *in, 6572 int insize) 6573{ 6574 const DWORD flags = decode_code_page_flags(code_page); 6575 wchar_t *out; 6576 DWORD outsize; 6577 6578 /* First get the size of the result */ 6579 assert(insize > 0); 6580 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6581 if (outsize <= 0) 6582 goto error; 6583 6584 if (*v == NULL) { 6585 /* Create unicode object */ 6586 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6587 *v = (PyObject*)_PyUnicode_New(outsize); 6588 if (*v == NULL) 6589 return -1; 6590 out = PyUnicode_AS_UNICODE(*v); 6591 } 6592 else { 6593 /* Extend unicode object */ 6594 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6595 if (unicode_resize(v, n + outsize) < 0) 6596 return -1; 6597 out = PyUnicode_AS_UNICODE(*v) + n; 6598 } 6599 6600 /* Do the conversion */ 6601 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6602 if (outsize <= 0) 6603 goto error; 6604 return insize; 6605 6606error: 6607 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6608 return -2; 6609 PyErr_SetFromWindowsErr(0); 6610 return -1; 6611} 6612 6613/* 6614 * Decode a byte string from a code page into unicode object with an error 6615 * handler. 6616 * 6617 * Returns consumed size if succeed, or raise a WindowsError or 6618 * UnicodeDecodeError exception and returns -1 on error. 6619 */ 6620static int 6621decode_code_page_errors(UINT code_page, 6622 PyObject **v, 6623 const char *in, const int size, 6624 const char *errors) 6625{ 6626 const char *startin = in; 6627 const char *endin = in + size; 6628 const DWORD flags = decode_code_page_flags(code_page); 6629 /* Ideally, we should get reason from FormatMessage. This is the Windows 6630 2000 English version of the message. 
*/ 6631 const char *reason = "No mapping for the Unicode character exists " 6632 "in the target code page."; 6633 /* each step cannot decode more than 1 character, but a character can be 6634 represented as a surrogate pair */ 6635 wchar_t buffer[2], *startout, *out; 6636 int insize, outsize; 6637 PyObject *errorHandler = NULL; 6638 PyObject *exc = NULL; 6639 PyObject *encoding_obj = NULL; 6640 char *encoding; 6641 DWORD err; 6642 int ret = -1; 6643 6644 assert(size > 0); 6645 6646 encoding = code_page_name(code_page, &encoding_obj); 6647 if (encoding == NULL) 6648 return -1; 6649 6650 if (errors == NULL || strcmp(errors, "strict") == 0) { 6651 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6652 UnicodeDecodeError. */ 6653 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6654 if (exc != NULL) { 6655 PyCodec_StrictErrors(exc); 6656 Py_CLEAR(exc); 6657 } 6658 goto error; 6659 } 6660 6661 if (*v == NULL) { 6662 /* Create unicode object */ 6663 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6664 PyErr_NoMemory(); 6665 goto error; 6666 } 6667 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6668 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6669 if (*v == NULL) 6670 goto error; 6671 startout = PyUnicode_AS_UNICODE(*v); 6672 } 6673 else { 6674 /* Extend unicode object */ 6675 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6676 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6677 PyErr_NoMemory(); 6678 goto error; 6679 } 6680 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6681 goto error; 6682 startout = PyUnicode_AS_UNICODE(*v) + n; 6683 } 6684 6685 /* Decode the byte string character per character */ 6686 out = startout; 6687 while (in < endin) 6688 { 6689 /* Decode a character */ 6690 insize = 1; 6691 do 6692 { 6693 outsize = MultiByteToWideChar(code_page, flags, 6694 in, insize, 6695 buffer, Py_ARRAY_LENGTH(buffer)); 6696 if (outsize > 0) 
6697 break; 6698 err = GetLastError(); 6699 if (err != ERROR_NO_UNICODE_TRANSLATION 6700 && err != ERROR_INSUFFICIENT_BUFFER) 6701 { 6702 PyErr_SetFromWindowsErr(0); 6703 goto error; 6704 } 6705 insize++; 6706 } 6707 /* 4=maximum length of a UTF-8 sequence */ 6708 while (insize <= 4 && (in + insize) <= endin); 6709 6710 if (outsize <= 0) { 6711 Py_ssize_t startinpos, endinpos, outpos; 6712 6713 startinpos = in - startin; 6714 endinpos = startinpos + 1; 6715 outpos = out - PyUnicode_AS_UNICODE(*v); 6716 if (unicode_decode_call_errorhandler( 6717 errors, &errorHandler, 6718 encoding, reason, 6719 &startin, &endin, &startinpos, &endinpos, &exc, &in, 6720 v, &outpos)) 6721 { 6722 goto error; 6723 } 6724 out = PyUnicode_AS_UNICODE(*v) + outpos; 6725 } 6726 else { 6727 in += insize; 6728 memcpy(out, buffer, outsize * sizeof(wchar_t)); 6729 out += outsize; 6730 } 6731 } 6732 6733 /* write a NUL character at the end */ 6734 *out = 0; 6735 6736 /* Extend unicode object */ 6737 outsize = out - startout; 6738 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 6739 if (unicode_resize(v, outsize) < 0) 6740 goto error; 6741 ret = size; 6742 6743error: 6744 Py_XDECREF(encoding_obj); 6745 Py_XDECREF(errorHandler); 6746 Py_XDECREF(exc); 6747 return ret; 6748} 6749 6750static PyObject * 6751decode_code_page_stateful(int code_page, 6752 const char *s, Py_ssize_t size, 6753 const char *errors, Py_ssize_t *consumed) 6754{ 6755 PyObject *v = NULL; 6756 int chunk_size, final, converted, done; 6757 6758 if (code_page < 0) { 6759 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 6760 return NULL; 6761 } 6762 6763 if (consumed) 6764 *consumed = 0; 6765 6766 do 6767 { 6768#ifdef NEED_RETRY 6769 if (size > INT_MAX) { 6770 chunk_size = INT_MAX; 6771 final = 0; 6772 done = 0; 6773 } 6774 else 6775#endif 6776 { 6777 chunk_size = (int)size; 6778 final = (consumed == NULL); 6779 done = 1; 6780 } 6781 6782 /* Skip trailing lead-byte unless 'final' is set */ 6783 if (!final && 
is_dbcs_lead_byte(code_page, s, chunk_size - 1)) 6784 --chunk_size; 6785 6786 if (chunk_size == 0 && done) { 6787 if (v != NULL) 6788 break; 6789 Py_INCREF(unicode_empty); 6790 return unicode_empty; 6791 } 6792 6793 6794 converted = decode_code_page_strict(code_page, &v, 6795 s, chunk_size); 6796 if (converted == -2) 6797 converted = decode_code_page_errors(code_page, &v, 6798 s, chunk_size, 6799 errors); 6800 assert(converted != 0); 6801 6802 if (converted < 0) { 6803 Py_XDECREF(v); 6804 return NULL; 6805 } 6806 6807 if (consumed) 6808 *consumed += converted; 6809 6810 s += converted; 6811 size -= converted; 6812 } while (!done); 6813 6814 return unicode_result(v); 6815} 6816 6817PyObject * 6818PyUnicode_DecodeCodePageStateful(int code_page, 6819 const char *s, 6820 Py_ssize_t size, 6821 const char *errors, 6822 Py_ssize_t *consumed) 6823{ 6824 return decode_code_page_stateful(code_page, s, size, errors, consumed); 6825} 6826 6827PyObject * 6828PyUnicode_DecodeMBCSStateful(const char *s, 6829 Py_ssize_t size, 6830 const char *errors, 6831 Py_ssize_t *consumed) 6832{ 6833 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 6834} 6835 6836PyObject * 6837PyUnicode_DecodeMBCS(const char *s, 6838 Py_ssize_t size, 6839 const char *errors) 6840{ 6841 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 6842} 6843 6844static DWORD 6845encode_code_page_flags(UINT code_page, const char *errors) 6846{ 6847 if (code_page == CP_UTF8) { 6848 if (winver.dwMajorVersion >= 6) 6849 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 6850 and later */ 6851 return WC_ERR_INVALID_CHARS; 6852 else 6853 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 6854 return 0; 6855 } 6856 else if (code_page == CP_UTF7) { 6857 /* CP_UTF7 only supports flags=0 */ 6858 return 0; 6859 } 6860 else { 6861 if (errors != NULL && strcmp(errors, "replace") == 0) 6862 return 0; 6863 else 6864 return WC_NO_BEST_FIT_CHARS; 6865 } 6866} 6867 6868/* 6869 * Encode a 
Unicode string to a Windows code page into a byte string in strict 6870 * mode. 6871 * 6872 * Returns consumed characters if succeed, returns -2 on encode error, or raise 6873 * a WindowsError and returns -1 on other error. 6874 */ 6875static int 6876encode_code_page_strict(UINT code_page, PyObject **outbytes, 6877 PyObject *unicode, Py_ssize_t offset, int len, 6878 const char* errors) 6879{ 6880 BOOL usedDefaultChar = FALSE; 6881 BOOL *pusedDefaultChar = &usedDefaultChar; 6882 int outsize; 6883 PyObject *exc = NULL; 6884 wchar_t *p; 6885 Py_ssize_t size; 6886 const DWORD flags = encode_code_page_flags(code_page, NULL); 6887 char *out; 6888 /* Create a substring so that we can get the UTF-16 representation 6889 of just the slice under consideration. */ 6890 PyObject *substring; 6891 6892 assert(len > 0); 6893 6894 if (code_page != CP_UTF8 && code_page != CP_UTF7) 6895 pusedDefaultChar = &usedDefaultChar; 6896 else 6897 pusedDefaultChar = NULL; 6898 6899 substring = PyUnicode_Substring(unicode, offset, offset+len); 6900 if (substring == NULL) 6901 return -1; 6902 p = PyUnicode_AsUnicodeAndSize(substring, &size); 6903 if (p == NULL) { 6904 Py_DECREF(substring); 6905 return -1; 6906 } 6907 6908 /* First get the size of the result */ 6909 outsize = WideCharToMultiByte(code_page, flags, 6910 p, size, 6911 NULL, 0, 6912 NULL, pusedDefaultChar); 6913 if (outsize <= 0) 6914 goto error; 6915 /* If we used a default char, then we failed! 
*/ 6916 if (pusedDefaultChar && *pusedDefaultChar) { 6917 Py_DECREF(substring); 6918 return -2; 6919 } 6920 6921 if (*outbytes == NULL) { 6922 /* Create string object */ 6923 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 6924 if (*outbytes == NULL) { 6925 Py_DECREF(substring); 6926 return -1; 6927 } 6928 out = PyBytes_AS_STRING(*outbytes); 6929 } 6930 else { 6931 /* Extend string object */ 6932 const Py_ssize_t n = PyBytes_Size(*outbytes); 6933 if (outsize > PY_SSIZE_T_MAX - n) { 6934 PyErr_NoMemory(); 6935 Py_DECREF(substring); 6936 return -1; 6937 } 6938 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 6939 Py_DECREF(substring); 6940 return -1; 6941 } 6942 out = PyBytes_AS_STRING(*outbytes) + n; 6943 } 6944 6945 /* Do the conversion */ 6946 outsize = WideCharToMultiByte(code_page, flags, 6947 p, size, 6948 out, outsize, 6949 NULL, pusedDefaultChar); 6950 Py_CLEAR(substring); 6951 if (outsize <= 0) 6952 goto error; 6953 if (pusedDefaultChar && *pusedDefaultChar) 6954 return -2; 6955 return 0; 6956 6957error: 6958 Py_XDECREF(substring); 6959 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6960 return -2; 6961 PyErr_SetFromWindowsErr(0); 6962 return -1; 6963} 6964 6965/* 6966 * Encode a Unicode string to a Windows code page into a byte string using a 6967 * error handler. 6968 * 6969 * Returns consumed characters if succeed, or raise a WindowsError and returns 6970 * -1 on other error. 6971 */ 6972static int 6973encode_code_page_errors(UINT code_page, PyObject **outbytes, 6974 PyObject *unicode, Py_ssize_t unicode_offset, 6975 Py_ssize_t insize, const char* errors) 6976{ 6977 const DWORD flags = encode_code_page_flags(code_page, errors); 6978 Py_ssize_t pos = unicode_offset; 6979 Py_ssize_t endin = unicode_offset + insize; 6980 /* Ideally, we should get reason from FormatMessage. This is the Windows 6981 2000 English version of the message. 
*/ 6982 const char *reason = "invalid character"; 6983 /* 4=maximum length of a UTF-8 sequence */ 6984 char buffer[4]; 6985 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 6986 Py_ssize_t outsize; 6987 char *out; 6988 PyObject *errorHandler = NULL; 6989 PyObject *exc = NULL; 6990 PyObject *encoding_obj = NULL; 6991 char *encoding; 6992 Py_ssize_t newpos, newoutsize; 6993 PyObject *rep; 6994 int ret = -1; 6995 6996 assert(insize > 0); 6997 6998 encoding = code_page_name(code_page, &encoding_obj); 6999 if (encoding == NULL) 7000 return -1; 7001 7002 if (errors == NULL || strcmp(errors, "strict") == 0) { 7003 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7004 then we raise a UnicodeEncodeError. */ 7005 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7006 if (exc != NULL) { 7007 PyCodec_StrictErrors(exc); 7008 Py_DECREF(exc); 7009 } 7010 Py_XDECREF(encoding_obj); 7011 return -1; 7012 } 7013 7014 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7015 pusedDefaultChar = &usedDefaultChar; 7016 else 7017 pusedDefaultChar = NULL; 7018 7019 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7020 PyErr_NoMemory(); 7021 goto error; 7022 } 7023 outsize = insize * Py_ARRAY_LENGTH(buffer); 7024 7025 if (*outbytes == NULL) { 7026 /* Create string object */ 7027 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7028 if (*outbytes == NULL) 7029 goto error; 7030 out = PyBytes_AS_STRING(*outbytes); 7031 } 7032 else { 7033 /* Extend string object */ 7034 Py_ssize_t n = PyBytes_Size(*outbytes); 7035 if (n > PY_SSIZE_T_MAX - outsize) { 7036 PyErr_NoMemory(); 7037 goto error; 7038 } 7039 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7040 goto error; 7041 out = PyBytes_AS_STRING(*outbytes) + n; 7042 } 7043 7044 /* Encode the string character per character */ 7045 while (pos < endin) 7046 { 7047 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7048 wchar_t chars[2]; 7049 int charsize; 7050 if (ch < 0x10000) { 7051 chars[0] = (wchar_t)ch; 7052 charsize = 1; 
7053 } 7054 else { 7055 ch -= 0x10000; 7056 chars[0] = 0xd800 + (ch >> 10); 7057 chars[1] = 0xdc00 + (ch & 0x3ff); 7058 charsize = 2; 7059 } 7060 7061 outsize = WideCharToMultiByte(code_page, flags, 7062 chars, charsize, 7063 buffer, Py_ARRAY_LENGTH(buffer), 7064 NULL, pusedDefaultChar); 7065 if (outsize > 0) { 7066 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7067 { 7068 pos++; 7069 memcpy(out, buffer, outsize); 7070 out += outsize; 7071 continue; 7072 } 7073 } 7074 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7075 PyErr_SetFromWindowsErr(0); 7076 goto error; 7077 } 7078 7079 rep = unicode_encode_call_errorhandler( 7080 errors, &errorHandler, encoding, reason, 7081 unicode, &exc, 7082 pos, pos + 1, &newpos); 7083 if (rep == NULL) 7084 goto error; 7085 pos = newpos; 7086 7087 if (PyBytes_Check(rep)) { 7088 outsize = PyBytes_GET_SIZE(rep); 7089 if (outsize != 1) { 7090 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7091 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7092 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7093 Py_DECREF(rep); 7094 goto error; 7095 } 7096 out = PyBytes_AS_STRING(*outbytes) + offset; 7097 } 7098 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7099 out += outsize; 7100 } 7101 else { 7102 Py_ssize_t i; 7103 enum PyUnicode_Kind kind; 7104 void *data; 7105 7106 if (PyUnicode_READY(rep) == -1) { 7107 Py_DECREF(rep); 7108 goto error; 7109 } 7110 7111 outsize = PyUnicode_GET_LENGTH(rep); 7112 if (outsize != 1) { 7113 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7114 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7115 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7116 Py_DECREF(rep); 7117 goto error; 7118 } 7119 out = PyBytes_AS_STRING(*outbytes) + offset; 7120 } 7121 kind = PyUnicode_KIND(rep); 7122 data = PyUnicode_DATA(rep); 7123 for (i=0; i < outsize; i++) { 7124 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7125 if (ch > 127) { 7126 raise_encode_exception(&exc, 7127 encoding, 
unicode, 7128 pos, pos + 1, 7129 "unable to encode error handler result to ASCII"); 7130 Py_DECREF(rep); 7131 goto error; 7132 } 7133 *out = (unsigned char)ch; 7134 out++; 7135 } 7136 } 7137 Py_DECREF(rep); 7138 } 7139 /* write a NUL byte */ 7140 *out = 0; 7141 outsize = out - PyBytes_AS_STRING(*outbytes); 7142 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7143 if (_PyBytes_Resize(outbytes, outsize) < 0) 7144 goto error; 7145 ret = 0; 7146 7147error: 7148 Py_XDECREF(encoding_obj); 7149 Py_XDECREF(errorHandler); 7150 Py_XDECREF(exc); 7151 return ret; 7152} 7153 7154static PyObject * 7155encode_code_page(int code_page, 7156 PyObject *unicode, 7157 const char *errors) 7158{ 7159 Py_ssize_t len; 7160 PyObject *outbytes = NULL; 7161 Py_ssize_t offset; 7162 int chunk_len, ret, done; 7163 7164 if (PyUnicode_READY(unicode) == -1) 7165 return NULL; 7166 len = PyUnicode_GET_LENGTH(unicode); 7167 7168 if (code_page < 0) { 7169 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7170 return NULL; 7171 } 7172 7173 if (len == 0) 7174 return PyBytes_FromStringAndSize(NULL, 0); 7175 7176 offset = 0; 7177 do 7178 { 7179#ifdef NEED_RETRY 7180 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7181 chunks. 
*/ 7182 if (len > INT_MAX/2) { 7183 chunk_len = INT_MAX/2; 7184 done = 0; 7185 } 7186 else 7187#endif 7188 { 7189 chunk_len = (int)len; 7190 done = 1; 7191 } 7192 7193 ret = encode_code_page_strict(code_page, &outbytes, 7194 unicode, offset, chunk_len, 7195 errors); 7196 if (ret == -2) 7197 ret = encode_code_page_errors(code_page, &outbytes, 7198 unicode, offset, 7199 chunk_len, errors); 7200 if (ret < 0) { 7201 Py_XDECREF(outbytes); 7202 return NULL; 7203 } 7204 7205 offset += chunk_len; 7206 len -= chunk_len; 7207 } while (!done); 7208 7209 return outbytes; 7210} 7211 7212PyObject * 7213PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7214 Py_ssize_t size, 7215 const char *errors) 7216{ 7217 PyObject *unicode, *res; 7218 unicode = PyUnicode_FromUnicode(p, size); 7219 if (unicode == NULL) 7220 return NULL; 7221 res = encode_code_page(CP_ACP, unicode, errors); 7222 Py_DECREF(unicode); 7223 return res; 7224} 7225 7226PyObject * 7227PyUnicode_EncodeCodePage(int code_page, 7228 PyObject *unicode, 7229 const char *errors) 7230{ 7231 return encode_code_page(code_page, unicode, errors); 7232} 7233 7234PyObject * 7235PyUnicode_AsMBCSString(PyObject *unicode) 7236{ 7237 if (!PyUnicode_Check(unicode)) { 7238 PyErr_BadArgument(); 7239 return NULL; 7240 } 7241 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7242} 7243 7244#undef NEED_RETRY 7245 7246#endif /* HAVE_MBCS */ 7247 7248/* --- Character Mapping Codec -------------------------------------------- */ 7249 7250PyObject * 7251PyUnicode_DecodeCharmap(const char *s, 7252 Py_ssize_t size, 7253 PyObject *mapping, 7254 const char *errors) 7255{ 7256 const char *starts = s; 7257 Py_ssize_t startinpos; 7258 Py_ssize_t endinpos; 7259 Py_ssize_t outpos; 7260 const char *e; 7261 PyObject *v; 7262 Py_ssize_t extrachars = 0; 7263 PyObject *errorHandler = NULL; 7264 PyObject *exc = NULL; 7265 7266 /* Default to Latin-1 */ 7267 if (mapping == NULL) 7268 return PyUnicode_DecodeLatin1(s, size, errors); 7269 7270 v = PyUnicode_New(size, 
127); 7271 if (v == NULL) 7272 goto onError; 7273 if (size == 0) 7274 return v; 7275 outpos = 0; 7276 e = s + size; 7277 if (PyUnicode_CheckExact(mapping)) { 7278 Py_ssize_t maplen; 7279 enum PyUnicode_Kind mapkind; 7280 void *mapdata; 7281 Py_UCS4 x; 7282 7283 if (PyUnicode_READY(mapping) == -1) 7284 return NULL; 7285 7286 maplen = PyUnicode_GET_LENGTH(mapping); 7287 mapdata = PyUnicode_DATA(mapping); 7288 mapkind = PyUnicode_KIND(mapping); 7289 while (s < e) { 7290 unsigned char ch; 7291 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7292 enum PyUnicode_Kind outkind = PyUnicode_KIND(v); 7293 if (outkind == PyUnicode_1BYTE_KIND) { 7294 void *outdata = PyUnicode_DATA(v); 7295 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v); 7296 while (s < e) { 7297 unsigned char ch = *s; 7298 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); 7299 if (x > maxchar) 7300 goto Error; 7301 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x); 7302 ++s; 7303 } 7304 break; 7305 } 7306 else if (outkind == PyUnicode_2BYTE_KIND) { 7307 void *outdata = PyUnicode_DATA(v); 7308 while (s < e) { 7309 unsigned char ch = *s; 7310 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); 7311 if (x == 0xFFFE) 7312 goto Error; 7313 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x); 7314 ++s; 7315 } 7316 break; 7317 } 7318 } 7319 ch = *s; 7320 7321 if (ch < maplen) 7322 x = PyUnicode_READ(mapkind, mapdata, ch); 7323 else 7324 x = 0xfffe; /* invalid value */ 7325Error: 7326 if (x == 0xfffe) 7327 { 7328 /* undefined mapping */ 7329 startinpos = s-starts; 7330 endinpos = startinpos+1; 7331 if (unicode_decode_call_errorhandler( 7332 errors, &errorHandler, 7333 "charmap", "character maps to <undefined>", 7334 &starts, &e, &startinpos, &endinpos, &exc, &s, 7335 &v, &outpos)) { 7336 goto onError; 7337 } 7338 continue; 7339 } 7340 7341 if (unicode_putchar(&v, &outpos, x) < 0) 7342 goto onError; 7343 ++s; 7344 } 7345 } 7346 else { 7347 while (s < e) { 7348 unsigned char ch = *s; 7349 
PyObject *w, *x; 7350 7351 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7352 w = PyLong_FromLong((long)ch); 7353 if (w == NULL) 7354 goto onError; 7355 x = PyObject_GetItem(mapping, w); 7356 Py_DECREF(w); 7357 if (x == NULL) { 7358 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7359 /* No mapping found means: mapping is undefined. */ 7360 PyErr_Clear(); 7361 x = Py_None; 7362 Py_INCREF(x); 7363 } else 7364 goto onError; 7365 } 7366 7367 /* Apply mapping */ 7368 if (PyLong_Check(x)) { 7369 long value = PyLong_AS_LONG(x); 7370 if (value < 0 || value > MAX_UNICODE) { 7371 PyErr_Format(PyExc_TypeError, 7372 "character mapping must be in range(0x%lx)", 7373 (unsigned long)MAX_UNICODE + 1); 7374 Py_DECREF(x); 7375 goto onError; 7376 } 7377 if (unicode_putchar(&v, &outpos, value) < 0) 7378 goto onError; 7379 } 7380 else if (x == Py_None) { 7381 /* undefined mapping */ 7382 startinpos = s-starts; 7383 endinpos = startinpos+1; 7384 if (unicode_decode_call_errorhandler( 7385 errors, &errorHandler, 7386 "charmap", "character maps to <undefined>", 7387 &starts, &e, &startinpos, &endinpos, &exc, &s, 7388 &v, &outpos)) { 7389 Py_DECREF(x); 7390 goto onError; 7391 } 7392 Py_DECREF(x); 7393 continue; 7394 } 7395 else if (PyUnicode_Check(x)) { 7396 Py_ssize_t targetsize; 7397 7398 if (PyUnicode_READY(x) == -1) 7399 goto onError; 7400 targetsize = PyUnicode_GET_LENGTH(x); 7401 7402 if (targetsize == 1) { 7403 /* 1-1 mapping */ 7404 if (unicode_putchar(&v, &outpos, 7405 PyUnicode_READ_CHAR(x, 0)) < 0) 7406 goto onError; 7407 } 7408 else if (targetsize > 1) { 7409 /* 1-n mapping */ 7410 if (targetsize > extrachars) { 7411 /* resize first */ 7412 Py_ssize_t needed = (targetsize - extrachars) + \ 7413 (targetsize << 2); 7414 extrachars += needed; 7415 /* XXX overflow detection missing */ 7416 if (unicode_resize(&v, 7417 PyUnicode_GET_LENGTH(v) + needed) < 0) 7418 { 7419 Py_DECREF(x); 7420 goto onError; 7421 } 7422 } 7423 if (unicode_widen(&v, outpos, 
PyUnicode_MAX_CHAR_VALUE(x)) < 0) 7424 goto onError; 7425 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize); 7426 outpos += targetsize; 7427 extrachars -= targetsize; 7428 } 7429 /* 1-0 mapping: skip the character */ 7430 } 7431 else { 7432 /* wrong return value */ 7433 PyErr_SetString(PyExc_TypeError, 7434 "character mapping must return integer, None or str"); 7435 Py_DECREF(x); 7436 goto onError; 7437 } 7438 Py_DECREF(x); 7439 ++s; 7440 } 7441 } 7442 if (unicode_resize(&v, outpos) < 0) 7443 goto onError; 7444 Py_XDECREF(errorHandler); 7445 Py_XDECREF(exc); 7446 return unicode_result(v); 7447 7448 onError: 7449 Py_XDECREF(errorHandler); 7450 Py_XDECREF(exc); 7451 Py_XDECREF(v); 7452 return NULL; 7453} 7454 7455/* Charmap encoding: the lookup table */ 7456 7457struct encoding_map { 7458 PyObject_HEAD 7459 unsigned char level1[32]; 7460 int count2, count3; 7461 unsigned char level23[1]; 7462}; 7463 7464static PyObject* 7465encoding_map_size(PyObject *obj, PyObject* args) 7466{ 7467 struct encoding_map *map = (struct encoding_map*)obj; 7468 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7469 128*map->count3); 7470} 7471 7472static PyMethodDef encoding_map_methods[] = { 7473 {"size", encoding_map_size, METH_NOARGS, 7474 PyDoc_STR("Return the size (in bytes) of this object") }, 7475 { 0 } 7476}; 7477 7478static void 7479encoding_map_dealloc(PyObject* o) 7480{ 7481 PyObject_FREE(o); 7482} 7483 7484static PyTypeObject EncodingMapType = { 7485 PyVarObject_HEAD_INIT(NULL, 0) 7486 "EncodingMap", /*tp_name*/ 7487 sizeof(struct encoding_map), /*tp_basicsize*/ 7488 0, /*tp_itemsize*/ 7489 /* methods */ 7490 encoding_map_dealloc, /*tp_dealloc*/ 7491 0, /*tp_print*/ 7492 0, /*tp_getattr*/ 7493 0, /*tp_setattr*/ 7494 0, /*tp_reserved*/ 7495 0, /*tp_repr*/ 7496 0, /*tp_as_number*/ 7497 0, /*tp_as_sequence*/ 7498 0, /*tp_as_mapping*/ 7499 0, /*tp_hash*/ 7500 0, /*tp_call*/ 7501 0, /*tp_str*/ 7502 0, /*tp_getattro*/ 7503 0, /*tp_setattro*/ 7504 0, /*tp_as_buffer*/ 
7505 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7506 0, /*tp_doc*/ 7507 0, /*tp_traverse*/ 7508 0, /*tp_clear*/ 7509 0, /*tp_richcompare*/ 7510 0, /*tp_weaklistoffset*/ 7511 0, /*tp_iter*/ 7512 0, /*tp_iternext*/ 7513 encoding_map_methods, /*tp_methods*/ 7514 0, /*tp_members*/ 7515 0, /*tp_getset*/ 7516 0, /*tp_base*/ 7517 0, /*tp_dict*/ 7518 0, /*tp_descr_get*/ 7519 0, /*tp_descr_set*/ 7520 0, /*tp_dictoffset*/ 7521 0, /*tp_init*/ 7522 0, /*tp_alloc*/ 7523 0, /*tp_new*/ 7524 0, /*tp_free*/ 7525 0, /*tp_is_gc*/ 7526}; 7527 7528PyObject* 7529PyUnicode_BuildEncodingMap(PyObject* string) 7530{ 7531 PyObject *result; 7532 struct encoding_map *mresult; 7533 int i; 7534 int need_dict = 0; 7535 unsigned char level1[32]; 7536 unsigned char level2[512]; 7537 unsigned char *mlevel1, *mlevel2, *mlevel3; 7538 int count2 = 0, count3 = 0; 7539 int kind; 7540 void *data; 7541 Py_ssize_t length; 7542 Py_UCS4 ch; 7543 7544 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 7545 PyErr_BadArgument(); 7546 return NULL; 7547 } 7548 kind = PyUnicode_KIND(string); 7549 data = PyUnicode_DATA(string); 7550 length = PyUnicode_GET_LENGTH(string); 7551 length = Py_MIN(length, 256); 7552 memset(level1, 0xFF, sizeof level1); 7553 memset(level2, 0xFF, sizeof level2); 7554 7555 /* If there isn't a one-to-one mapping of NULL to \0, 7556 or if there are non-BMP characters, we need to use 7557 a mapping dictionary. 
*/ 7558 if (PyUnicode_READ(kind, data, 0) != 0) 7559 need_dict = 1; 7560 for (i = 1; i < length; i++) { 7561 int l1, l2; 7562 ch = PyUnicode_READ(kind, data, i); 7563 if (ch == 0 || ch > 0xFFFF) { 7564 need_dict = 1; 7565 break; 7566 } 7567 if (ch == 0xFFFE) 7568 /* unmapped character */ 7569 continue; 7570 l1 = ch >> 11; 7571 l2 = ch >> 7; 7572 if (level1[l1] == 0xFF) 7573 level1[l1] = count2++; 7574 if (level2[l2] == 0xFF) 7575 level2[l2] = count3++; 7576 } 7577 7578 if (count2 >= 0xFF || count3 >= 0xFF) 7579 need_dict = 1; 7580 7581 if (need_dict) { 7582 PyObject *result = PyDict_New(); 7583 PyObject *key, *value; 7584 if (!result) 7585 return NULL; 7586 for (i = 0; i < length; i++) { 7587 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7588 value = PyLong_FromLong(i); 7589 if (!key || !value) 7590 goto failed1; 7591 if (PyDict_SetItem(result, key, value) == -1) 7592 goto failed1; 7593 Py_DECREF(key); 7594 Py_DECREF(value); 7595 } 7596 return result; 7597 failed1: 7598 Py_XDECREF(key); 7599 Py_XDECREF(value); 7600 Py_DECREF(result); 7601 return NULL; 7602 } 7603 7604 /* Create a three-level trie */ 7605 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7606 16*count2 + 128*count3 - 1); 7607 if (!result) 7608 return PyErr_NoMemory(); 7609 PyObject_Init(result, &EncodingMapType); 7610 mresult = (struct encoding_map*)result; 7611 mresult->count2 = count2; 7612 mresult->count3 = count3; 7613 mlevel1 = mresult->level1; 7614 mlevel2 = mresult->level23; 7615 mlevel3 = mresult->level23 + 16*count2; 7616 memcpy(mlevel1, level1, 32); 7617 memset(mlevel2, 0xFF, 16*count2); 7618 memset(mlevel3, 0, 128*count3); 7619 count3 = 0; 7620 for (i = 1; i < length; i++) { 7621 int o1, o2, o3, i2, i3; 7622 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7623 if (ch == 0xFFFE) 7624 /* unmapped character */ 7625 continue; 7626 o1 = ch>>11; 7627 o2 = (ch>>7) & 0xF; 7628 i2 = 16*mlevel1[o1] + o2; 7629 if (mlevel2[i2] == 0xFF) 7630 mlevel2[i2] = count3++; 7631 o3 = ch & 0x7F; 7632 
i3 = 128*mlevel2[i2] + o3; 7633 mlevel3[i3] = i; 7634 } 7635 return result; 7636} 7637 7638static int 7639encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7640{ 7641 struct encoding_map *map = (struct encoding_map*)mapping; 7642 int l1 = c>>11; 7643 int l2 = (c>>7) & 0xF; 7644 int l3 = c & 0x7F; 7645 int i; 7646 7647 if (c > 0xFFFF) 7648 return -1; 7649 if (c == 0) 7650 return 0; 7651 /* level 1*/ 7652 i = map->level1[l1]; 7653 if (i == 0xFF) { 7654 return -1; 7655 } 7656 /* level 2*/ 7657 i = map->level23[16*i+l2]; 7658 if (i == 0xFF) { 7659 return -1; 7660 } 7661 /* level 3 */ 7662 i = map->level23[16*map->count2 + 128*i + l3]; 7663 if (i == 0) { 7664 return -1; 7665 } 7666 return i; 7667} 7668 7669/* Lookup the character ch in the mapping. If the character 7670 can't be found, Py_None is returned (or NULL, if another 7671 error occurred). */ 7672static PyObject * 7673charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 7674{ 7675 PyObject *w = PyLong_FromLong((long)c); 7676 PyObject *x; 7677 7678 if (w == NULL) 7679 return NULL; 7680 x = PyObject_GetItem(mapping, w); 7681 Py_DECREF(w); 7682 if (x == NULL) { 7683 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7684 /* No mapping found means: mapping is undefined. 
*/ 7685 PyErr_Clear(); 7686 x = Py_None; 7687 Py_INCREF(x); 7688 return x; 7689 } else 7690 return NULL; 7691 } 7692 else if (x == Py_None) 7693 return x; 7694 else if (PyLong_Check(x)) { 7695 long value = PyLong_AS_LONG(x); 7696 if (value < 0 || value > 255) { 7697 PyErr_SetString(PyExc_TypeError, 7698 "character mapping must be in range(256)"); 7699 Py_DECREF(x); 7700 return NULL; 7701 } 7702 return x; 7703 } 7704 else if (PyBytes_Check(x)) 7705 return x; 7706 else { 7707 /* wrong return value */ 7708 PyErr_Format(PyExc_TypeError, 7709 "character mapping must return integer, bytes or None, not %.400s", 7710 x->ob_type->tp_name); 7711 Py_DECREF(x); 7712 return NULL; 7713 } 7714} 7715 7716static int 7717charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7718{ 7719 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7720 /* exponentially overallocate to minimize reallocations */ 7721 if (requiredsize < 2*outsize) 7722 requiredsize = 2*outsize; 7723 if (_PyBytes_Resize(outobj, requiredsize)) 7724 return -1; 7725 return 0; 7726} 7727 7728typedef enum charmapencode_result { 7729 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7730} charmapencode_result; 7731/* lookup the character, put the result in the output string and adjust 7732 various state variables. Resize the output bytes object if not enough 7733 space is available. Return a new reference to the object that 7734 was put in the output buffer, or Py_None, if the mapping was undefined 7735 (in which case no character was written) or NULL, if a 7736 reallocation error occurred. 
The caller must decref the result */ 7737static charmapencode_result 7738charmapencode_output(Py_UCS4 c, PyObject *mapping, 7739 PyObject **outobj, Py_ssize_t *outpos) 7740{ 7741 PyObject *rep; 7742 char *outstart; 7743 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7744 7745 if (Py_TYPE(mapping) == &EncodingMapType) { 7746 int res = encoding_map_lookup(c, mapping); 7747 Py_ssize_t requiredsize = *outpos+1; 7748 if (res == -1) 7749 return enc_FAILED; 7750 if (outsize<requiredsize) 7751 if (charmapencode_resize(outobj, outpos, requiredsize)) 7752 return enc_EXCEPTION; 7753 outstart = PyBytes_AS_STRING(*outobj); 7754 outstart[(*outpos)++] = (char)res; 7755 return enc_SUCCESS; 7756 } 7757 7758 rep = charmapencode_lookup(c, mapping); 7759 if (rep==NULL) 7760 return enc_EXCEPTION; 7761 else if (rep==Py_None) { 7762 Py_DECREF(rep); 7763 return enc_FAILED; 7764 } else { 7765 if (PyLong_Check(rep)) { 7766 Py_ssize_t requiredsize = *outpos+1; 7767 if (outsize<requiredsize) 7768 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7769 Py_DECREF(rep); 7770 return enc_EXCEPTION; 7771 } 7772 outstart = PyBytes_AS_STRING(*outobj); 7773 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7774 } 7775 else { 7776 const char *repchars = PyBytes_AS_STRING(rep); 7777 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7778 Py_ssize_t requiredsize = *outpos+repsize; 7779 if (outsize<requiredsize) 7780 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7781 Py_DECREF(rep); 7782 return enc_EXCEPTION; 7783 } 7784 outstart = PyBytes_AS_STRING(*outobj); 7785 memcpy(outstart + *outpos, repchars, repsize); 7786 *outpos += repsize; 7787 } 7788 } 7789 Py_DECREF(rep); 7790 return enc_SUCCESS; 7791} 7792 7793/* handle an error in PyUnicode_EncodeCharmap 7794 Return 0 on success, -1 on error */ 7795static int 7796charmap_encoding_error( 7797 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 7798 PyObject **exceptionObject, 7799 int *known_errorHandler, PyObject **errorHandler, const 
char *errors, 7800 PyObject **res, Py_ssize_t *respos) 7801{ 7802 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7803 Py_ssize_t size, repsize; 7804 Py_ssize_t newpos; 7805 enum PyUnicode_Kind kind; 7806 void *data; 7807 Py_ssize_t index; 7808 /* startpos for collecting unencodable chars */ 7809 Py_ssize_t collstartpos = *inpos; 7810 Py_ssize_t collendpos = *inpos+1; 7811 Py_ssize_t collpos; 7812 char *encoding = "charmap"; 7813 char *reason = "character maps to <undefined>"; 7814 charmapencode_result x; 7815 Py_UCS4 ch; 7816 int val; 7817 7818 if (PyUnicode_READY(unicode) == -1) 7819 return -1; 7820 size = PyUnicode_GET_LENGTH(unicode); 7821 /* find all unencodable characters */ 7822 while (collendpos < size) { 7823 PyObject *rep; 7824 if (Py_TYPE(mapping) == &EncodingMapType) { 7825 ch = PyUnicode_READ_CHAR(unicode, collendpos); 7826 val = encoding_map_lookup(ch, mapping); 7827 if (val != -1) 7828 break; 7829 ++collendpos; 7830 continue; 7831 } 7832 7833 ch = PyUnicode_READ_CHAR(unicode, collendpos); 7834 rep = charmapencode_lookup(ch, mapping); 7835 if (rep==NULL) 7836 return -1; 7837 else if (rep!=Py_None) { 7838 Py_DECREF(rep); 7839 break; 7840 } 7841 Py_DECREF(rep); 7842 ++collendpos; 7843 } 7844 /* cache callback name lookup 7845 * (if not done yet, i.e. 
it's the first error) */ 7846 if (*known_errorHandler==-1) { 7847 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7848 *known_errorHandler = 1; 7849 else if (!strcmp(errors, "replace")) 7850 *known_errorHandler = 2; 7851 else if (!strcmp(errors, "ignore")) 7852 *known_errorHandler = 3; 7853 else if (!strcmp(errors, "xmlcharrefreplace")) 7854 *known_errorHandler = 4; 7855 else 7856 *known_errorHandler = 0; 7857 } 7858 switch (*known_errorHandler) { 7859 case 1: /* strict */ 7860 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 7861 return -1; 7862 case 2: /* replace */ 7863 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 7864 x = charmapencode_output('?', mapping, res, respos); 7865 if (x==enc_EXCEPTION) { 7866 return -1; 7867 } 7868 else if (x==enc_FAILED) { 7869 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 7870 return -1; 7871 } 7872 } 7873 /* fall through */ 7874 case 3: /* ignore */ 7875 *inpos = collendpos; 7876 break; 7877 case 4: /* xmlcharrefreplace */ 7878 /* generate replacement (temporarily (mis)uses p) */ 7879 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 7880 char buffer[2+29+1+1]; 7881 char *cp; 7882 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 7883 for (cp = buffer; *cp; ++cp) { 7884 x = charmapencode_output(*cp, mapping, res, respos); 7885 if (x==enc_EXCEPTION) 7886 return -1; 7887 else if (x==enc_FAILED) { 7888 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 7889 return -1; 7890 } 7891 } 7892 } 7893 *inpos = collendpos; 7894 break; 7895 default: 7896 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 7897 encoding, reason, unicode, exceptionObject, 7898 collstartpos, collendpos, &newpos); 7899 if (repunicode == NULL) 7900 return -1; 7901 if (PyBytes_Check(repunicode)) { 7902 /* Directly copy bytes result to output. 
*/ 7903 Py_ssize_t outsize = PyBytes_Size(*res); 7904 Py_ssize_t requiredsize; 7905 repsize = PyBytes_Size(repunicode); 7906 requiredsize = *respos + repsize; 7907 if (requiredsize > outsize) 7908 /* Make room for all additional bytes. */ 7909 if (charmapencode_resize(res, respos, requiredsize)) { 7910 Py_DECREF(repunicode); 7911 return -1; 7912 } 7913 memcpy(PyBytes_AsString(*res) + *respos, 7914 PyBytes_AsString(repunicode), repsize); 7915 *respos += repsize; 7916 *inpos = newpos; 7917 Py_DECREF(repunicode); 7918 break; 7919 } 7920 /* generate replacement */ 7921 if (PyUnicode_READY(repunicode) == -1) { 7922 Py_DECREF(repunicode); 7923 return -1; 7924 } 7925 repsize = PyUnicode_GET_LENGTH(repunicode); 7926 data = PyUnicode_DATA(repunicode); 7927 kind = PyUnicode_KIND(repunicode); 7928 for (index = 0; index < repsize; index++) { 7929 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 7930 x = charmapencode_output(repch, mapping, res, respos); 7931 if (x==enc_EXCEPTION) { 7932 Py_DECREF(repunicode); 7933 return -1; 7934 } 7935 else if (x==enc_FAILED) { 7936 Py_DECREF(repunicode); 7937 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 7938 return -1; 7939 } 7940 } 7941 *inpos = newpos; 7942 Py_DECREF(repunicode); 7943 } 7944 return 0; 7945} 7946 7947PyObject * 7948_PyUnicode_EncodeCharmap(PyObject *unicode, 7949 PyObject *mapping, 7950 const char *errors) 7951{ 7952 /* output object */ 7953 PyObject *res = NULL; 7954 /* current input position */ 7955 Py_ssize_t inpos = 0; 7956 Py_ssize_t size; 7957 /* current output position */ 7958 Py_ssize_t respos = 0; 7959 PyObject *errorHandler = NULL; 7960 PyObject *exc = NULL; 7961 /* the following variable is used for caching string comparisons 7962 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7963 * 3=ignore, 4=xmlcharrefreplace */ 7964 int known_errorHandler = -1; 7965 7966 if (PyUnicode_READY(unicode) == -1) 7967 return NULL; 7968 size = PyUnicode_GET_LENGTH(unicode); 
7969 7970 /* Default to Latin-1 */ 7971 if (mapping == NULL) 7972 return unicode_encode_ucs1(unicode, errors, 256); 7973 7974 /* allocate enough for a simple encoding without 7975 replacements, if we need more, we'll resize */ 7976 res = PyBytes_FromStringAndSize(NULL, size); 7977 if (res == NULL) 7978 goto onError; 7979 if (size == 0) 7980 return res; 7981 7982 while (inpos<size) { 7983 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos); 7984 /* try to encode it */ 7985 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 7986 if (x==enc_EXCEPTION) /* error */ 7987 goto onError; 7988 if (x==enc_FAILED) { /* unencodable character */ 7989 if (charmap_encoding_error(unicode, &inpos, mapping, 7990 &exc, 7991 &known_errorHandler, &errorHandler, errors, 7992 &res, &respos)) { 7993 goto onError; 7994 } 7995 } 7996 else 7997 /* done with this character => adjust input position */ 7998 ++inpos; 7999 } 8000 8001 /* Resize if we allocated to much */ 8002 if (respos<PyBytes_GET_SIZE(res)) 8003 if (_PyBytes_Resize(&res, respos) < 0) 8004 goto onError; 8005 8006 Py_XDECREF(exc); 8007 Py_XDECREF(errorHandler); 8008 return res; 8009 8010 onError: 8011 Py_XDECREF(res); 8012 Py_XDECREF(exc); 8013 Py_XDECREF(errorHandler); 8014 return NULL; 8015} 8016 8017/* Deprecated */ 8018PyObject * 8019PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8020 Py_ssize_t size, 8021 PyObject *mapping, 8022 const char *errors) 8023{ 8024 PyObject *result; 8025 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8026 if (unicode == NULL) 8027 return NULL; 8028 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8029 Py_DECREF(unicode); 8030 return result; 8031} 8032 8033PyObject * 8034PyUnicode_AsCharmapString(PyObject *unicode, 8035 PyObject *mapping) 8036{ 8037 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8038 PyErr_BadArgument(); 8039 return NULL; 8040 } 8041 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8042} 8043 8044/* create or adjust a 
UnicodeTranslateError */ 8045static void 8046make_translate_exception(PyObject **exceptionObject, 8047 PyObject *unicode, 8048 Py_ssize_t startpos, Py_ssize_t endpos, 8049 const char *reason) 8050{ 8051 if (*exceptionObject == NULL) { 8052 *exceptionObject = _PyUnicodeTranslateError_Create( 8053 unicode, startpos, endpos, reason); 8054 } 8055 else { 8056 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8057 goto onError; 8058 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8059 goto onError; 8060 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8061 goto onError; 8062 return; 8063 onError: 8064 Py_DECREF(*exceptionObject); 8065 *exceptionObject = NULL; 8066 } 8067} 8068 8069/* error handling callback helper: 8070 build arguments, call the callback and check the arguments, 8071 put the result into newpos and return the replacement string, which 8072 has to be freed by the caller */ 8073static PyObject * 8074unicode_translate_call_errorhandler(const char *errors, 8075 PyObject **errorHandler, 8076 const char *reason, 8077 PyObject *unicode, PyObject **exceptionObject, 8078 Py_ssize_t startpos, Py_ssize_t endpos, 8079 Py_ssize_t *newpos) 8080{ 8081 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8082 8083 Py_ssize_t i_newpos; 8084 PyObject *restuple; 8085 PyObject *resunicode; 8086 8087 if (*errorHandler == NULL) { 8088 *errorHandler = PyCodec_LookupError(errors); 8089 if (*errorHandler == NULL) 8090 return NULL; 8091 } 8092 8093 make_translate_exception(exceptionObject, 8094 unicode, startpos, endpos, reason); 8095 if (*exceptionObject == NULL) 8096 return NULL; 8097 8098 restuple = PyObject_CallFunctionObjArgs( 8099 *errorHandler, *exceptionObject, NULL); 8100 if (restuple == NULL) 8101 return NULL; 8102 if (!PyTuple_Check(restuple)) { 8103 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8104 Py_DECREF(restuple); 8105 return NULL; 8106 } 8107 if (!PyArg_ParseTuple(restuple, 
argparse, &PyUnicode_Type, 8108 &resunicode, &i_newpos)) { 8109 Py_DECREF(restuple); 8110 return NULL; 8111 } 8112 if (i_newpos<0) 8113 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8114 else 8115 *newpos = i_newpos; 8116 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8117 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8118 Py_DECREF(restuple); 8119 return NULL; 8120 } 8121 Py_INCREF(resunicode); 8122 Py_DECREF(restuple); 8123 return resunicode; 8124} 8125 8126/* Lookup the character ch in the mapping and put the result in result, 8127 which must be decrefed by the caller. 8128 Return 0 on success, -1 on error */ 8129static int 8130charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8131{ 8132 PyObject *w = PyLong_FromLong((long)c); 8133 PyObject *x; 8134 8135 if (w == NULL) 8136 return -1; 8137 x = PyObject_GetItem(mapping, w); 8138 Py_DECREF(w); 8139 if (x == NULL) { 8140 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8141 /* No mapping found means: use 1:1 mapping. */ 8142 PyErr_Clear(); 8143 *result = NULL; 8144 return 0; 8145 } else 8146 return -1; 8147 } 8148 else if (x == Py_None) { 8149 *result = x; 8150 return 0; 8151 } 8152 else if (PyLong_Check(x)) { 8153 long value = PyLong_AS_LONG(x); 8154 long max = PyUnicode_GetMax(); 8155 if (value < 0 || value > max) { 8156 PyErr_Format(PyExc_TypeError, 8157 "character mapping must be in range(0x%x)", max+1); 8158 Py_DECREF(x); 8159 return -1; 8160 } 8161 *result = x; 8162 return 0; 8163 } 8164 else if (PyUnicode_Check(x)) { 8165 *result = x; 8166 return 0; 8167 } 8168 else { 8169 /* wrong return value */ 8170 PyErr_SetString(PyExc_TypeError, 8171 "character mapping must return integer, None or str"); 8172 Py_DECREF(x); 8173 return -1; 8174 } 8175} 8176/* ensure that *outobj is at least requiredsize characters long, 8177 if not reallocate and adjust various state variables. 
8178 Return 0 on success, -1 on error */ 8179static int 8180charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8181 Py_ssize_t requiredsize) 8182{ 8183 Py_ssize_t oldsize = *psize; 8184 Py_UCS4 *new_outobj; 8185 if (requiredsize > oldsize) { 8186 /* exponentially overallocate to minimize reallocations */ 8187 if (requiredsize < 2 * oldsize) 8188 requiredsize = 2 * oldsize; 8189 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8190 if (new_outobj == 0) 8191 return -1; 8192 *outobj = new_outobj; 8193 *psize = requiredsize; 8194 } 8195 return 0; 8196} 8197/* lookup the character, put the result in the output string and adjust 8198 various state variables. Return a new reference to the object that 8199 was put in the output buffer in *result, or Py_None, if the mapping was 8200 undefined (in which case no character was written). 8201 The called must decref result. 8202 Return 0 on success, -1 on error. */ 8203static int 8204charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8205 PyObject *mapping, Py_UCS4 **output, 8206 Py_ssize_t *osize, Py_ssize_t *opos, 8207 PyObject **res) 8208{ 8209 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8210 if (charmaptranslate_lookup(curinp, mapping, res)) 8211 return -1; 8212 if (*res==NULL) { 8213 /* not found => default to 1:1 mapping */ 8214 (*output)[(*opos)++] = curinp; 8215 } 8216 else if (*res==Py_None) 8217 ; 8218 else if (PyLong_Check(*res)) { 8219 /* no overflow check, because we know that the space is enough */ 8220 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8221 } 8222 else if (PyUnicode_Check(*res)) { 8223 Py_ssize_t repsize; 8224 if (PyUnicode_READY(*res) == -1) 8225 return -1; 8226 repsize = PyUnicode_GET_LENGTH(*res); 8227 if (repsize==1) { 8228 /* no overflow check, because we know that the space is enough */ 8229 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8230 } 8231 else if (repsize!=0) { 8232 /* more than one character */ 8233 Py_ssize_t requiredsize = 
*opos + 8234 (PyUnicode_GET_LENGTH(input) - ipos) + 8235 repsize - 1; 8236 Py_ssize_t i; 8237 if (charmaptranslate_makespace(output, osize, requiredsize)) 8238 return -1; 8239 for(i = 0; i < repsize; i++) 8240 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8241 } 8242 } 8243 else 8244 return -1; 8245 return 0; 8246} 8247 8248PyObject * 8249_PyUnicode_TranslateCharmap(PyObject *input, 8250 PyObject *mapping, 8251 const char *errors) 8252{ 8253 /* input object */ 8254 char *idata; 8255 Py_ssize_t size, i; 8256 int kind; 8257 /* output buffer */ 8258 Py_UCS4 *output = NULL; 8259 Py_ssize_t osize; 8260 PyObject *res; 8261 /* current output position */ 8262 Py_ssize_t opos; 8263 char *reason = "character maps to <undefined>"; 8264 PyObject *errorHandler = NULL; 8265 PyObject *exc = NULL; 8266 /* the following variable is used for caching string comparisons 8267 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8268 * 3=ignore, 4=xmlcharrefreplace */ 8269 int known_errorHandler = -1; 8270 8271 if (mapping == NULL) { 8272 PyErr_BadArgument(); 8273 return NULL; 8274 } 8275 8276 if (PyUnicode_READY(input) == -1) 8277 return NULL; 8278 idata = (char*)PyUnicode_DATA(input); 8279 kind = PyUnicode_KIND(input); 8280 size = PyUnicode_GET_LENGTH(input); 8281 i = 0; 8282 8283 if (size == 0) { 8284 Py_INCREF(input); 8285 return input; 8286 } 8287 8288 /* allocate enough for a simple 1:1 translation without 8289 replacements, if we need more, we'll resize */ 8290 osize = size; 8291 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8292 opos = 0; 8293 if (output == NULL) { 8294 PyErr_NoMemory(); 8295 goto onError; 8296 } 8297 8298 while (i<size) { 8299 /* try to encode it */ 8300 PyObject *x = NULL; 8301 if (charmaptranslate_output(input, i, mapping, 8302 &output, &osize, &opos, &x)) { 8303 Py_XDECREF(x); 8304 goto onError; 8305 } 8306 Py_XDECREF(x); 8307 if (x!=Py_None) /* it worked => adjust input pointer */ 8308 ++i; 8309 else { /* untranslatable character */ 8310 PyObject 
*repunicode = NULL; /* initialize to prevent gcc warning */ 8311 Py_ssize_t repsize; 8312 Py_ssize_t newpos; 8313 Py_ssize_t uni2; 8314 /* startpos for collecting untranslatable chars */ 8315 Py_ssize_t collstart = i; 8316 Py_ssize_t collend = i+1; 8317 Py_ssize_t coll; 8318 8319 /* find all untranslatable characters */ 8320 while (collend < size) { 8321 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8322 goto onError; 8323 Py_XDECREF(x); 8324 if (x!=Py_None) 8325 break; 8326 ++collend; 8327 } 8328 /* cache callback name lookup 8329 * (if not done yet, i.e. it's the first error) */ 8330 if (known_errorHandler==-1) { 8331 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8332 known_errorHandler = 1; 8333 else if (!strcmp(errors, "replace")) 8334 known_errorHandler = 2; 8335 else if (!strcmp(errors, "ignore")) 8336 known_errorHandler = 3; 8337 else if (!strcmp(errors, "xmlcharrefreplace")) 8338 known_errorHandler = 4; 8339 else 8340 known_errorHandler = 0; 8341 } 8342 switch (known_errorHandler) { 8343 case 1: /* strict */ 8344 make_translate_exception(&exc, 8345 input, collstart, collend, reason); 8346 if (exc != NULL) 8347 PyCodec_StrictErrors(exc); 8348 goto onError; 8349 case 2: /* replace */ 8350 /* No need to check for space, this is a 1:1 replacement */ 8351 for (coll = collstart; coll<collend; coll++) 8352 output[opos++] = '?'; 8353 /* fall through */ 8354 case 3: /* ignore */ 8355 i = collend; 8356 break; 8357 case 4: /* xmlcharrefreplace */ 8358 /* generate replacement (temporarily (mis)uses i) */ 8359 for (i = collstart; i < collend; ++i) { 8360 char buffer[2+29+1+1]; 8361 char *cp; 8362 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8363 if (charmaptranslate_makespace(&output, &osize, 8364 opos+strlen(buffer)+(size-collend))) 8365 goto onError; 8366 for (cp = buffer; *cp; ++cp) 8367 output[opos++] = *cp; 8368 } 8369 i = collend; 8370 break; 8371 default: 8372 repunicode = 
unicode_translate_call_errorhandler(errors, &errorHandler, 8373 reason, input, &exc, 8374 collstart, collend, &newpos); 8375 if (repunicode == NULL) 8376 goto onError; 8377 if (PyUnicode_READY(repunicode) == -1) { 8378 Py_DECREF(repunicode); 8379 goto onError; 8380 } 8381 /* generate replacement */ 8382 repsize = PyUnicode_GET_LENGTH(repunicode); 8383 if (charmaptranslate_makespace(&output, &osize, 8384 opos+repsize+(size-collend))) { 8385 Py_DECREF(repunicode); 8386 goto onError; 8387 } 8388 for (uni2 = 0; repsize-->0; ++uni2) 8389 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8390 i = newpos; 8391 Py_DECREF(repunicode); 8392 } 8393 } 8394 } 8395 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8396 if (!res) 8397 goto onError; 8398 PyMem_Free(output); 8399 Py_XDECREF(exc); 8400 Py_XDECREF(errorHandler); 8401 return res; 8402 8403 onError: 8404 PyMem_Free(output); 8405 Py_XDECREF(exc); 8406 Py_XDECREF(errorHandler); 8407 return NULL; 8408} 8409 8410/* Deprecated. Use PyUnicode_Translate instead. */ 8411PyObject * 8412PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8413 Py_ssize_t size, 8414 PyObject *mapping, 8415 const char *errors) 8416{ 8417 PyObject *result; 8418 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8419 if (!unicode) 8420 return NULL; 8421 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8422 Py_DECREF(unicode); 8423 return result; 8424} 8425 8426PyObject * 8427PyUnicode_Translate(PyObject *str, 8428 PyObject *mapping, 8429 const char *errors) 8430{ 8431 PyObject *result; 8432 8433 str = PyUnicode_FromObject(str); 8434 if (str == NULL) 8435 return NULL; 8436 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8437 Py_DECREF(str); 8438 return result; 8439} 8440 8441static Py_UCS4 8442fix_decimal_and_space_to_ascii(PyObject *self) 8443{ 8444 /* No need to call PyUnicode_READY(self) because this function is only 8445 called as a callback from fixup() which does it already. 
*/ 8446 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8447 const int kind = PyUnicode_KIND(self); 8448 void *data = PyUnicode_DATA(self); 8449 Py_UCS4 maxchar = 127, ch, fixed; 8450 int modified = 0; 8451 Py_ssize_t i; 8452 8453 for (i = 0; i < len; ++i) { 8454 ch = PyUnicode_READ(kind, data, i); 8455 fixed = 0; 8456 if (ch > 127) { 8457 if (Py_UNICODE_ISSPACE(ch)) 8458 fixed = ' '; 8459 else { 8460 const int decimal = Py_UNICODE_TODECIMAL(ch); 8461 if (decimal >= 0) 8462 fixed = '0' + decimal; 8463 } 8464 if (fixed != 0) { 8465 modified = 1; 8466 maxchar = MAX_MAXCHAR(maxchar, fixed); 8467 PyUnicode_WRITE(kind, data, i, fixed); 8468 } 8469 else 8470 maxchar = MAX_MAXCHAR(maxchar, ch); 8471 } 8472 } 8473 8474 return (modified) ? maxchar : 0; 8475} 8476 8477PyObject * 8478_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8479{ 8480 if (!PyUnicode_Check(unicode)) { 8481 PyErr_BadInternalCall(); 8482 return NULL; 8483 } 8484 if (PyUnicode_READY(unicode) == -1) 8485 return NULL; 8486 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8487 /* If the string is already ASCII, just return the same string */ 8488 Py_INCREF(unicode); 8489 return unicode; 8490 } 8491 return fixup(unicode, fix_decimal_and_space_to_ascii); 8492} 8493 8494PyObject * 8495PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8496 Py_ssize_t length) 8497{ 8498 PyObject *decimal; 8499 Py_ssize_t i; 8500 Py_UCS4 maxchar; 8501 enum PyUnicode_Kind kind; 8502 void *data; 8503 8504 maxchar = 127; 8505 for (i = 0; i < length; i++) { 8506 Py_UNICODE ch = s[i]; 8507 if (ch > 127) { 8508 int decimal = Py_UNICODE_TODECIMAL(ch); 8509 if (decimal >= 0) 8510 ch = '0' + decimal; 8511 maxchar = MAX_MAXCHAR(maxchar, ch); 8512 } 8513 } 8514 8515 /* Copy to a new string */ 8516 decimal = PyUnicode_New(length, maxchar); 8517 if (decimal == NULL) 8518 return decimal; 8519 kind = PyUnicode_KIND(decimal); 8520 data = PyUnicode_DATA(decimal); 8521 /* Iterate over code points */ 8522 for (i = 0; i < length; i++) { 
8523 Py_UNICODE ch = s[i]; 8524 if (ch > 127) { 8525 int decimal = Py_UNICODE_TODECIMAL(ch); 8526 if (decimal >= 0) 8527 ch = '0' + decimal; 8528 } 8529 PyUnicode_WRITE(kind, data, i, ch); 8530 } 8531 return unicode_result(decimal); 8532} 8533/* --- Decimal Encoder ---------------------------------------------------- */ 8534 8535int 8536PyUnicode_EncodeDecimal(Py_UNICODE *s, 8537 Py_ssize_t length, 8538 char *output, 8539 const char *errors) 8540{ 8541 PyObject *unicode; 8542 Py_ssize_t i; 8543 enum PyUnicode_Kind kind; 8544 void *data; 8545 8546 if (output == NULL) { 8547 PyErr_BadArgument(); 8548 return -1; 8549 } 8550 8551 unicode = PyUnicode_FromUnicode(s, length); 8552 if (unicode == NULL) 8553 return -1; 8554 8555 if (PyUnicode_READY(unicode) == -1) { 8556 Py_DECREF(unicode); 8557 return -1; 8558 } 8559 kind = PyUnicode_KIND(unicode); 8560 data = PyUnicode_DATA(unicode); 8561 8562 for (i=0; i < length; ) { 8563 PyObject *exc; 8564 Py_UCS4 ch; 8565 int decimal; 8566 Py_ssize_t startpos; 8567 8568 ch = PyUnicode_READ(kind, data, i); 8569 8570 if (Py_UNICODE_ISSPACE(ch)) { 8571 *output++ = ' '; 8572 i++; 8573 continue; 8574 } 8575 decimal = Py_UNICODE_TODECIMAL(ch); 8576 if (decimal >= 0) { 8577 *output++ = '0' + decimal; 8578 i++; 8579 continue; 8580 } 8581 if (0 < ch && ch < 256) { 8582 *output++ = (char)ch; 8583 i++; 8584 continue; 8585 } 8586 8587 startpos = i; 8588 exc = NULL; 8589 raise_encode_exception(&exc, "decimal", unicode, 8590 startpos, startpos+1, 8591 "invalid decimal Unicode string"); 8592 Py_XDECREF(exc); 8593 Py_DECREF(unicode); 8594 return -1; 8595 } 8596 /* 0-terminate the output string */ 8597 *output++ = '\0'; 8598 Py_DECREF(unicode); 8599 return 0; 8600} 8601 8602/* --- Helpers ------------------------------------------------------------ */ 8603 8604static Py_ssize_t 8605any_find_slice(int direction, PyObject* s1, PyObject* s2, 8606 Py_ssize_t start, 8607 Py_ssize_t end) 8608{ 8609 int kind1, kind2, kind; 8610 void *buf1, *buf2; 8611 
Py_ssize_t len1, len2, result; 8612 8613 kind1 = PyUnicode_KIND(s1); 8614 kind2 = PyUnicode_KIND(s2); 8615 kind = kind1 > kind2 ? kind1 : kind2; 8616 buf1 = PyUnicode_DATA(s1); 8617 buf2 = PyUnicode_DATA(s2); 8618 if (kind1 != kind) 8619 buf1 = _PyUnicode_AsKind(s1, kind); 8620 if (!buf1) 8621 return -2; 8622 if (kind2 != kind) 8623 buf2 = _PyUnicode_AsKind(s2, kind); 8624 if (!buf2) { 8625 if (kind1 != kind) PyMem_Free(buf1); 8626 return -2; 8627 } 8628 len1 = PyUnicode_GET_LENGTH(s1); 8629 len2 = PyUnicode_GET_LENGTH(s2); 8630 8631 if (direction > 0) { 8632 switch (kind) { 8633 case PyUnicode_1BYTE_KIND: 8634 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8635 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 8636 else 8637 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 8638 break; 8639 case PyUnicode_2BYTE_KIND: 8640 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 8641 break; 8642 case PyUnicode_4BYTE_KIND: 8643 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 8644 break; 8645 default: 8646 assert(0); result = -2; 8647 } 8648 } 8649 else { 8650 switch (kind) { 8651 case PyUnicode_1BYTE_KIND: 8652 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8653 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 8654 else 8655 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8656 break; 8657 case PyUnicode_2BYTE_KIND: 8658 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8659 break; 8660 case PyUnicode_4BYTE_KIND: 8661 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8662 break; 8663 default: 8664 assert(0); result = -2; 8665 } 8666 } 8667 8668 if (kind1 != kind) 8669 PyMem_Free(buf1); 8670 if (kind2 != kind) 8671 PyMem_Free(buf2); 8672 8673 return result; 8674} 8675 8676Py_ssize_t 8677_PyUnicode_InsertThousandsGrouping( 8678 PyObject *unicode, Py_ssize_t index, 8679 Py_ssize_t n_buffer, 8680 void *digits, Py_ssize_t n_digits, 8681 
Py_ssize_t min_width, 8682 const char *grouping, PyObject *thousands_sep, 8683 Py_UCS4 *maxchar) 8684{ 8685 unsigned int kind, thousands_sep_kind; 8686 char *data, *thousands_sep_data; 8687 Py_ssize_t thousands_sep_len; 8688 Py_ssize_t len; 8689 8690 if (unicode != NULL) { 8691 kind = PyUnicode_KIND(unicode); 8692 data = (char *) PyUnicode_DATA(unicode) + index * kind; 8693 } 8694 else { 8695 kind = PyUnicode_1BYTE_KIND; 8696 data = NULL; 8697 } 8698 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 8699 thousands_sep_data = PyUnicode_DATA(thousands_sep); 8700 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 8701 if (unicode != NULL && thousands_sep_kind != kind) { 8702 if (thousands_sep_kind < kind) { 8703 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 8704 if (!thousands_sep_data) 8705 return -1; 8706 } 8707 else { 8708 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 8709 if (!data) 8710 return -1; 8711 } 8712 } 8713 8714 switch (kind) { 8715 case PyUnicode_1BYTE_KIND: 8716 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 8717 len = asciilib_InsertThousandsGrouping( 8718 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 8719 min_width, grouping, 8720 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8721 else 8722 len = ucs1lib_InsertThousandsGrouping( 8723 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8724 min_width, grouping, 8725 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8726 break; 8727 case PyUnicode_2BYTE_KIND: 8728 len = ucs2lib_InsertThousandsGrouping( 8729 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 8730 min_width, grouping, 8731 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 8732 break; 8733 case PyUnicode_4BYTE_KIND: 8734 len = ucs4lib_InsertThousandsGrouping( 8735 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 8736 min_width, grouping, 8737 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 8738 break; 8739 default: 8740 assert(0); 8741 return -1; 8742 } 8743 
if (unicode != NULL && thousands_sep_kind != kind) { 8744 if (thousands_sep_kind < kind) 8745 PyMem_Free(thousands_sep_data); 8746 else 8747 PyMem_Free(data); 8748 } 8749 if (unicode == NULL) { 8750 *maxchar = 127; 8751 if (len != n_digits) { 8752 *maxchar = MAX_MAXCHAR(*maxchar, 8753 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 8754 } 8755 } 8756 return len; 8757} 8758 8759 8760/* helper macro to fixup start/end slice values */ 8761#define ADJUST_INDICES(start, end, len) \ 8762 if (end > len) \ 8763 end = len; \ 8764 else if (end < 0) { \ 8765 end += len; \ 8766 if (end < 0) \ 8767 end = 0; \ 8768 } \ 8769 if (start < 0) { \ 8770 start += len; \ 8771 if (start < 0) \ 8772 start = 0; \ 8773 } 8774 8775Py_ssize_t 8776PyUnicode_Count(PyObject *str, 8777 PyObject *substr, 8778 Py_ssize_t start, 8779 Py_ssize_t end) 8780{ 8781 Py_ssize_t result; 8782 PyObject* str_obj; 8783 PyObject* sub_obj; 8784 int kind1, kind2, kind; 8785 void *buf1 = NULL, *buf2 = NULL; 8786 Py_ssize_t len1, len2; 8787 8788 str_obj = PyUnicode_FromObject(str); 8789 if (!str_obj) 8790 return -1; 8791 sub_obj = PyUnicode_FromObject(substr); 8792 if (!sub_obj) { 8793 Py_DECREF(str_obj); 8794 return -1; 8795 } 8796 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 8797 Py_DECREF(sub_obj); 8798 Py_DECREF(str_obj); 8799 return -1; 8800 } 8801 8802 kind1 = PyUnicode_KIND(str_obj); 8803 kind2 = PyUnicode_KIND(sub_obj); 8804 kind = kind1; 8805 buf1 = PyUnicode_DATA(str_obj); 8806 buf2 = PyUnicode_DATA(sub_obj); 8807 if (kind2 != kind) { 8808 if (kind2 > kind) { 8809 Py_DECREF(sub_obj); 8810 Py_DECREF(str_obj); 8811 return 0; 8812 } 8813 buf2 = _PyUnicode_AsKind(sub_obj, kind); 8814 } 8815 if (!buf2) 8816 goto onError; 8817 len1 = PyUnicode_GET_LENGTH(str_obj); 8818 len2 = PyUnicode_GET_LENGTH(sub_obj); 8819 8820 ADJUST_INDICES(start, end, len1); 8821 switch (kind) { 8822 case PyUnicode_1BYTE_KIND: 8823 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 8824 result = 
asciilib_count( 8825 ((Py_UCS1*)buf1) + start, end - start, 8826 buf2, len2, PY_SSIZE_T_MAX 8827 ); 8828 else 8829 result = ucs1lib_count( 8830 ((Py_UCS1*)buf1) + start, end - start, 8831 buf2, len2, PY_SSIZE_T_MAX 8832 ); 8833 break; 8834 case PyUnicode_2BYTE_KIND: 8835 result = ucs2lib_count( 8836 ((Py_UCS2*)buf1) + start, end - start, 8837 buf2, len2, PY_SSIZE_T_MAX 8838 ); 8839 break; 8840 case PyUnicode_4BYTE_KIND: 8841 result = ucs4lib_count( 8842 ((Py_UCS4*)buf1) + start, end - start, 8843 buf2, len2, PY_SSIZE_T_MAX 8844 ); 8845 break; 8846 default: 8847 assert(0); result = 0; 8848 } 8849 8850 Py_DECREF(sub_obj); 8851 Py_DECREF(str_obj); 8852 8853 if (kind2 != kind) 8854 PyMem_Free(buf2); 8855 8856 return result; 8857 onError: 8858 Py_DECREF(sub_obj); 8859 Py_DECREF(str_obj); 8860 if (kind2 != kind && buf2) 8861 PyMem_Free(buf2); 8862 return -1; 8863} 8864 8865Py_ssize_t 8866PyUnicode_Find(PyObject *str, 8867 PyObject *sub, 8868 Py_ssize_t start, 8869 Py_ssize_t end, 8870 int direction) 8871{ 8872 Py_ssize_t result; 8873 8874 str = PyUnicode_FromObject(str); 8875 if (!str) 8876 return -2; 8877 sub = PyUnicode_FromObject(sub); 8878 if (!sub) { 8879 Py_DECREF(str); 8880 return -2; 8881 } 8882 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 8883 Py_DECREF(sub); 8884 Py_DECREF(str); 8885 return -2; 8886 } 8887 8888 result = any_find_slice(direction, 8889 str, sub, start, end 8890 ); 8891 8892 Py_DECREF(str); 8893 Py_DECREF(sub); 8894 8895 return result; 8896} 8897 8898Py_ssize_t 8899PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 8900 Py_ssize_t start, Py_ssize_t end, 8901 int direction) 8902{ 8903 int kind; 8904 Py_ssize_t result; 8905 if (PyUnicode_READY(str) == -1) 8906 return -2; 8907 if (start < 0 || end < 0) { 8908 PyErr_SetString(PyExc_IndexError, "string index out of range"); 8909 return -2; 8910 } 8911 if (end > PyUnicode_GET_LENGTH(str)) 8912 end = PyUnicode_GET_LENGTH(str); 8913 kind = PyUnicode_KIND(str); 8914 result = 
findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 8915 kind, end-start, ch, direction); 8916 if (result == -1) 8917 return -1; 8918 else 8919 return start + result; 8920} 8921 8922static int 8923tailmatch(PyObject *self, 8924 PyObject *substring, 8925 Py_ssize_t start, 8926 Py_ssize_t end, 8927 int direction) 8928{ 8929 int kind_self; 8930 int kind_sub; 8931 void *data_self; 8932 void *data_sub; 8933 Py_ssize_t offset; 8934 Py_ssize_t i; 8935 Py_ssize_t end_sub; 8936 8937 if (PyUnicode_READY(self) == -1 || 8938 PyUnicode_READY(substring) == -1) 8939 return 0; 8940 8941 if (PyUnicode_GET_LENGTH(substring) == 0) 8942 return 1; 8943 8944 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 8945 end -= PyUnicode_GET_LENGTH(substring); 8946 if (end < start) 8947 return 0; 8948 8949 kind_self = PyUnicode_KIND(self); 8950 data_self = PyUnicode_DATA(self); 8951 kind_sub = PyUnicode_KIND(substring); 8952 data_sub = PyUnicode_DATA(substring); 8953 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 8954 8955 if (direction > 0) 8956 offset = end; 8957 else 8958 offset = start; 8959 8960 if (PyUnicode_READ(kind_self, data_self, offset) == 8961 PyUnicode_READ(kind_sub, data_sub, 0) && 8962 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 8963 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 8964 /* If both are of the same kind, memcmp is sufficient */ 8965 if (kind_self == kind_sub) { 8966 return ! memcmp((char *)data_self + 8967 (offset * PyUnicode_KIND(substring)), 8968 data_sub, 8969 PyUnicode_GET_LENGTH(substring) * 8970 PyUnicode_KIND(substring)); 8971 } 8972 /* otherwise we have to compare each character by first accesing it */ 8973 else { 8974 /* We do not need to compare 0 and len(substring)-1 because 8975 the if statement above ensured already that they are equal 8976 when we end up here. 
*/ 8977 /* TODO: honor direction and do a forward or backwards search */ 8978 for (i = 1; i < end_sub; ++i) { 8979 if (PyUnicode_READ(kind_self, data_self, offset + i) != 8980 PyUnicode_READ(kind_sub, data_sub, i)) 8981 return 0; 8982 } 8983 return 1; 8984 } 8985 } 8986 8987 return 0; 8988} 8989 8990Py_ssize_t 8991PyUnicode_Tailmatch(PyObject *str, 8992 PyObject *substr, 8993 Py_ssize_t start, 8994 Py_ssize_t end, 8995 int direction) 8996{ 8997 Py_ssize_t result; 8998 8999 str = PyUnicode_FromObject(str); 9000 if (str == NULL) 9001 return -1; 9002 substr = PyUnicode_FromObject(substr); 9003 if (substr == NULL) { 9004 Py_DECREF(str); 9005 return -1; 9006 } 9007 9008 result = tailmatch(str, substr, 9009 start, end, direction); 9010 Py_DECREF(str); 9011 Py_DECREF(substr); 9012 return result; 9013} 9014 9015/* Apply fixfct filter to the Unicode object self and return a 9016 reference to the modified object */ 9017 9018static PyObject * 9019fixup(PyObject *self, 9020 Py_UCS4 (*fixfct)(PyObject *s)) 9021{ 9022 PyObject *u; 9023 Py_UCS4 maxchar_old, maxchar_new = 0; 9024 PyObject *v; 9025 9026 u = _PyUnicode_Copy(self); 9027 if (u == NULL) 9028 return NULL; 9029 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9030 9031 /* fix functions return the new maximum character in a string, 9032 if the kind of the resulting unicode object does not change, 9033 everything is fine. Otherwise we need to change the string kind 9034 and re-run the fix function. */ 9035 maxchar_new = fixfct(u); 9036 9037 if (maxchar_new == 0) { 9038 /* no changes */; 9039 if (PyUnicode_CheckExact(self)) { 9040 Py_DECREF(u); 9041 Py_INCREF(self); 9042 return self; 9043 } 9044 else 9045 return u; 9046 } 9047 9048 maxchar_new = align_maxchar(maxchar_new); 9049 9050 if (maxchar_new == maxchar_old) 9051 return u; 9052 9053 /* In case the maximum character changed, we need to 9054 convert the string to the new category. 
*/ 9055 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9056 if (v == NULL) { 9057 Py_DECREF(u); 9058 return NULL; 9059 } 9060 if (maxchar_new > maxchar_old) { 9061 /* If the maxchar increased so that the kind changed, not all 9062 characters are representable anymore and we need to fix the 9063 string again. This only happens in very few cases. */ 9064 _PyUnicode_FastCopyCharacters(v, 0, 9065 self, 0, PyUnicode_GET_LENGTH(self)); 9066 maxchar_old = fixfct(v); 9067 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9068 } 9069 else { 9070 _PyUnicode_FastCopyCharacters(v, 0, 9071 u, 0, PyUnicode_GET_LENGTH(self)); 9072 } 9073 Py_DECREF(u); 9074 assert(_PyUnicode_CheckConsistency(v, 1)); 9075 return v; 9076} 9077 9078static PyObject * 9079ascii_upper_or_lower(PyObject *self, int lower) 9080{ 9081 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9082 char *resdata, *data = PyUnicode_DATA(self); 9083 PyObject *res; 9084 9085 res = PyUnicode_New(len, 127); 9086 if (res == NULL) 9087 return NULL; 9088 resdata = PyUnicode_DATA(res); 9089 if (lower) 9090 _Py_bytes_lower(resdata, data, len); 9091 else 9092 _Py_bytes_upper(resdata, data, len); 9093 return res; 9094} 9095 9096static Py_UCS4 9097handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9098{ 9099 Py_ssize_t j; 9100 int final_sigma; 9101 Py_UCS4 c; 9102 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9103 9104 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9105 9106 where ! is a negation and \p{xxx} is a character with property xxx. 
9107 */ 9108 for (j = i - 1; j >= 0; j--) { 9109 c = PyUnicode_READ(kind, data, j); 9110 if (!_PyUnicode_IsCaseIgnorable(c)) 9111 break; 9112 } 9113 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9114 if (final_sigma) { 9115 for (j = i + 1; j < length; j++) { 9116 c = PyUnicode_READ(kind, data, j); 9117 if (!_PyUnicode_IsCaseIgnorable(c)) 9118 break; 9119 } 9120 final_sigma = j == length || !_PyUnicode_IsCased(c); 9121 } 9122 return (final_sigma) ? 0x3C2 : 0x3C3; 9123} 9124 9125static int 9126lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9127 Py_UCS4 c, Py_UCS4 *mapped) 9128{ 9129 /* Obscure special case. */ 9130 if (c == 0x3A3) { 9131 mapped[0] = handle_capital_sigma(kind, data, length, i); 9132 return 1; 9133 } 9134 return _PyUnicode_ToLowerFull(c, mapped); 9135} 9136 9137static Py_ssize_t 9138do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9139{ 9140 Py_ssize_t i, k = 0; 9141 int n_res, j; 9142 Py_UCS4 c, mapped[3]; 9143 9144 c = PyUnicode_READ(kind, data, 0); 9145 n_res = _PyUnicode_ToUpperFull(c, mapped); 9146 for (j = 0; j < n_res; j++) { 9147 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9148 res[k++] = mapped[j]; 9149 } 9150 for (i = 1; i < length; i++) { 9151 c = PyUnicode_READ(kind, data, i); 9152 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9153 for (j = 0; j < n_res; j++) { 9154 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9155 res[k++] = mapped[j]; 9156 } 9157 } 9158 return k; 9159} 9160 9161static Py_ssize_t 9162do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9163 Py_ssize_t i, k = 0; 9164 9165 for (i = 0; i < length; i++) { 9166 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9167 int n_res, j; 9168 if (Py_UNICODE_ISUPPER(c)) { 9169 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9170 } 9171 else if (Py_UNICODE_ISLOWER(c)) { 9172 n_res = _PyUnicode_ToUpperFull(c, mapped); 9173 } 9174 else { 9175 n_res = 1; 9176 mapped[0] = c; 9177 } 
9178 for (j = 0; j < n_res; j++) { 9179 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9180 res[k++] = mapped[j]; 9181 } 9182 } 9183 return k; 9184} 9185 9186static Py_ssize_t 9187do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9188 Py_UCS4 *maxchar, int lower) 9189{ 9190 Py_ssize_t i, k = 0; 9191 9192 for (i = 0; i < length; i++) { 9193 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9194 int n_res, j; 9195 if (lower) 9196 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9197 else 9198 n_res = _PyUnicode_ToUpperFull(c, mapped); 9199 for (j = 0; j < n_res; j++) { 9200 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9201 res[k++] = mapped[j]; 9202 } 9203 } 9204 return k; 9205} 9206 9207static Py_ssize_t 9208do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9209{ 9210 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9211} 9212 9213static Py_ssize_t 9214do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9215{ 9216 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9217} 9218 9219static Py_ssize_t 9220do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9221{ 9222 Py_ssize_t i, k = 0; 9223 9224 for (i = 0; i < length; i++) { 9225 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9226 Py_UCS4 mapped[3]; 9227 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9228 for (j = 0; j < n_res; j++) { 9229 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9230 res[k++] = mapped[j]; 9231 } 9232 } 9233 return k; 9234} 9235 9236static Py_ssize_t 9237do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9238{ 9239 Py_ssize_t i, k = 0; 9240 int previous_is_cased; 9241 9242 previous_is_cased = 0; 9243 for (i = 0; i < length; i++) { 9244 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9245 Py_UCS4 mapped[3]; 9246 int n_res, j; 9247 9248 if (previous_is_cased) 9249 n_res = lower_ucs4(kind, data, length, i, c, mapped); 
9250 else 9251 n_res = _PyUnicode_ToTitleFull(c, mapped); 9252 9253 for (j = 0; j < n_res; j++) { 9254 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9255 res[k++] = mapped[j]; 9256 } 9257 9258 previous_is_cased = _PyUnicode_IsCased(c); 9259 } 9260 return k; 9261} 9262 9263static PyObject * 9264case_operation(PyObject *self, 9265 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9266{ 9267 PyObject *res = NULL; 9268 Py_ssize_t length, newlength = 0; 9269 int kind, outkind; 9270 void *data, *outdata; 9271 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9272 9273 assert(PyUnicode_IS_READY(self)); 9274 9275 kind = PyUnicode_KIND(self); 9276 data = PyUnicode_DATA(self); 9277 length = PyUnicode_GET_LENGTH(self); 9278 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9279 if (tmp == NULL) 9280 return PyErr_NoMemory(); 9281 newlength = perform(kind, data, length, tmp, &maxchar); 9282 res = PyUnicode_New(newlength, maxchar); 9283 if (res == NULL) 9284 goto leave; 9285 tmpend = tmp + newlength; 9286 outdata = PyUnicode_DATA(res); 9287 outkind = PyUnicode_KIND(res); 9288 switch (outkind) { 9289 case PyUnicode_1BYTE_KIND: 9290 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9291 break; 9292 case PyUnicode_2BYTE_KIND: 9293 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9294 break; 9295 case PyUnicode_4BYTE_KIND: 9296 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9297 break; 9298 default: 9299 assert(0); 9300 break; 9301 } 9302 leave: 9303 PyMem_FREE(tmp); 9304 return res; 9305} 9306 9307PyObject * 9308PyUnicode_Join(PyObject *separator, PyObject *seq) 9309{ 9310 PyObject *sep = NULL; 9311 Py_ssize_t seplen; 9312 PyObject *res = NULL; /* the result */ 9313 PyObject *fseq; /* PySequence_Fast(seq) */ 9314 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9315 PyObject **items; 9316 PyObject *item; 9317 Py_ssize_t sz, i, res_offset; 9318 Py_UCS4 maxchar; 9319 Py_UCS4 item_maxchar; 9320 int use_memcpy; 9321 unsigned 
char *res_data = NULL, *sep_data = NULL; 9322 PyObject *last_obj; 9323 unsigned int kind = 0; 9324 9325 fseq = PySequence_Fast(seq, ""); 9326 if (fseq == NULL) { 9327 return NULL; 9328 } 9329 9330 /* NOTE: the following code can't call back into Python code, 9331 * so we are sure that fseq won't be mutated. 9332 */ 9333 9334 seqlen = PySequence_Fast_GET_SIZE(fseq); 9335 /* If empty sequence, return u"". */ 9336 if (seqlen == 0) { 9337 Py_DECREF(fseq); 9338 Py_INCREF(unicode_empty); 9339 res = unicode_empty; 9340 return res; 9341 } 9342 9343 /* If singleton sequence with an exact Unicode, return that. */ 9344 last_obj = NULL; 9345 items = PySequence_Fast_ITEMS(fseq); 9346 if (seqlen == 1) { 9347 if (PyUnicode_CheckExact(items[0])) { 9348 res = items[0]; 9349 Py_INCREF(res); 9350 Py_DECREF(fseq); 9351 return res; 9352 } 9353 seplen = 0; 9354 maxchar = 0; 9355 } 9356 else { 9357 /* Set up sep and seplen */ 9358 if (separator == NULL) { 9359 /* fall back to a blank space separator */ 9360 sep = PyUnicode_FromOrdinal(' '); 9361 if (!sep) 9362 goto onError; 9363 seplen = 1; 9364 maxchar = 32; 9365 } 9366 else { 9367 if (!PyUnicode_Check(separator)) { 9368 PyErr_Format(PyExc_TypeError, 9369 "separator: expected str instance," 9370 " %.80s found", 9371 Py_TYPE(separator)->tp_name); 9372 goto onError; 9373 } 9374 if (PyUnicode_READY(separator)) 9375 goto onError; 9376 sep = separator; 9377 seplen = PyUnicode_GET_LENGTH(separator); 9378 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9379 /* inc refcount to keep this code path symmetric with the 9380 above case of a blank separator */ 9381 Py_INCREF(sep); 9382 } 9383 last_obj = sep; 9384 } 9385 9386 /* There are at least two things to join, or else we have a subclass 9387 * of str in the sequence. 9388 * Do a pre-pass to figure out the total amount of space we'll 9389 * need (sz), and see whether all argument are strings. 
9390 */ 9391 sz = 0; 9392#ifdef Py_DEBUG 9393 use_memcpy = 0; 9394#else 9395 use_memcpy = 1; 9396#endif 9397 for (i = 0; i < seqlen; i++) { 9398 const Py_ssize_t old_sz = sz; 9399 item = items[i]; 9400 if (!PyUnicode_Check(item)) { 9401 PyErr_Format(PyExc_TypeError, 9402 "sequence item %zd: expected str instance," 9403 " %.80s found", 9404 i, Py_TYPE(item)->tp_name); 9405 goto onError; 9406 } 9407 if (PyUnicode_READY(item) == -1) 9408 goto onError; 9409 sz += PyUnicode_GET_LENGTH(item); 9410 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9411 maxchar = MAX_MAXCHAR(maxchar, item_maxchar); 9412 if (i != 0) 9413 sz += seplen; 9414 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9415 PyErr_SetString(PyExc_OverflowError, 9416 "join() result is too long for a Python string"); 9417 goto onError; 9418 } 9419 if (use_memcpy && last_obj != NULL) { 9420 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9421 use_memcpy = 0; 9422 } 9423 last_obj = item; 9424 } 9425 9426 res = PyUnicode_New(sz, maxchar); 9427 if (res == NULL) 9428 goto onError; 9429 9430 /* Catenate everything. */ 9431#ifdef Py_DEBUG 9432 use_memcpy = 0; 9433#else 9434 if (use_memcpy) { 9435 res_data = PyUnicode_1BYTE_DATA(res); 9436 kind = PyUnicode_KIND(res); 9437 if (seplen != 0) 9438 sep_data = PyUnicode_1BYTE_DATA(sep); 9439 } 9440#endif 9441 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9442 Py_ssize_t itemlen; 9443 item = items[i]; 9444 /* Copy item, and maybe the separator. 
*/ 9445 if (i && seplen != 0) { 9446 if (use_memcpy) { 9447 Py_MEMCPY(res_data, 9448 sep_data, 9449 kind * seplen); 9450 res_data += kind * seplen; 9451 } 9452 else { 9453 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9454 res_offset += seplen; 9455 } 9456 } 9457 itemlen = PyUnicode_GET_LENGTH(item); 9458 if (itemlen != 0) { 9459 if (use_memcpy) { 9460 Py_MEMCPY(res_data, 9461 PyUnicode_DATA(item), 9462 kind * itemlen); 9463 res_data += kind * itemlen; 9464 } 9465 else { 9466 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9467 res_offset += itemlen; 9468 } 9469 } 9470 } 9471 if (use_memcpy) 9472 assert(res_data == PyUnicode_1BYTE_DATA(res) 9473 + kind * PyUnicode_GET_LENGTH(res)); 9474 else 9475 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9476 9477 Py_DECREF(fseq); 9478 Py_XDECREF(sep); 9479 assert(_PyUnicode_CheckConsistency(res, 1)); 9480 return res; 9481 9482 onError: 9483 Py_DECREF(fseq); 9484 Py_XDECREF(sep); 9485 Py_XDECREF(res); 9486 return NULL; 9487} 9488 9489#define FILL(kind, data, value, start, length) \ 9490 do { \ 9491 Py_ssize_t i_ = 0; \ 9492 assert(kind != PyUnicode_WCHAR_KIND); \ 9493 switch ((kind)) { \ 9494 case PyUnicode_1BYTE_KIND: { \ 9495 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9496 memset(to_, (unsigned char)value, (length)); \ 9497 break; \ 9498 } \ 9499 case PyUnicode_2BYTE_KIND: { \ 9500 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9501 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9502 break; \ 9503 } \ 9504 case PyUnicode_4BYTE_KIND: { \ 9505 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9506 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9507 break; \ 9508 default: assert(0); \ 9509 } \ 9510 } \ 9511 } while (0) 9512 9513void 9514_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9515 Py_UCS4 fill_char) 9516{ 9517 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9518 const void *data = PyUnicode_DATA(unicode); 9519 
assert(PyUnicode_IS_READY(unicode)); 9520 assert(unicode_modifiable(unicode)); 9521 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9522 assert(start >= 0); 9523 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9524 FILL(kind, data, fill_char, start, length); 9525} 9526 9527Py_ssize_t 9528PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9529 Py_UCS4 fill_char) 9530{ 9531 Py_ssize_t maxlen; 9532 9533 if (!PyUnicode_Check(unicode)) { 9534 PyErr_BadInternalCall(); 9535 return -1; 9536 } 9537 if (PyUnicode_READY(unicode) == -1) 9538 return -1; 9539 if (unicode_check_modifiable(unicode)) 9540 return -1; 9541 9542 if (start < 0) { 9543 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9544 return -1; 9545 } 9546 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9547 PyErr_SetString(PyExc_ValueError, 9548 "fill character is bigger than " 9549 "the string maximum character"); 9550 return -1; 9551 } 9552 9553 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9554 length = Py_MIN(maxlen, length); 9555 if (length <= 0) 9556 return 0; 9557 9558 _PyUnicode_FastFill(unicode, start, length, fill_char); 9559 return length; 9560} 9561 9562static PyObject * 9563pad(PyObject *self, 9564 Py_ssize_t left, 9565 Py_ssize_t right, 9566 Py_UCS4 fill) 9567{ 9568 PyObject *u; 9569 Py_UCS4 maxchar; 9570 int kind; 9571 void *data; 9572 9573 if (left < 0) 9574 left = 0; 9575 if (right < 0) 9576 right = 0; 9577 9578 if (left == 0 && right == 0) 9579 return unicode_result_unchanged(self); 9580 9581 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9582 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9583 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9584 return NULL; 9585 } 9586 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9587 maxchar = MAX_MAXCHAR(maxchar, fill); 9588 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9589 if (!u) 9590 return NULL; 9591 9592 kind = PyUnicode_KIND(u); 9593 
data = PyUnicode_DATA(u); 9594 if (left) 9595 FILL(kind, data, fill, 0, left); 9596 if (right) 9597 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9598 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9599 assert(_PyUnicode_CheckConsistency(u, 1)); 9600 return u; 9601} 9602 9603PyObject * 9604PyUnicode_Splitlines(PyObject *string, int keepends) 9605{ 9606 PyObject *list; 9607 9608 string = PyUnicode_FromObject(string); 9609 if (string == NULL) 9610 return NULL; 9611 if (PyUnicode_READY(string) == -1) { 9612 Py_DECREF(string); 9613 return NULL; 9614 } 9615 9616 switch (PyUnicode_KIND(string)) { 9617 case PyUnicode_1BYTE_KIND: 9618 if (PyUnicode_IS_ASCII(string)) 9619 list = asciilib_splitlines( 9620 string, PyUnicode_1BYTE_DATA(string), 9621 PyUnicode_GET_LENGTH(string), keepends); 9622 else 9623 list = ucs1lib_splitlines( 9624 string, PyUnicode_1BYTE_DATA(string), 9625 PyUnicode_GET_LENGTH(string), keepends); 9626 break; 9627 case PyUnicode_2BYTE_KIND: 9628 list = ucs2lib_splitlines( 9629 string, PyUnicode_2BYTE_DATA(string), 9630 PyUnicode_GET_LENGTH(string), keepends); 9631 break; 9632 case PyUnicode_4BYTE_KIND: 9633 list = ucs4lib_splitlines( 9634 string, PyUnicode_4BYTE_DATA(string), 9635 PyUnicode_GET_LENGTH(string), keepends); 9636 break; 9637 default: 9638 assert(0); 9639 list = 0; 9640 } 9641 Py_DECREF(string); 9642 return list; 9643} 9644 9645static PyObject * 9646split(PyObject *self, 9647 PyObject *substring, 9648 Py_ssize_t maxcount) 9649{ 9650 int kind1, kind2, kind; 9651 void *buf1, *buf2; 9652 Py_ssize_t len1, len2; 9653 PyObject* out; 9654 9655 if (maxcount < 0) 9656 maxcount = PY_SSIZE_T_MAX; 9657 9658 if (PyUnicode_READY(self) == -1) 9659 return NULL; 9660 9661 if (substring == NULL) 9662 switch (PyUnicode_KIND(self)) { 9663 case PyUnicode_1BYTE_KIND: 9664 if (PyUnicode_IS_ASCII(self)) 9665 return asciilib_split_whitespace( 9666 self, PyUnicode_1BYTE_DATA(self), 9667 PyUnicode_GET_LENGTH(self), maxcount 
9668 ); 9669 else 9670 return ucs1lib_split_whitespace( 9671 self, PyUnicode_1BYTE_DATA(self), 9672 PyUnicode_GET_LENGTH(self), maxcount 9673 ); 9674 case PyUnicode_2BYTE_KIND: 9675 return ucs2lib_split_whitespace( 9676 self, PyUnicode_2BYTE_DATA(self), 9677 PyUnicode_GET_LENGTH(self), maxcount 9678 ); 9679 case PyUnicode_4BYTE_KIND: 9680 return ucs4lib_split_whitespace( 9681 self, PyUnicode_4BYTE_DATA(self), 9682 PyUnicode_GET_LENGTH(self), maxcount 9683 ); 9684 default: 9685 assert(0); 9686 return NULL; 9687 } 9688 9689 if (PyUnicode_READY(substring) == -1) 9690 return NULL; 9691 9692 kind1 = PyUnicode_KIND(self); 9693 kind2 = PyUnicode_KIND(substring); 9694 kind = kind1 > kind2 ? kind1 : kind2; 9695 buf1 = PyUnicode_DATA(self); 9696 buf2 = PyUnicode_DATA(substring); 9697 if (kind1 != kind) 9698 buf1 = _PyUnicode_AsKind(self, kind); 9699 if (!buf1) 9700 return NULL; 9701 if (kind2 != kind) 9702 buf2 = _PyUnicode_AsKind(substring, kind); 9703 if (!buf2) { 9704 if (kind1 != kind) PyMem_Free(buf1); 9705 return NULL; 9706 } 9707 len1 = PyUnicode_GET_LENGTH(self); 9708 len2 = PyUnicode_GET_LENGTH(substring); 9709 9710 switch (kind) { 9711 case PyUnicode_1BYTE_KIND: 9712 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9713 out = asciilib_split( 9714 self, buf1, len1, buf2, len2, maxcount); 9715 else 9716 out = ucs1lib_split( 9717 self, buf1, len1, buf2, len2, maxcount); 9718 break; 9719 case PyUnicode_2BYTE_KIND: 9720 out = ucs2lib_split( 9721 self, buf1, len1, buf2, len2, maxcount); 9722 break; 9723 case PyUnicode_4BYTE_KIND: 9724 out = ucs4lib_split( 9725 self, buf1, len1, buf2, len2, maxcount); 9726 break; 9727 default: 9728 out = NULL; 9729 } 9730 if (kind1 != kind) 9731 PyMem_Free(buf1); 9732 if (kind2 != kind) 9733 PyMem_Free(buf2); 9734 return out; 9735} 9736 9737static PyObject * 9738rsplit(PyObject *self, 9739 PyObject *substring, 9740 Py_ssize_t maxcount) 9741{ 9742 int kind1, kind2, kind; 9743 void *buf1, *buf2; 9744 Py_ssize_t len1, len2; 
9745 PyObject* out; 9746 9747 if (maxcount < 0) 9748 maxcount = PY_SSIZE_T_MAX; 9749 9750 if (PyUnicode_READY(self) == -1) 9751 return NULL; 9752 9753 if (substring == NULL) 9754 switch (PyUnicode_KIND(self)) { 9755 case PyUnicode_1BYTE_KIND: 9756 if (PyUnicode_IS_ASCII(self)) 9757 return asciilib_rsplit_whitespace( 9758 self, PyUnicode_1BYTE_DATA(self), 9759 PyUnicode_GET_LENGTH(self), maxcount 9760 ); 9761 else 9762 return ucs1lib_rsplit_whitespace( 9763 self, PyUnicode_1BYTE_DATA(self), 9764 PyUnicode_GET_LENGTH(self), maxcount 9765 ); 9766 case PyUnicode_2BYTE_KIND: 9767 return ucs2lib_rsplit_whitespace( 9768 self, PyUnicode_2BYTE_DATA(self), 9769 PyUnicode_GET_LENGTH(self), maxcount 9770 ); 9771 case PyUnicode_4BYTE_KIND: 9772 return ucs4lib_rsplit_whitespace( 9773 self, PyUnicode_4BYTE_DATA(self), 9774 PyUnicode_GET_LENGTH(self), maxcount 9775 ); 9776 default: 9777 assert(0); 9778 return NULL; 9779 } 9780 9781 if (PyUnicode_READY(substring) == -1) 9782 return NULL; 9783 9784 kind1 = PyUnicode_KIND(self); 9785 kind2 = PyUnicode_KIND(substring); 9786 kind = kind1 > kind2 ? 
kind1 : kind2; 9787 buf1 = PyUnicode_DATA(self); 9788 buf2 = PyUnicode_DATA(substring); 9789 if (kind1 != kind) 9790 buf1 = _PyUnicode_AsKind(self, kind); 9791 if (!buf1) 9792 return NULL; 9793 if (kind2 != kind) 9794 buf2 = _PyUnicode_AsKind(substring, kind); 9795 if (!buf2) { 9796 if (kind1 != kind) PyMem_Free(buf1); 9797 return NULL; 9798 } 9799 len1 = PyUnicode_GET_LENGTH(self); 9800 len2 = PyUnicode_GET_LENGTH(substring); 9801 9802 switch (kind) { 9803 case PyUnicode_1BYTE_KIND: 9804 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9805 out = asciilib_rsplit( 9806 self, buf1, len1, buf2, len2, maxcount); 9807 else 9808 out = ucs1lib_rsplit( 9809 self, buf1, len1, buf2, len2, maxcount); 9810 break; 9811 case PyUnicode_2BYTE_KIND: 9812 out = ucs2lib_rsplit( 9813 self, buf1, len1, buf2, len2, maxcount); 9814 break; 9815 case PyUnicode_4BYTE_KIND: 9816 out = ucs4lib_rsplit( 9817 self, buf1, len1, buf2, len2, maxcount); 9818 break; 9819 default: 9820 out = NULL; 9821 } 9822 if (kind1 != kind) 9823 PyMem_Free(buf1); 9824 if (kind2 != kind) 9825 PyMem_Free(buf2); 9826 return out; 9827} 9828 9829static Py_ssize_t 9830anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 9831 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9832{ 9833 switch (kind) { 9834 case PyUnicode_1BYTE_KIND: 9835 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 9836 return asciilib_find(buf1, len1, buf2, len2, offset); 9837 else 9838 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9839 case PyUnicode_2BYTE_KIND: 9840 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9841 case PyUnicode_4BYTE_KIND: 9842 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9843 } 9844 assert(0); 9845 return -1; 9846} 9847 9848static Py_ssize_t 9849anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 9850 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9851{ 9852 switch (kind) { 9853 case PyUnicode_1BYTE_KIND: 9854 if 
(PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 9855 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 9856 else 9857 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 9858 case PyUnicode_2BYTE_KIND: 9859 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 9860 case PyUnicode_4BYTE_KIND: 9861 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 9862 } 9863 assert(0); 9864 return 0; 9865} 9866 9867static PyObject * 9868replace(PyObject *self, PyObject *str1, 9869 PyObject *str2, Py_ssize_t maxcount) 9870{ 9871 PyObject *u; 9872 char *sbuf = PyUnicode_DATA(self); 9873 char *buf1 = PyUnicode_DATA(str1); 9874 char *buf2 = PyUnicode_DATA(str2); 9875 int srelease = 0, release1 = 0, release2 = 0; 9876 int skind = PyUnicode_KIND(self); 9877 int kind1 = PyUnicode_KIND(str1); 9878 int kind2 = PyUnicode_KIND(str2); 9879 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 9880 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 9881 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 9882 int mayshrink; 9883 Py_UCS4 maxchar, maxchar_str2; 9884 9885 if (maxcount < 0) 9886 maxcount = PY_SSIZE_T_MAX; 9887 else if (maxcount == 0 || slen == 0) 9888 goto nothing; 9889 9890 if (str1 == str2) 9891 goto nothing; 9892 if (skind < kind1) 9893 /* substring too wide to be present */ 9894 goto nothing; 9895 9896 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9897 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 9898 /* Replacing str1 with str2 may cause a maxchar reduction in the 9899 result string. 
*/ 9900 mayshrink = (maxchar_str2 < maxchar); 9901 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2); 9902 9903 if (len1 == len2) { 9904 /* same length */ 9905 if (len1 == 0) 9906 goto nothing; 9907 if (len1 == 1) { 9908 /* replace characters */ 9909 Py_UCS4 u1, u2; 9910 int rkind; 9911 Py_ssize_t index, pos; 9912 char *src; 9913 9914 u1 = PyUnicode_READ_CHAR(str1, 0); 9915 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1); 9916 if (pos < 0) 9917 goto nothing; 9918 u2 = PyUnicode_READ_CHAR(str2, 0); 9919 u = PyUnicode_New(slen, maxchar); 9920 if (!u) 9921 goto error; 9922 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 9923 rkind = PyUnicode_KIND(u); 9924 9925 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2); 9926 index = 0; 9927 src = sbuf; 9928 while (--maxcount) 9929 { 9930 pos++; 9931 src += pos * PyUnicode_KIND(self); 9932 slen -= pos; 9933 index += pos; 9934 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1); 9935 if (pos < 0) 9936 break; 9937 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2); 9938 } 9939 } 9940 else { 9941 int rkind = skind; 9942 char *res; 9943 Py_ssize_t i; 9944 9945 if (kind1 < rkind) { 9946 /* widen substring */ 9947 buf1 = _PyUnicode_AsKind(str1, rkind); 9948 if (!buf1) goto error; 9949 release1 = 1; 9950 } 9951 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 9952 if (i < 0) 9953 goto nothing; 9954 if (rkind > kind2) { 9955 /* widen replacement */ 9956 buf2 = _PyUnicode_AsKind(str2, rkind); 9957 if (!buf2) goto error; 9958 release2 = 1; 9959 } 9960 else if (rkind < kind2) { 9961 /* widen self and buf1 */ 9962 rkind = kind2; 9963 if (release1) PyMem_Free(buf1); 9964 sbuf = _PyUnicode_AsKind(self, rkind); 9965 if (!sbuf) goto error; 9966 srelease = 1; 9967 buf1 = _PyUnicode_AsKind(str1, rkind); 9968 if (!buf1) goto error; 9969 release1 = 1; 9970 } 9971 u = PyUnicode_New(slen, maxchar); 9972 if (!u) 9973 goto error; 9974 assert(PyUnicode_KIND(u) == rkind); 9975 res = PyUnicode_DATA(u); 9976 9977 
memcpy(res, sbuf, rkind * slen); 9978 /* change everything in-place, starting with this one */ 9979 memcpy(res + rkind * i, 9980 buf2, 9981 rkind * len2); 9982 i += len1; 9983 9984 while ( --maxcount > 0) { 9985 i = anylib_find(rkind, self, 9986 sbuf+rkind*i, slen-i, 9987 str1, buf1, len1, i); 9988 if (i == -1) 9989 break; 9990 memcpy(res + rkind * i, 9991 buf2, 9992 rkind * len2); 9993 i += len1; 9994 } 9995 } 9996 } 9997 else { 9998 Py_ssize_t n, i, j, ires; 9999 Py_ssize_t new_size; 10000 int rkind = skind; 10001 char *res; 10002 10003 if (kind1 < rkind) { 10004 /* widen substring */ 10005 buf1 = _PyUnicode_AsKind(str1, rkind); 10006 if (!buf1) goto error; 10007 release1 = 1; 10008 } 10009 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10010 if (n == 0) 10011 goto nothing; 10012 if (kind2 < rkind) { 10013 /* widen replacement */ 10014 buf2 = _PyUnicode_AsKind(str2, rkind); 10015 if (!buf2) goto error; 10016 release2 = 1; 10017 } 10018 else if (kind2 > rkind) { 10019 /* widen self and buf1 */ 10020 rkind = kind2; 10021 sbuf = _PyUnicode_AsKind(self, rkind); 10022 if (!sbuf) goto error; 10023 srelease = 1; 10024 if (release1) PyMem_Free(buf1); 10025 buf1 = _PyUnicode_AsKind(str1, rkind); 10026 if (!buf1) goto error; 10027 release1 = 1; 10028 } 10029 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10030 PyUnicode_GET_LENGTH(str1))); */ 10031 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10032 PyErr_SetString(PyExc_OverflowError, 10033 "replace string is too long"); 10034 goto error; 10035 } 10036 new_size = slen + n * (len2 - len1); 10037 if (new_size == 0) { 10038 Py_INCREF(unicode_empty); 10039 u = unicode_empty; 10040 goto done; 10041 } 10042 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10043 PyErr_SetString(PyExc_OverflowError, 10044 "replace string is too long"); 10045 goto error; 10046 } 10047 u = PyUnicode_New(new_size, maxchar); 10048 if (!u) 10049 goto error; 10050 
assert(PyUnicode_KIND(u) == rkind); 10051 res = PyUnicode_DATA(u); 10052 ires = i = 0; 10053 if (len1 > 0) { 10054 while (n-- > 0) { 10055 /* look for next match */ 10056 j = anylib_find(rkind, self, 10057 sbuf + rkind * i, slen-i, 10058 str1, buf1, len1, i); 10059 if (j == -1) 10060 break; 10061 else if (j > i) { 10062 /* copy unchanged part [i:j] */ 10063 memcpy(res + rkind * ires, 10064 sbuf + rkind * i, 10065 rkind * (j-i)); 10066 ires += j - i; 10067 } 10068 /* copy substitution string */ 10069 if (len2 > 0) { 10070 memcpy(res + rkind * ires, 10071 buf2, 10072 rkind * len2); 10073 ires += len2; 10074 } 10075 i = j + len1; 10076 } 10077 if (i < slen) 10078 /* copy tail [i:] */ 10079 memcpy(res + rkind * ires, 10080 sbuf + rkind * i, 10081 rkind * (slen-i)); 10082 } 10083 else { 10084 /* interleave */ 10085 while (n > 0) { 10086 memcpy(res + rkind * ires, 10087 buf2, 10088 rkind * len2); 10089 ires += len2; 10090 if (--n <= 0) 10091 break; 10092 memcpy(res + rkind * ires, 10093 sbuf + rkind * i, 10094 rkind); 10095 ires++; 10096 i++; 10097 } 10098 memcpy(res + rkind * ires, 10099 sbuf + rkind * i, 10100 rkind * (slen-i)); 10101 } 10102 } 10103 10104 if (mayshrink) { 10105 unicode_adjust_maxchar(&u); 10106 if (u == NULL) 10107 goto error; 10108 } 10109 10110 done: 10111 if (srelease) 10112 PyMem_FREE(sbuf); 10113 if (release1) 10114 PyMem_FREE(buf1); 10115 if (release2) 10116 PyMem_FREE(buf2); 10117 assert(_PyUnicode_CheckConsistency(u, 1)); 10118 return u; 10119 10120 nothing: 10121 /* nothing to replace; return original string (when possible) */ 10122 if (srelease) 10123 PyMem_FREE(sbuf); 10124 if (release1) 10125 PyMem_FREE(buf1); 10126 if (release2) 10127 PyMem_FREE(buf2); 10128 return unicode_result_unchanged(self); 10129 10130 error: 10131 if (srelease && sbuf) 10132 PyMem_FREE(sbuf); 10133 if (release1 && buf1) 10134 PyMem_FREE(buf1); 10135 if (release2 && buf2) 10136 PyMem_FREE(buf2); 10137 return NULL; 10138} 10139 10140/* --- Unicode Object Methods 
--------------------------------------------- */ 10141 10142PyDoc_STRVAR(title__doc__, 10143 "S.title() -> str\n\ 10144\n\ 10145Return a titlecased version of S, i.e. words start with title case\n\ 10146characters, all remaining cased characters have lower case."); 10147 10148static PyObject* 10149unicode_title(PyObject *self) 10150{ 10151 if (PyUnicode_READY(self) == -1) 10152 return NULL; 10153 return case_operation(self, do_title); 10154} 10155 10156PyDoc_STRVAR(capitalize__doc__, 10157 "S.capitalize() -> str\n\ 10158\n\ 10159Return a capitalized version of S, i.e. make the first character\n\ 10160have upper case and the rest lower case."); 10161 10162static PyObject* 10163unicode_capitalize(PyObject *self) 10164{ 10165 if (PyUnicode_READY(self) == -1) 10166 return NULL; 10167 if (PyUnicode_GET_LENGTH(self) == 0) 10168 return unicode_result_unchanged(self); 10169 return case_operation(self, do_capitalize); 10170} 10171 10172PyDoc_STRVAR(casefold__doc__, 10173 "S.casefold() -> str\n\ 10174\n\ 10175Return a version of S suitable for caseless comparisons."); 10176 10177static PyObject * 10178unicode_casefold(PyObject *self) 10179{ 10180 if (PyUnicode_READY(self) == -1) 10181 return NULL; 10182 if (PyUnicode_IS_ASCII(self)) 10183 return ascii_upper_or_lower(self, 1); 10184 return case_operation(self, do_casefold); 10185} 10186 10187 10188/* Argument converter. 
Coerces to a single unicode character */ 10189 10190static int 10191convert_uc(PyObject *obj, void *addr) 10192{ 10193 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10194 PyObject *uniobj; 10195 10196 uniobj = PyUnicode_FromObject(obj); 10197 if (uniobj == NULL) { 10198 PyErr_SetString(PyExc_TypeError, 10199 "The fill character cannot be converted to Unicode"); 10200 return 0; 10201 } 10202 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10203 PyErr_SetString(PyExc_TypeError, 10204 "The fill character must be exactly one character long"); 10205 Py_DECREF(uniobj); 10206 return 0; 10207 } 10208 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10209 Py_DECREF(uniobj); 10210 return 1; 10211} 10212 10213PyDoc_STRVAR(center__doc__, 10214 "S.center(width[, fillchar]) -> str\n\ 10215\n\ 10216Return S centered in a string of length width. Padding is\n\ 10217done using the specified fill character (default is a space)"); 10218 10219static PyObject * 10220unicode_center(PyObject *self, PyObject *args) 10221{ 10222 Py_ssize_t marg, left; 10223 Py_ssize_t width; 10224 Py_UCS4 fillchar = ' '; 10225 10226 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10227 return NULL; 10228 10229 if (PyUnicode_READY(self) == -1) 10230 return NULL; 10231 10232 if (PyUnicode_GET_LENGTH(self) >= width) 10233 return unicode_result_unchanged(self); 10234 10235 marg = width - PyUnicode_GET_LENGTH(self); 10236 left = marg / 2 + (marg & width & 1); 10237 10238 return pad(self, left, marg - left, fillchar); 10239} 10240 10241/* This function assumes that str1 and str2 are readied by the caller. 
*/ 10242 10243static int 10244unicode_compare(PyObject *str1, PyObject *str2) 10245{ 10246 int kind1, kind2; 10247 void *data1, *data2; 10248 Py_ssize_t len1, len2; 10249 Py_ssize_t i, len; 10250 10251 /* a string is equal to itself */ 10252 if (str1 == str2) 10253 return 0; 10254 10255 kind1 = PyUnicode_KIND(str1); 10256 kind2 = PyUnicode_KIND(str2); 10257 data1 = PyUnicode_DATA(str1); 10258 data2 = PyUnicode_DATA(str2); 10259 len1 = PyUnicode_GET_LENGTH(str1); 10260 len2 = PyUnicode_GET_LENGTH(str2); 10261 len = Py_MIN(len1, len2); 10262 10263 if (kind1 == 1 && kind2 == 1) { 10264 int cmp = memcmp(data1, data2, len); 10265 /* normalize result of memcmp() into the range [-1; 1] */ 10266 if (cmp < 0) 10267 return -1; 10268 if (cmp > 0) 10269 return 1; 10270 } 10271 else { 10272 for (i = 0; i < len; ++i) { 10273 Py_UCS4 c1, c2; 10274 c1 = PyUnicode_READ(kind1, data1, i); 10275 c2 = PyUnicode_READ(kind2, data2, i); 10276 10277 if (c1 != c2) 10278 return (c1 < c2) ? -1 : 1; 10279 } 10280 } 10281 10282 if (len1 == len2) 10283 return 0; 10284 if (len1 < len2) 10285 return -1; 10286 else 10287 return 1; 10288} 10289 10290static int 10291unicode_compare_eq(PyObject *str1, PyObject *str2) 10292{ 10293 int kind; 10294 void *data1, *data2; 10295 Py_ssize_t len; 10296 int cmp; 10297 10298 /* a string is equal to itself */ 10299 if (str1 == str2) 10300 return 1; 10301 10302 len = PyUnicode_GET_LENGTH(str1); 10303 if (PyUnicode_GET_LENGTH(str2) != len) 10304 return 0; 10305 kind = PyUnicode_KIND(str1); 10306 if (PyUnicode_KIND(str2) != kind) 10307 return 0; 10308 data1 = PyUnicode_DATA(str1); 10309 data2 = PyUnicode_DATA(str2); 10310 10311 cmp = memcmp(data1, data2, len * kind); 10312 return (cmp == 0); 10313} 10314 10315 10316int 10317PyUnicode_Compare(PyObject *left, PyObject *right) 10318{ 10319 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10320 if (PyUnicode_READY(left) == -1 || 10321 PyUnicode_READY(right) == -1) 10322 return -1; 10323 return 
unicode_compare(left, right); 10324 } 10325 PyErr_Format(PyExc_TypeError, 10326 "Can't compare %.100s and %.100s", 10327 left->ob_type->tp_name, 10328 right->ob_type->tp_name); 10329 return -1; 10330} 10331 10332int 10333PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10334{ 10335 Py_ssize_t i; 10336 int kind; 10337 void *data; 10338 Py_UCS4 chr; 10339 10340 assert(_PyUnicode_CHECK(uni)); 10341 if (PyUnicode_READY(uni) == -1) 10342 return -1; 10343 kind = PyUnicode_KIND(uni); 10344 data = PyUnicode_DATA(uni); 10345 /* Compare Unicode string and source character set string */ 10346 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10347 if (chr != str[i]) 10348 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10349 /* This check keeps Python strings that end in '\0' from comparing equal 10350 to C strings identical up to that point. */ 10351 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10352 return 1; /* uni is longer */ 10353 if (str[i]) 10354 return -1; /* str is longer */ 10355 return 0; 10356} 10357 10358 10359#define TEST_COND(cond) \ 10360 ((cond) ? 
Py_True : Py_False) 10361 10362PyObject * 10363PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10364{ 10365 int result; 10366 PyObject *v; 10367 10368 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 10369 Py_RETURN_NOTIMPLEMENTED; 10370 10371 if (PyUnicode_READY(left) == -1 || 10372 PyUnicode_READY(right) == -1) 10373 return NULL; 10374 10375 if (op == Py_EQ || op == Py_NE) { 10376 result = unicode_compare_eq(left, right); 10377 if (op == Py_EQ) 10378 v = TEST_COND(result); 10379 else 10380 v = TEST_COND(!result); 10381 } 10382 else { 10383 result = unicode_compare(left, right); 10384 10385 /* Convert the return value to a Boolean */ 10386 switch (op) { 10387 case Py_LE: 10388 v = TEST_COND(result <= 0); 10389 break; 10390 case Py_GE: 10391 v = TEST_COND(result >= 0); 10392 break; 10393 case Py_LT: 10394 v = TEST_COND(result == -1); 10395 break; 10396 case Py_GT: 10397 v = TEST_COND(result == 1); 10398 break; 10399 default: 10400 PyErr_BadArgument(); 10401 return NULL; 10402 } 10403 } 10404 Py_INCREF(v); 10405 return v; 10406} 10407 10408int 10409PyUnicode_Contains(PyObject *container, PyObject *element) 10410{ 10411 PyObject *str, *sub; 10412 int kind1, kind2, kind; 10413 void *buf1, *buf2; 10414 Py_ssize_t len1, len2; 10415 int result; 10416 10417 /* Coerce the two arguments */ 10418 sub = PyUnicode_FromObject(element); 10419 if (!sub) { 10420 PyErr_Format(PyExc_TypeError, 10421 "'in <string>' requires string as left operand, not %s", 10422 element->ob_type->tp_name); 10423 return -1; 10424 } 10425 10426 str = PyUnicode_FromObject(container); 10427 if (!str) { 10428 Py_DECREF(sub); 10429 return -1; 10430 } 10431 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 10432 Py_DECREF(sub); 10433 Py_DECREF(str); 10434 } 10435 10436 kind1 = PyUnicode_KIND(str); 10437 kind2 = PyUnicode_KIND(sub); 10438 kind = kind1; 10439 buf1 = PyUnicode_DATA(str); 10440 buf2 = PyUnicode_DATA(sub); 10441 if (kind2 != kind) { 10442 if (kind2 > kind) { 
10443 Py_DECREF(sub); 10444 Py_DECREF(str); 10445 return 0; 10446 } 10447 buf2 = _PyUnicode_AsKind(sub, kind); 10448 } 10449 if (!buf2) { 10450 Py_DECREF(sub); 10451 Py_DECREF(str); 10452 return -1; 10453 } 10454 len1 = PyUnicode_GET_LENGTH(str); 10455 len2 = PyUnicode_GET_LENGTH(sub); 10456 10457 switch (kind) { 10458 case PyUnicode_1BYTE_KIND: 10459 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10460 break; 10461 case PyUnicode_2BYTE_KIND: 10462 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10463 break; 10464 case PyUnicode_4BYTE_KIND: 10465 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10466 break; 10467 default: 10468 result = -1; 10469 assert(0); 10470 } 10471 10472 Py_DECREF(str); 10473 Py_DECREF(sub); 10474 10475 if (kind2 != kind) 10476 PyMem_Free(buf2); 10477 10478 return result; 10479} 10480 10481/* Concat to string or Unicode object giving a new Unicode object. */ 10482 10483PyObject * 10484PyUnicode_Concat(PyObject *left, PyObject *right) 10485{ 10486 PyObject *u = NULL, *v = NULL, *w; 10487 Py_UCS4 maxchar, maxchar2; 10488 Py_ssize_t u_len, v_len, new_len; 10489 10490 /* Coerce the two arguments */ 10491 u = PyUnicode_FromObject(left); 10492 if (u == NULL) 10493 goto onError; 10494 v = PyUnicode_FromObject(right); 10495 if (v == NULL) 10496 goto onError; 10497 10498 /* Shortcuts */ 10499 if (v == unicode_empty) { 10500 Py_DECREF(v); 10501 return u; 10502 } 10503 if (u == unicode_empty) { 10504 Py_DECREF(u); 10505 return v; 10506 } 10507 10508 u_len = PyUnicode_GET_LENGTH(u); 10509 v_len = PyUnicode_GET_LENGTH(v); 10510 if (u_len > PY_SSIZE_T_MAX - v_len) { 10511 PyErr_SetString(PyExc_OverflowError, 10512 "strings are too large to concat"); 10513 goto onError; 10514 } 10515 new_len = u_len + v_len; 10516 10517 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10518 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 10519 maxchar = MAX_MAXCHAR(maxchar, maxchar2); 10520 10521 /* Concat the two Unicode strings */ 10522 w = PyUnicode_New(new_len, 
maxchar); 10523 if (w == NULL) 10524 goto onError; 10525 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 10526 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 10527 Py_DECREF(u); 10528 Py_DECREF(v); 10529 assert(_PyUnicode_CheckConsistency(w, 1)); 10530 return w; 10531 10532 onError: 10533 Py_XDECREF(u); 10534 Py_XDECREF(v); 10535 return NULL; 10536} 10537 10538void 10539PyUnicode_Append(PyObject **p_left, PyObject *right) 10540{ 10541 PyObject *left, *res; 10542 Py_UCS4 maxchar, maxchar2; 10543 Py_ssize_t left_len, right_len, new_len; 10544 10545 if (p_left == NULL) { 10546 if (!PyErr_Occurred()) 10547 PyErr_BadInternalCall(); 10548 return; 10549 } 10550 left = *p_left; 10551 if (right == NULL || !PyUnicode_Check(left)) { 10552 if (!PyErr_Occurred()) 10553 PyErr_BadInternalCall(); 10554 goto error; 10555 } 10556 10557 if (PyUnicode_READY(left) == -1) 10558 goto error; 10559 if (PyUnicode_READY(right) == -1) 10560 goto error; 10561 10562 /* Shortcuts */ 10563 if (left == unicode_empty) { 10564 Py_DECREF(left); 10565 Py_INCREF(right); 10566 *p_left = right; 10567 return; 10568 } 10569 if (right == unicode_empty) 10570 return; 10571 10572 left_len = PyUnicode_GET_LENGTH(left); 10573 right_len = PyUnicode_GET_LENGTH(right); 10574 if (left_len > PY_SSIZE_T_MAX - right_len) { 10575 PyErr_SetString(PyExc_OverflowError, 10576 "strings are too large to concat"); 10577 goto error; 10578 } 10579 new_len = left_len + right_len; 10580 10581 if (unicode_modifiable(left) 10582 && PyUnicode_CheckExact(right) 10583 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 10584 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10585 to change the structure size, but characters are stored just after 10586 the structure, and so it requires to move all characters which is 10587 not so different than duplicating the string. 
*/ 10588 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10589 { 10590 /* append inplace */ 10591 if (unicode_resize(p_left, new_len) != 0) { 10592 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10593 * deallocated so it cannot be put back into 10594 * 'variable'. The MemoryError is raised when there 10595 * is no value in 'variable', which might (very 10596 * remotely) be a cause of incompatibilities. 10597 */ 10598 goto error; 10599 } 10600 /* copy 'right' into the newly allocated area of 'left' */ 10601 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 10602 } 10603 else { 10604 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 10605 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 10606 maxchar = MAX_MAXCHAR(maxchar, maxchar2); 10607 10608 /* Concat the two Unicode strings */ 10609 res = PyUnicode_New(new_len, maxchar); 10610 if (res == NULL) 10611 goto error; 10612 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 10613 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 10614 Py_DECREF(left); 10615 *p_left = res; 10616 } 10617 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 10618 return; 10619 10620error: 10621 Py_CLEAR(*p_left); 10622} 10623 10624void 10625PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10626{ 10627 PyUnicode_Append(pleft, right); 10628 Py_XDECREF(right); 10629} 10630 10631PyDoc_STRVAR(count__doc__, 10632 "S.count(sub[, start[, end]]) -> int\n\ 10633\n\ 10634Return the number of non-overlapping occurrences of substring sub in\n\ 10635string S[start:end]. 
Optional arguments start and end are\n\ 10636interpreted as in slice notation."); 10637 10638static PyObject * 10639unicode_count(PyObject *self, PyObject *args) 10640{ 10641 PyObject *substring; 10642 Py_ssize_t start = 0; 10643 Py_ssize_t end = PY_SSIZE_T_MAX; 10644 PyObject *result; 10645 int kind1, kind2, kind; 10646 void *buf1, *buf2; 10647 Py_ssize_t len1, len2, iresult; 10648 10649 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10650 &start, &end)) 10651 return NULL; 10652 10653 kind1 = PyUnicode_KIND(self); 10654 kind2 = PyUnicode_KIND(substring); 10655 if (kind2 > kind1) 10656 return PyLong_FromLong(0); 10657 kind = kind1; 10658 buf1 = PyUnicode_DATA(self); 10659 buf2 = PyUnicode_DATA(substring); 10660 if (kind2 != kind) 10661 buf2 = _PyUnicode_AsKind(substring, kind); 10662 if (!buf2) { 10663 Py_DECREF(substring); 10664 return NULL; 10665 } 10666 len1 = PyUnicode_GET_LENGTH(self); 10667 len2 = PyUnicode_GET_LENGTH(substring); 10668 10669 ADJUST_INDICES(start, end, len1); 10670 switch (kind) { 10671 case PyUnicode_1BYTE_KIND: 10672 iresult = ucs1lib_count( 10673 ((Py_UCS1*)buf1) + start, end - start, 10674 buf2, len2, PY_SSIZE_T_MAX 10675 ); 10676 break; 10677 case PyUnicode_2BYTE_KIND: 10678 iresult = ucs2lib_count( 10679 ((Py_UCS2*)buf1) + start, end - start, 10680 buf2, len2, PY_SSIZE_T_MAX 10681 ); 10682 break; 10683 case PyUnicode_4BYTE_KIND: 10684 iresult = ucs4lib_count( 10685 ((Py_UCS4*)buf1) + start, end - start, 10686 buf2, len2, PY_SSIZE_T_MAX 10687 ); 10688 break; 10689 default: 10690 assert(0); iresult = 0; 10691 } 10692 10693 result = PyLong_FromSsize_t(iresult); 10694 10695 if (kind2 != kind) 10696 PyMem_Free(buf2); 10697 10698 Py_DECREF(substring); 10699 10700 return result; 10701} 10702 10703PyDoc_STRVAR(encode__doc__, 10704 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10705\n\ 10706Encode S using the codec registered for encoding. Default encoding\n\ 10707is 'utf-8'. 
errors may be given to set a different error\n\ 10708handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10709a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10710'xmlcharrefreplace' as well as any other name registered with\n\ 10711codecs.register_error that can handle UnicodeEncodeErrors."); 10712 10713static PyObject * 10714unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 10715{ 10716 static char *kwlist[] = {"encoding", "errors", 0}; 10717 char *encoding = NULL; 10718 char *errors = NULL; 10719 10720 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10721 kwlist, &encoding, &errors)) 10722 return NULL; 10723 return PyUnicode_AsEncodedString(self, encoding, errors); 10724} 10725 10726PyDoc_STRVAR(expandtabs__doc__, 10727 "S.expandtabs([tabsize]) -> str\n\ 10728\n\ 10729Return a copy of S where all tab characters are expanded using spaces.\n\ 10730If tabsize is not given, a tab size of 8 characters is assumed."); 10731 10732static PyObject* 10733unicode_expandtabs(PyObject *self, PyObject *args) 10734{ 10735 Py_ssize_t i, j, line_pos, src_len, incr; 10736 Py_UCS4 ch; 10737 PyObject *u; 10738 void *src_data, *dest_data; 10739 int tabsize = 8; 10740 int kind; 10741 int found; 10742 10743 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10744 return NULL; 10745 10746 if (PyUnicode_READY(self) == -1) 10747 return NULL; 10748 10749 /* First pass: determine size of output string */ 10750 src_len = PyUnicode_GET_LENGTH(self); 10751 i = j = line_pos = 0; 10752 kind = PyUnicode_KIND(self); 10753 src_data = PyUnicode_DATA(self); 10754 found = 0; 10755 for (; i < src_len; i++) { 10756 ch = PyUnicode_READ(kind, src_data, i); 10757 if (ch == '\t') { 10758 found = 1; 10759 if (tabsize > 0) { 10760 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10761 if (j > PY_SSIZE_T_MAX - incr) 10762 goto overflow; 10763 line_pos += incr; 10764 j += incr; 10765 } 10766 } 10767 else { 10768 if (j 
> PY_SSIZE_T_MAX - 1) 10769 goto overflow; 10770 line_pos++; 10771 j++; 10772 if (ch == '\n' || ch == '\r') 10773 line_pos = 0; 10774 } 10775 } 10776 if (!found) 10777 return unicode_result_unchanged(self); 10778 10779 /* Second pass: create output string and fill it */ 10780 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10781 if (!u) 10782 return NULL; 10783 dest_data = PyUnicode_DATA(u); 10784 10785 i = j = line_pos = 0; 10786 10787 for (; i < src_len; i++) { 10788 ch = PyUnicode_READ(kind, src_data, i); 10789 if (ch == '\t') { 10790 if (tabsize > 0) { 10791 incr = tabsize - (line_pos % tabsize); 10792 line_pos += incr; 10793 FILL(kind, dest_data, ' ', j, incr); 10794 j += incr; 10795 } 10796 } 10797 else { 10798 line_pos++; 10799 PyUnicode_WRITE(kind, dest_data, j, ch); 10800 j++; 10801 if (ch == '\n' || ch == '\r') 10802 line_pos = 0; 10803 } 10804 } 10805 assert (j == PyUnicode_GET_LENGTH(u)); 10806 return unicode_result(u); 10807 10808 overflow: 10809 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10810 return NULL; 10811} 10812 10813PyDoc_STRVAR(find__doc__, 10814 "S.find(sub[, start[, end]]) -> int\n\ 10815\n\ 10816Return the lowest index in S where substring sub is found,\n\ 10817such that sub is contained within S[start:end]. 
Optional\n\ 10818arguments start and end are interpreted as in slice notation.\n\ 10819\n\ 10820Return -1 on failure."); 10821 10822static PyObject * 10823unicode_find(PyObject *self, PyObject *args) 10824{ 10825 PyObject *substring; 10826 Py_ssize_t start; 10827 Py_ssize_t end; 10828 Py_ssize_t result; 10829 10830 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10831 &start, &end)) 10832 return NULL; 10833 10834 if (PyUnicode_READY(self) == -1) 10835 return NULL; 10836 if (PyUnicode_READY(substring) == -1) 10837 return NULL; 10838 10839 result = any_find_slice(1, self, substring, start, end); 10840 10841 Py_DECREF(substring); 10842 10843 if (result == -2) 10844 return NULL; 10845 10846 return PyLong_FromSsize_t(result); 10847} 10848 10849static PyObject * 10850unicode_getitem(PyObject *self, Py_ssize_t index) 10851{ 10852 void *data; 10853 enum PyUnicode_Kind kind; 10854 Py_UCS4 ch; 10855 PyObject *res; 10856 10857 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 10858 PyErr_BadArgument(); 10859 return NULL; 10860 } 10861 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 10862 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10863 return NULL; 10864 } 10865 kind = PyUnicode_KIND(self); 10866 data = PyUnicode_DATA(self); 10867 ch = PyUnicode_READ(kind, data, index); 10868 if (ch < 256) 10869 return get_latin1_char(ch); 10870 10871 res = PyUnicode_New(1, ch); 10872 if (res == NULL) 10873 return NULL; 10874 kind = PyUnicode_KIND(res); 10875 data = PyUnicode_DATA(res); 10876 PyUnicode_WRITE(kind, data, 0, ch); 10877 assert(_PyUnicode_CheckConsistency(res, 1)); 10878 return res; 10879} 10880 10881/* Believe it or not, this produces the same value for ASCII strings 10882 as bytes_hash(). 
*/ 10883static Py_hash_t 10884unicode_hash(PyObject *self) 10885{ 10886 Py_ssize_t len; 10887 Py_uhash_t x; 10888 10889#ifdef Py_DEBUG 10890 assert(_Py_HashSecret_Initialized); 10891#endif 10892 if (_PyUnicode_HASH(self) != -1) 10893 return _PyUnicode_HASH(self); 10894 if (PyUnicode_READY(self) == -1) 10895 return -1; 10896 len = PyUnicode_GET_LENGTH(self); 10897 /* 10898 We make the hash of the empty string be 0, rather than using 10899 (prefix ^ suffix), since this slightly obfuscates the hash secret 10900 */ 10901 if (len == 0) { 10902 _PyUnicode_HASH(self) = 0; 10903 return 0; 10904 } 10905 10906 /* The hash function as a macro, gets expanded three times below. */ 10907#define HASH(P) \ 10908 x ^= (Py_uhash_t) *P << 7; \ 10909 while (--len >= 0) \ 10910 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \ 10911 10912 x = (Py_uhash_t) _Py_HashSecret.prefix; 10913 switch (PyUnicode_KIND(self)) { 10914 case PyUnicode_1BYTE_KIND: { 10915 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 10916 HASH(c); 10917 break; 10918 } 10919 case PyUnicode_2BYTE_KIND: { 10920 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 10921 HASH(s); 10922 break; 10923 } 10924 default: { 10925 Py_UCS4 *l; 10926 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 10927 "Impossible switch case in unicode_hash"); 10928 l = PyUnicode_4BYTE_DATA(self); 10929 HASH(l); 10930 break; 10931 } 10932 } 10933 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self); 10934 x ^= (Py_uhash_t) _Py_HashSecret.suffix; 10935 10936 if (x == -1) 10937 x = -2; 10938 _PyUnicode_HASH(self) = x; 10939 return x; 10940} 10941#undef HASH 10942 10943PyDoc_STRVAR(index__doc__, 10944 "S.index(sub[, start[, end]]) -> int\n\ 10945\n\ 10946Like S.find() but raise ValueError when the substring is not found."); 10947 10948static PyObject * 10949unicode_index(PyObject *self, PyObject *args) 10950{ 10951 Py_ssize_t result; 10952 PyObject *substring; 10953 Py_ssize_t start; 10954 Py_ssize_t end; 10955 10956 if 
(!stringlib_parse_args_finds_unicode("index", args, &substring, 10957 &start, &end)) 10958 return NULL; 10959 10960 if (PyUnicode_READY(self) == -1) 10961 return NULL; 10962 if (PyUnicode_READY(substring) == -1) 10963 return NULL; 10964 10965 result = any_find_slice(1, self, substring, start, end); 10966 10967 Py_DECREF(substring); 10968 10969 if (result == -2) 10970 return NULL; 10971 10972 if (result < 0) { 10973 PyErr_SetString(PyExc_ValueError, "substring not found"); 10974 return NULL; 10975 } 10976 10977 return PyLong_FromSsize_t(result); 10978} 10979 10980PyDoc_STRVAR(islower__doc__, 10981 "S.islower() -> bool\n\ 10982\n\ 10983Return True if all cased characters in S are lowercase and there is\n\ 10984at least one cased character in S, False otherwise."); 10985 10986static PyObject* 10987unicode_islower(PyObject *self) 10988{ 10989 Py_ssize_t i, length; 10990 int kind; 10991 void *data; 10992 int cased; 10993 10994 if (PyUnicode_READY(self) == -1) 10995 return NULL; 10996 length = PyUnicode_GET_LENGTH(self); 10997 kind = PyUnicode_KIND(self); 10998 data = PyUnicode_DATA(self); 10999 11000 /* Shortcut for single character strings */ 11001 if (length == 1) 11002 return PyBool_FromLong( 11003 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11004 11005 /* Special case for empty strings */ 11006 if (length == 0) 11007 return PyBool_FromLong(0); 11008 11009 cased = 0; 11010 for (i = 0; i < length; i++) { 11011 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11012 11013 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11014 return PyBool_FromLong(0); 11015 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11016 cased = 1; 11017 } 11018 return PyBool_FromLong(cased); 11019} 11020 11021PyDoc_STRVAR(isupper__doc__, 11022 "S.isupper() -> bool\n\ 11023\n\ 11024Return True if all cased characters in S are uppercase and there is\n\ 11025at least one cased character in S, False otherwise."); 11026 11027static PyObject* 11028unicode_isupper(PyObject *self) 11029{ 11030 
Py_ssize_t i, length; 11031 int kind; 11032 void *data; 11033 int cased; 11034 11035 if (PyUnicode_READY(self) == -1) 11036 return NULL; 11037 length = PyUnicode_GET_LENGTH(self); 11038 kind = PyUnicode_KIND(self); 11039 data = PyUnicode_DATA(self); 11040 11041 /* Shortcut for single character strings */ 11042 if (length == 1) 11043 return PyBool_FromLong( 11044 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11045 11046 /* Special case for empty strings */ 11047 if (length == 0) 11048 return PyBool_FromLong(0); 11049 11050 cased = 0; 11051 for (i = 0; i < length; i++) { 11052 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11053 11054 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11055 return PyBool_FromLong(0); 11056 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11057 cased = 1; 11058 } 11059 return PyBool_FromLong(cased); 11060} 11061 11062PyDoc_STRVAR(istitle__doc__, 11063 "S.istitle() -> bool\n\ 11064\n\ 11065Return True if S is a titlecased string and there is at least one\n\ 11066character in S, i.e. 
upper- and titlecase characters may only\n\ 11067follow uncased characters and lowercase characters only cased ones.\n\ 11068Return False otherwise."); 11069 11070static PyObject* 11071unicode_istitle(PyObject *self) 11072{ 11073 Py_ssize_t i, length; 11074 int kind; 11075 void *data; 11076 int cased, previous_is_cased; 11077 11078 if (PyUnicode_READY(self) == -1) 11079 return NULL; 11080 length = PyUnicode_GET_LENGTH(self); 11081 kind = PyUnicode_KIND(self); 11082 data = PyUnicode_DATA(self); 11083 11084 /* Shortcut for single character strings */ 11085 if (length == 1) { 11086 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11087 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11088 (Py_UNICODE_ISUPPER(ch) != 0)); 11089 } 11090 11091 /* Special case for empty strings */ 11092 if (length == 0) 11093 return PyBool_FromLong(0); 11094 11095 cased = 0; 11096 previous_is_cased = 0; 11097 for (i = 0; i < length; i++) { 11098 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11099 11100 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11101 if (previous_is_cased) 11102 return PyBool_FromLong(0); 11103 previous_is_cased = 1; 11104 cased = 1; 11105 } 11106 else if (Py_UNICODE_ISLOWER(ch)) { 11107 if (!previous_is_cased) 11108 return PyBool_FromLong(0); 11109 previous_is_cased = 1; 11110 cased = 1; 11111 } 11112 else 11113 previous_is_cased = 0; 11114 } 11115 return PyBool_FromLong(cased); 11116} 11117 11118PyDoc_STRVAR(isspace__doc__, 11119 "S.isspace() -> bool\n\ 11120\n\ 11121Return True if all characters in S are whitespace\n\ 11122and there is at least one character in S, False otherwise."); 11123 11124static PyObject* 11125unicode_isspace(PyObject *self) 11126{ 11127 Py_ssize_t i, length; 11128 int kind; 11129 void *data; 11130 11131 if (PyUnicode_READY(self) == -1) 11132 return NULL; 11133 length = PyUnicode_GET_LENGTH(self); 11134 kind = PyUnicode_KIND(self); 11135 data = PyUnicode_DATA(self); 11136 11137 /* Shortcut for single character strings */ 11138 
if (length == 1) 11139 return PyBool_FromLong( 11140 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11141 11142 /* Special case for empty strings */ 11143 if (length == 0) 11144 return PyBool_FromLong(0); 11145 11146 for (i = 0; i < length; i++) { 11147 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11148 if (!Py_UNICODE_ISSPACE(ch)) 11149 return PyBool_FromLong(0); 11150 } 11151 return PyBool_FromLong(1); 11152} 11153 11154PyDoc_STRVAR(isalpha__doc__, 11155 "S.isalpha() -> bool\n\ 11156\n\ 11157Return True if all characters in S are alphabetic\n\ 11158and there is at least one character in S, False otherwise."); 11159 11160static PyObject* 11161unicode_isalpha(PyObject *self) 11162{ 11163 Py_ssize_t i, length; 11164 int kind; 11165 void *data; 11166 11167 if (PyUnicode_READY(self) == -1) 11168 return NULL; 11169 length = PyUnicode_GET_LENGTH(self); 11170 kind = PyUnicode_KIND(self); 11171 data = PyUnicode_DATA(self); 11172 11173 /* Shortcut for single character strings */ 11174 if (length == 1) 11175 return PyBool_FromLong( 11176 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11177 11178 /* Special case for empty strings */ 11179 if (length == 0) 11180 return PyBool_FromLong(0); 11181 11182 for (i = 0; i < length; i++) { 11183 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11184 return PyBool_FromLong(0); 11185 } 11186 return PyBool_FromLong(1); 11187} 11188 11189PyDoc_STRVAR(isalnum__doc__, 11190 "S.isalnum() -> bool\n\ 11191\n\ 11192Return True if all characters in S are alphanumeric\n\ 11193and there is at least one character in S, False otherwise."); 11194 11195static PyObject* 11196unicode_isalnum(PyObject *self) 11197{ 11198 int kind; 11199 void *data; 11200 Py_ssize_t len, i; 11201 11202 if (PyUnicode_READY(self) == -1) 11203 return NULL; 11204 11205 kind = PyUnicode_KIND(self); 11206 data = PyUnicode_DATA(self); 11207 len = PyUnicode_GET_LENGTH(self); 11208 11209 /* Shortcut for single character strings */ 11210 if (len == 1) { 11211 
const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11212 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11213 } 11214 11215 /* Special case for empty strings */ 11216 if (len == 0) 11217 return PyBool_FromLong(0); 11218 11219 for (i = 0; i < len; i++) { 11220 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11221 if (!Py_UNICODE_ISALNUM(ch)) 11222 return PyBool_FromLong(0); 11223 } 11224 return PyBool_FromLong(1); 11225} 11226 11227PyDoc_STRVAR(isdecimal__doc__, 11228 "S.isdecimal() -> bool\n\ 11229\n\ 11230Return True if there are only decimal characters in S,\n\ 11231False otherwise."); 11232 11233static PyObject* 11234unicode_isdecimal(PyObject *self) 11235{ 11236 Py_ssize_t i, length; 11237 int kind; 11238 void *data; 11239 11240 if (PyUnicode_READY(self) == -1) 11241 return NULL; 11242 length = PyUnicode_GET_LENGTH(self); 11243 kind = PyUnicode_KIND(self); 11244 data = PyUnicode_DATA(self); 11245 11246 /* Shortcut for single character strings */ 11247 if (length == 1) 11248 return PyBool_FromLong( 11249 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11250 11251 /* Special case for empty strings */ 11252 if (length == 0) 11253 return PyBool_FromLong(0); 11254 11255 for (i = 0; i < length; i++) { 11256 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11257 return PyBool_FromLong(0); 11258 } 11259 return PyBool_FromLong(1); 11260} 11261 11262PyDoc_STRVAR(isdigit__doc__, 11263 "S.isdigit() -> bool\n\ 11264\n\ 11265Return True if all characters in S are digits\n\ 11266and there is at least one character in S, False otherwise."); 11267 11268static PyObject* 11269unicode_isdigit(PyObject *self) 11270{ 11271 Py_ssize_t i, length; 11272 int kind; 11273 void *data; 11274 11275 if (PyUnicode_READY(self) == -1) 11276 return NULL; 11277 length = PyUnicode_GET_LENGTH(self); 11278 kind = PyUnicode_KIND(self); 11279 data = PyUnicode_DATA(self); 11280 11281 /* Shortcut for single character strings */ 11282 if (length == 1) { 11283 const Py_UCS4 ch = 
PyUnicode_READ(kind, data, 0); 11284 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11285 } 11286 11287 /* Special case for empty strings */ 11288 if (length == 0) 11289 return PyBool_FromLong(0); 11290 11291 for (i = 0; i < length; i++) { 11292 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11293 return PyBool_FromLong(0); 11294 } 11295 return PyBool_FromLong(1); 11296} 11297 11298PyDoc_STRVAR(isnumeric__doc__, 11299 "S.isnumeric() -> bool\n\ 11300\n\ 11301Return True if there are only numeric characters in S,\n\ 11302False otherwise."); 11303 11304static PyObject* 11305unicode_isnumeric(PyObject *self) 11306{ 11307 Py_ssize_t i, length; 11308 int kind; 11309 void *data; 11310 11311 if (PyUnicode_READY(self) == -1) 11312 return NULL; 11313 length = PyUnicode_GET_LENGTH(self); 11314 kind = PyUnicode_KIND(self); 11315 data = PyUnicode_DATA(self); 11316 11317 /* Shortcut for single character strings */ 11318 if (length == 1) 11319 return PyBool_FromLong( 11320 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11321 11322 /* Special case for empty strings */ 11323 if (length == 0) 11324 return PyBool_FromLong(0); 11325 11326 for (i = 0; i < length; i++) { 11327 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11328 return PyBool_FromLong(0); 11329 } 11330 return PyBool_FromLong(1); 11331} 11332 11333int 11334PyUnicode_IsIdentifier(PyObject *self) 11335{ 11336 int kind; 11337 void *data; 11338 Py_ssize_t i; 11339 Py_UCS4 first; 11340 11341 if (PyUnicode_READY(self) == -1) { 11342 Py_FatalError("identifier not ready"); 11343 return 0; 11344 } 11345 11346 /* Special case for empty strings */ 11347 if (PyUnicode_GET_LENGTH(self) == 0) 11348 return 0; 11349 kind = PyUnicode_KIND(self); 11350 data = PyUnicode_DATA(self); 11351 11352 /* PEP 3131 says that the first character must be in 11353 XID_Start and subsequent characters in XID_Continue, 11354 and for the ASCII range, the 2.x rules apply (i.e 11355 start with letters and underscore, continue with 
11356 letters, digits, underscore). However, given the current 11357 definition of XID_Start and XID_Continue, it is sufficient 11358 to check just for these, except that _ must be allowed 11359 as starting an identifier. */ 11360 first = PyUnicode_READ(kind, data, 0); 11361 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11362 return 0; 11363 11364 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11365 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11366 return 0; 11367 return 1; 11368} 11369 11370PyDoc_STRVAR(isidentifier__doc__, 11371 "S.isidentifier() -> bool\n\ 11372\n\ 11373Return True if S is a valid identifier according\n\ 11374to the language definition."); 11375 11376static PyObject* 11377unicode_isidentifier(PyObject *self) 11378{ 11379 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11380} 11381 11382PyDoc_STRVAR(isprintable__doc__, 11383 "S.isprintable() -> bool\n\ 11384\n\ 11385Return True if all characters in S are considered\n\ 11386printable in repr() or S is empty, False otherwise."); 11387 11388static PyObject* 11389unicode_isprintable(PyObject *self) 11390{ 11391 Py_ssize_t i, length; 11392 int kind; 11393 void *data; 11394 11395 if (PyUnicode_READY(self) == -1) 11396 return NULL; 11397 length = PyUnicode_GET_LENGTH(self); 11398 kind = PyUnicode_KIND(self); 11399 data = PyUnicode_DATA(self); 11400 11401 /* Shortcut for single character strings */ 11402 if (length == 1) 11403 return PyBool_FromLong( 11404 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11405 11406 for (i = 0; i < length; i++) { 11407 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11408 Py_RETURN_FALSE; 11409 } 11410 } 11411 Py_RETURN_TRUE; 11412} 11413 11414PyDoc_STRVAR(join__doc__, 11415 "S.join(iterable) -> str\n\ 11416\n\ 11417Return a string which is the concatenation of the strings in the\n\ 11418iterable. 
The separator between elements is S."); 11419 11420static PyObject* 11421unicode_join(PyObject *self, PyObject *data) 11422{ 11423 return PyUnicode_Join(self, data); 11424} 11425 11426static Py_ssize_t 11427unicode_length(PyObject *self) 11428{ 11429 if (PyUnicode_READY(self) == -1) 11430 return -1; 11431 return PyUnicode_GET_LENGTH(self); 11432} 11433 11434PyDoc_STRVAR(ljust__doc__, 11435 "S.ljust(width[, fillchar]) -> str\n\ 11436\n\ 11437Return S left-justified in a Unicode string of length width. Padding is\n\ 11438done using the specified fill character (default is a space)."); 11439 11440static PyObject * 11441unicode_ljust(PyObject *self, PyObject *args) 11442{ 11443 Py_ssize_t width; 11444 Py_UCS4 fillchar = ' '; 11445 11446 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11447 return NULL; 11448 11449 if (PyUnicode_READY(self) == -1) 11450 return NULL; 11451 11452 if (PyUnicode_GET_LENGTH(self) >= width) 11453 return unicode_result_unchanged(self); 11454 11455 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11456} 11457 11458PyDoc_STRVAR(lower__doc__, 11459 "S.lower() -> str\n\ 11460\n\ 11461Return a copy of the string S converted to lowercase."); 11462 11463static PyObject* 11464unicode_lower(PyObject *self) 11465{ 11466 if (PyUnicode_READY(self) == -1) 11467 return NULL; 11468 if (PyUnicode_IS_ASCII(self)) 11469 return ascii_upper_or_lower(self, 1); 11470 return case_operation(self, do_lower); 11471} 11472 11473#define LEFTSTRIP 0 11474#define RIGHTSTRIP 1 11475#define BOTHSTRIP 2 11476 11477/* Arrays indexed by above */ 11478static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11479 11480#define STRIPNAME(i) (stripformat[i]+3) 11481 11482/* externally visible for str.strip(unicode) */ 11483PyObject * 11484_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11485{ 11486 void *data; 11487 int kind; 11488 Py_ssize_t i, j, len; 11489 BLOOM_MASK sepmask; 11490 11491 if 
(PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11492 return NULL; 11493 11494 kind = PyUnicode_KIND(self); 11495 data = PyUnicode_DATA(self); 11496 len = PyUnicode_GET_LENGTH(self); 11497 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11498 PyUnicode_DATA(sepobj), 11499 PyUnicode_GET_LENGTH(sepobj)); 11500 11501 i = 0; 11502 if (striptype != RIGHTSTRIP) { 11503 while (i < len && 11504 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11505 i++; 11506 } 11507 } 11508 11509 j = len; 11510 if (striptype != LEFTSTRIP) { 11511 do { 11512 j--; 11513 } while (j >= i && 11514 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11515 j++; 11516 } 11517 11518 return PyUnicode_Substring(self, i, j); 11519} 11520 11521PyObject* 11522PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11523{ 11524 unsigned char *data; 11525 int kind; 11526 Py_ssize_t length; 11527 11528 if (PyUnicode_READY(self) == -1) 11529 return NULL; 11530 11531 length = PyUnicode_GET_LENGTH(self); 11532 end = Py_MIN(end, length); 11533 11534 if (start == 0 && end == length) 11535 return unicode_result_unchanged(self); 11536 11537 if (start < 0 || end < 0) { 11538 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11539 return NULL; 11540 } 11541 if (start >= length || end < start) { 11542 Py_INCREF(unicode_empty); 11543 return unicode_empty; 11544 } 11545 11546 length = end - start; 11547 if (PyUnicode_IS_ASCII(self)) { 11548 data = PyUnicode_1BYTE_DATA(self); 11549 return _PyUnicode_FromASCII((char*)(data + start), length); 11550 } 11551 else { 11552 kind = PyUnicode_KIND(self); 11553 data = PyUnicode_1BYTE_DATA(self); 11554 return PyUnicode_FromKindAndData(kind, 11555 data + kind * start, 11556 length); 11557 } 11558} 11559 11560static PyObject * 11561do_strip(PyObject *self, int striptype) 11562{ 11563 int kind; 11564 void *data; 11565 Py_ssize_t len, i, j; 11566 11567 if (PyUnicode_READY(self) == -1) 11568 return NULL; 11569 
11570 kind = PyUnicode_KIND(self); 11571 data = PyUnicode_DATA(self); 11572 len = PyUnicode_GET_LENGTH(self); 11573 11574 i = 0; 11575 if (striptype != RIGHTSTRIP) { 11576 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11577 i++; 11578 } 11579 } 11580 11581 j = len; 11582 if (striptype != LEFTSTRIP) { 11583 do { 11584 j--; 11585 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11586 j++; 11587 } 11588 11589 return PyUnicode_Substring(self, i, j); 11590} 11591 11592 11593static PyObject * 11594do_argstrip(PyObject *self, int striptype, PyObject *args) 11595{ 11596 PyObject *sep = NULL; 11597 11598 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11599 return NULL; 11600 11601 if (sep != NULL && sep != Py_None) { 11602 if (PyUnicode_Check(sep)) 11603 return _PyUnicode_XStrip(self, striptype, sep); 11604 else { 11605 PyErr_Format(PyExc_TypeError, 11606 "%s arg must be None or str", 11607 STRIPNAME(striptype)); 11608 return NULL; 11609 } 11610 } 11611 11612 return do_strip(self, striptype); 11613} 11614 11615 11616PyDoc_STRVAR(strip__doc__, 11617 "S.strip([chars]) -> str\n\ 11618\n\ 11619Return a copy of the string S with leading and trailing\n\ 11620whitespace removed.\n\ 11621If chars is given and not None, remove characters in chars instead."); 11622 11623static PyObject * 11624unicode_strip(PyObject *self, PyObject *args) 11625{ 11626 if (PyTuple_GET_SIZE(args) == 0) 11627 return do_strip(self, BOTHSTRIP); /* Common case */ 11628 else 11629 return do_argstrip(self, BOTHSTRIP, args); 11630} 11631 11632 11633PyDoc_STRVAR(lstrip__doc__, 11634 "S.lstrip([chars]) -> str\n\ 11635\n\ 11636Return a copy of the string S with leading whitespace removed.\n\ 11637If chars is given and not None, remove characters in chars instead."); 11638 11639static PyObject * 11640unicode_lstrip(PyObject *self, PyObject *args) 11641{ 11642 if (PyTuple_GET_SIZE(args) == 0) 11643 return do_strip(self, LEFTSTRIP); /* Common case */ 
11644 else 11645 return do_argstrip(self, LEFTSTRIP, args); 11646} 11647 11648 11649PyDoc_STRVAR(rstrip__doc__, 11650 "S.rstrip([chars]) -> str\n\ 11651\n\ 11652Return a copy of the string S with trailing whitespace removed.\n\ 11653If chars is given and not None, remove characters in chars instead."); 11654 11655static PyObject * 11656unicode_rstrip(PyObject *self, PyObject *args) 11657{ 11658 if (PyTuple_GET_SIZE(args) == 0) 11659 return do_strip(self, RIGHTSTRIP); /* Common case */ 11660 else 11661 return do_argstrip(self, RIGHTSTRIP, args); 11662} 11663 11664 11665static PyObject* 11666unicode_repeat(PyObject *str, Py_ssize_t len) 11667{ 11668 PyObject *u; 11669 Py_ssize_t nchars, n; 11670 11671 if (len < 1) { 11672 Py_INCREF(unicode_empty); 11673 return unicode_empty; 11674 } 11675 11676 /* no repeat, return original string */ 11677 if (len == 1) 11678 return unicode_result_unchanged(str); 11679 11680 if (PyUnicode_READY(str) == -1) 11681 return NULL; 11682 11683 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11684 PyErr_SetString(PyExc_OverflowError, 11685 "repeated string is too long"); 11686 return NULL; 11687 } 11688 nchars = len * PyUnicode_GET_LENGTH(str); 11689 11690 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11691 if (!u) 11692 return NULL; 11693 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11694 11695 if (PyUnicode_GET_LENGTH(str) == 1) { 11696 const int kind = PyUnicode_KIND(str); 11697 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11698 if (kind == PyUnicode_1BYTE_KIND) { 11699 void *to = PyUnicode_DATA(u); 11700 memset(to, (unsigned char)fill_char, len); 11701 } 11702 else if (kind == PyUnicode_2BYTE_KIND) { 11703 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 11704 for (n = 0; n < len; ++n) 11705 ucs2[n] = fill_char; 11706 } else { 11707 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 11708 assert(kind == PyUnicode_4BYTE_KIND); 11709 for (n = 0; n < len; ++n) 11710 ucs4[n] = fill_char; 11711 } 11712 } 
11713 else { 11714 /* number of characters copied this far */ 11715 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11716 const Py_ssize_t char_size = PyUnicode_KIND(str); 11717 char *to = (char *) PyUnicode_DATA(u); 11718 Py_MEMCPY(to, PyUnicode_DATA(str), 11719 PyUnicode_GET_LENGTH(str) * char_size); 11720 while (done < nchars) { 11721 n = (done <= nchars-done) ? done : nchars-done; 11722 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11723 done += n; 11724 } 11725 } 11726 11727 assert(_PyUnicode_CheckConsistency(u, 1)); 11728 return u; 11729} 11730 11731PyObject * 11732PyUnicode_Replace(PyObject *obj, 11733 PyObject *subobj, 11734 PyObject *replobj, 11735 Py_ssize_t maxcount) 11736{ 11737 PyObject *self; 11738 PyObject *str1; 11739 PyObject *str2; 11740 PyObject *result; 11741 11742 self = PyUnicode_FromObject(obj); 11743 if (self == NULL) 11744 return NULL; 11745 str1 = PyUnicode_FromObject(subobj); 11746 if (str1 == NULL) { 11747 Py_DECREF(self); 11748 return NULL; 11749 } 11750 str2 = PyUnicode_FromObject(replobj); 11751 if (str2 == NULL) { 11752 Py_DECREF(self); 11753 Py_DECREF(str1); 11754 return NULL; 11755 } 11756 if (PyUnicode_READY(self) == -1 || 11757 PyUnicode_READY(str1) == -1 || 11758 PyUnicode_READY(str2) == -1) 11759 result = NULL; 11760 else 11761 result = replace(self, str1, str2, maxcount); 11762 Py_DECREF(self); 11763 Py_DECREF(str1); 11764 Py_DECREF(str2); 11765 return result; 11766} 11767 11768PyDoc_STRVAR(replace__doc__, 11769 "S.replace(old, new[, count]) -> str\n\ 11770\n\ 11771Return a copy of S with all occurrences of substring\n\ 11772old replaced by new. 
If the optional argument count is\n\ 11773given, only the first count occurrences are replaced."); 11774 11775static PyObject* 11776unicode_replace(PyObject *self, PyObject *args) 11777{ 11778 PyObject *str1; 11779 PyObject *str2; 11780 Py_ssize_t maxcount = -1; 11781 PyObject *result; 11782 11783 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11784 return NULL; 11785 if (PyUnicode_READY(self) == -1) 11786 return NULL; 11787 str1 = PyUnicode_FromObject(str1); 11788 if (str1 == NULL) 11789 return NULL; 11790 str2 = PyUnicode_FromObject(str2); 11791 if (str2 == NULL) { 11792 Py_DECREF(str1); 11793 return NULL; 11794 } 11795 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) 11796 result = NULL; 11797 else 11798 result = replace(self, str1, str2, maxcount); 11799 11800 Py_DECREF(str1); 11801 Py_DECREF(str2); 11802 return result; 11803} 11804 11805static PyObject * 11806unicode_repr(PyObject *unicode) 11807{ 11808 PyObject *repr; 11809 Py_ssize_t isize; 11810 Py_ssize_t osize, squote, dquote, i, o; 11811 Py_UCS4 max, quote; 11812 int ikind, okind; 11813 void *idata, *odata; 11814 11815 if (PyUnicode_READY(unicode) == -1) 11816 return NULL; 11817 11818 isize = PyUnicode_GET_LENGTH(unicode); 11819 idata = PyUnicode_DATA(unicode); 11820 11821 /* Compute length of output, quote characters, and 11822 maximum character */ 11823 osize = 2; /* quotes */ 11824 max = 127; 11825 squote = dquote = 0; 11826 ikind = PyUnicode_KIND(unicode); 11827 for (i = 0; i < isize; i++) { 11828 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11829 switch (ch) { 11830 case '\'': squote++; osize++; break; 11831 case '"': dquote++; osize++; break; 11832 case '\\': case '\t': case '\r': case '\n': 11833 osize += 2; break; 11834 default: 11835 /* Fast-path ASCII */ 11836 if (ch < ' ' || ch == 0x7f) 11837 osize += 4; /* \xHH */ 11838 else if (ch < 0x7f) 11839 osize++; 11840 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11841 osize++; 11842 max = ch > max ? 
ch : max; 11843 } 11844 else if (ch < 0x100) 11845 osize += 4; /* \xHH */ 11846 else if (ch < 0x10000) 11847 osize += 6; /* \uHHHH */ 11848 else 11849 osize += 10; /* \uHHHHHHHH */ 11850 } 11851 } 11852 11853 quote = '\''; 11854 if (squote) { 11855 if (dquote) 11856 /* Both squote and dquote present. Use squote, 11857 and escape them */ 11858 osize += squote; 11859 else 11860 quote = '"'; 11861 } 11862 11863 repr = PyUnicode_New(osize, max); 11864 if (repr == NULL) 11865 return NULL; 11866 okind = PyUnicode_KIND(repr); 11867 odata = PyUnicode_DATA(repr); 11868 11869 PyUnicode_WRITE(okind, odata, 0, quote); 11870 PyUnicode_WRITE(okind, odata, osize-1, quote); 11871 11872 for (i = 0, o = 1; i < isize; i++) { 11873 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11874 11875 /* Escape quotes and backslashes */ 11876 if ((ch == quote) || (ch == '\\')) { 11877 PyUnicode_WRITE(okind, odata, o++, '\\'); 11878 PyUnicode_WRITE(okind, odata, o++, ch); 11879 continue; 11880 } 11881 11882 /* Map special whitespace to '\t', \n', '\r' */ 11883 if (ch == '\t') { 11884 PyUnicode_WRITE(okind, odata, o++, '\\'); 11885 PyUnicode_WRITE(okind, odata, o++, 't'); 11886 } 11887 else if (ch == '\n') { 11888 PyUnicode_WRITE(okind, odata, o++, '\\'); 11889 PyUnicode_WRITE(okind, odata, o++, 'n'); 11890 } 11891 else if (ch == '\r') { 11892 PyUnicode_WRITE(okind, odata, o++, '\\'); 11893 PyUnicode_WRITE(okind, odata, o++, 'r'); 11894 } 11895 11896 /* Map non-printable US ASCII to '\xhh' */ 11897 else if (ch < ' ' || ch == 0x7F) { 11898 PyUnicode_WRITE(okind, odata, o++, '\\'); 11899 PyUnicode_WRITE(okind, odata, o++, 'x'); 11900 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 11901 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 11902 } 11903 11904 /* Copy ASCII characters as-is */ 11905 else if (ch < 0x7F) { 11906 PyUnicode_WRITE(okind, odata, o++, ch); 11907 } 11908 11909 /* Non-ASCII characters */ 11910 else { 11911 /* Map Unicode whitespace and control 
characters 11912 (categories Z* and C* except ASCII space) 11913 */ 11914 if (!Py_UNICODE_ISPRINTABLE(ch)) { 11915 PyUnicode_WRITE(okind, odata, o++, '\\'); 11916 /* Map 8-bit characters to '\xhh' */ 11917 if (ch <= 0xff) { 11918 PyUnicode_WRITE(okind, odata, o++, 'x'); 11919 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 11920 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 11921 } 11922 /* Map 16-bit characters to '\uxxxx' */ 11923 else if (ch <= 0xffff) { 11924 PyUnicode_WRITE(okind, odata, o++, 'u'); 11925 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 11926 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 11927 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 11928 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 11929 } 11930 /* Map 21-bit characters to '\U00xxxxxx' */ 11931 else { 11932 PyUnicode_WRITE(okind, odata, o++, 'U'); 11933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 11934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 11935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 11936 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 11937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 11938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 11939 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 11940 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 11941 } 11942 } 11943 /* Copy characters as-is */ 11944 else { 11945 PyUnicode_WRITE(okind, odata, o++, ch); 11946 } 11947 } 11948 } 11949 /* Closing quote already added at the beginning */ 11950 assert(_PyUnicode_CheckConsistency(repr, 1)); 11951 return repr; 11952} 11953 11954PyDoc_STRVAR(rfind__doc__, 11955 "S.rfind(sub[, start[, end]]) -> int\n\ 11956\n\ 11957Return the highest index in S where substring sub is found,\n\ 11958such that sub is contained 
within S[start:end]. Optional\n\ 11959arguments start and end are interpreted as in slice notation.\n\ 11960\n\ 11961Return -1 on failure."); 11962 11963static PyObject * 11964unicode_rfind(PyObject *self, PyObject *args) 11965{ 11966 PyObject *substring; 11967 Py_ssize_t start; 11968 Py_ssize_t end; 11969 Py_ssize_t result; 11970 11971 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 11972 &start, &end)) 11973 return NULL; 11974 11975 if (PyUnicode_READY(self) == -1) 11976 return NULL; 11977 if (PyUnicode_READY(substring) == -1) 11978 return NULL; 11979 11980 result = any_find_slice(-1, self, substring, start, end); 11981 11982 Py_DECREF(substring); 11983 11984 if (result == -2) 11985 return NULL; 11986 11987 return PyLong_FromSsize_t(result); 11988} 11989 11990PyDoc_STRVAR(rindex__doc__, 11991 "S.rindex(sub[, start[, end]]) -> int\n\ 11992\n\ 11993Like S.rfind() but raise ValueError when the substring is not found."); 11994 11995static PyObject * 11996unicode_rindex(PyObject *self, PyObject *args) 11997{ 11998 PyObject *substring; 11999 Py_ssize_t start; 12000 Py_ssize_t end; 12001 Py_ssize_t result; 12002 12003 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12004 &start, &end)) 12005 return NULL; 12006 12007 if (PyUnicode_READY(self) == -1) 12008 return NULL; 12009 if (PyUnicode_READY(substring) == -1) 12010 return NULL; 12011 12012 result = any_find_slice(-1, self, substring, start, end); 12013 12014 Py_DECREF(substring); 12015 12016 if (result == -2) 12017 return NULL; 12018 12019 if (result < 0) { 12020 PyErr_SetString(PyExc_ValueError, "substring not found"); 12021 return NULL; 12022 } 12023 12024 return PyLong_FromSsize_t(result); 12025} 12026 12027PyDoc_STRVAR(rjust__doc__, 12028 "S.rjust(width[, fillchar]) -> str\n\ 12029\n\ 12030Return S right-justified in a string of length width. 
Padding is\n\ 12031done using the specified fill character (default is a space)."); 12032 12033static PyObject * 12034unicode_rjust(PyObject *self, PyObject *args) 12035{ 12036 Py_ssize_t width; 12037 Py_UCS4 fillchar = ' '; 12038 12039 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12040 return NULL; 12041 12042 if (PyUnicode_READY(self) == -1) 12043 return NULL; 12044 12045 if (PyUnicode_GET_LENGTH(self) >= width) 12046 return unicode_result_unchanged(self); 12047 12048 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12049} 12050 12051PyObject * 12052PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12053{ 12054 PyObject *result; 12055 12056 s = PyUnicode_FromObject(s); 12057 if (s == NULL) 12058 return NULL; 12059 if (sep != NULL) { 12060 sep = PyUnicode_FromObject(sep); 12061 if (sep == NULL) { 12062 Py_DECREF(s); 12063 return NULL; 12064 } 12065 } 12066 12067 result = split(s, sep, maxsplit); 12068 12069 Py_DECREF(s); 12070 Py_XDECREF(sep); 12071 return result; 12072} 12073 12074PyDoc_STRVAR(split__doc__, 12075 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12076\n\ 12077Return a list of the words in S, using sep as the\n\ 12078delimiter string. If maxsplit is given, at most maxsplit\n\ 12079splits are done. 
If sep is not specified or is None, any\n\ 12080whitespace string is a separator and empty strings are\n\ 12081removed from the result."); 12082 12083static PyObject* 12084unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12085{ 12086 static char *kwlist[] = {"sep", "maxsplit", 0}; 12087 PyObject *substring = Py_None; 12088 Py_ssize_t maxcount = -1; 12089 12090 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12091 kwlist, &substring, &maxcount)) 12092 return NULL; 12093 12094 if (substring == Py_None) 12095 return split(self, NULL, maxcount); 12096 else if (PyUnicode_Check(substring)) 12097 return split(self, substring, maxcount); 12098 else 12099 return PyUnicode_Split(self, substring, maxcount); 12100} 12101 12102PyObject * 12103PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12104{ 12105 PyObject* str_obj; 12106 PyObject* sep_obj; 12107 PyObject* out; 12108 int kind1, kind2, kind; 12109 void *buf1 = NULL, *buf2 = NULL; 12110 Py_ssize_t len1, len2; 12111 12112 str_obj = PyUnicode_FromObject(str_in); 12113 if (!str_obj) 12114 return NULL; 12115 sep_obj = PyUnicode_FromObject(sep_in); 12116 if (!sep_obj) { 12117 Py_DECREF(str_obj); 12118 return NULL; 12119 } 12120 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12121 Py_DECREF(sep_obj); 12122 Py_DECREF(str_obj); 12123 return NULL; 12124 } 12125 12126 kind1 = PyUnicode_KIND(str_obj); 12127 kind2 = PyUnicode_KIND(sep_obj); 12128 kind = Py_MAX(kind1, kind2); 12129 buf1 = PyUnicode_DATA(str_obj); 12130 if (kind1 != kind) 12131 buf1 = _PyUnicode_AsKind(str_obj, kind); 12132 if (!buf1) 12133 goto onError; 12134 buf2 = PyUnicode_DATA(sep_obj); 12135 if (kind2 != kind) 12136 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12137 if (!buf2) 12138 goto onError; 12139 len1 = PyUnicode_GET_LENGTH(str_obj); 12140 len2 = PyUnicode_GET_LENGTH(sep_obj); 12141 12142 switch (PyUnicode_KIND(str_obj)) { 12143 case PyUnicode_1BYTE_KIND: 12144 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12145 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12146 else 12147 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12148 break; 12149 case PyUnicode_2BYTE_KIND: 12150 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12151 break; 12152 case PyUnicode_4BYTE_KIND: 12153 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12154 break; 12155 default: 12156 assert(0); 12157 out = 0; 12158 } 12159 12160 Py_DECREF(sep_obj); 12161 Py_DECREF(str_obj); 12162 if (kind1 != kind) 12163 PyMem_Free(buf1); 12164 if (kind2 != kind) 12165 PyMem_Free(buf2); 12166 12167 return out; 12168 onError: 12169 Py_DECREF(sep_obj); 12170 Py_DECREF(str_obj); 12171 if (kind1 != kind && buf1) 12172 PyMem_Free(buf1); 12173 if (kind2 != kind && buf2) 12174 PyMem_Free(buf2); 12175 return NULL; 12176} 12177 12178 12179PyObject * 12180PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12181{ 12182 PyObject* str_obj; 12183 PyObject* sep_obj; 12184 PyObject* out; 12185 int kind1, kind2, kind; 12186 void *buf1 = NULL, *buf2 = NULL; 12187 Py_ssize_t len1, len2; 12188 12189 str_obj = PyUnicode_FromObject(str_in); 12190 if (!str_obj) 12191 return NULL; 12192 sep_obj = PyUnicode_FromObject(sep_in); 12193 if (!sep_obj) { 12194 Py_DECREF(str_obj); 12195 return NULL; 12196 } 12197 12198 kind1 = PyUnicode_KIND(str_in); 12199 kind2 = PyUnicode_KIND(sep_obj); 12200 kind = Py_MAX(kind1, kind2); 12201 buf1 = PyUnicode_DATA(str_in); 12202 if (kind1 != kind) 12203 buf1 = _PyUnicode_AsKind(str_in, kind); 12204 if (!buf1) 12205 goto onError; 12206 buf2 = PyUnicode_DATA(sep_obj); 12207 if (kind2 != kind) 12208 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12209 if (!buf2) 12210 goto onError; 12211 len1 = PyUnicode_GET_LENGTH(str_obj); 12212 len2 = PyUnicode_GET_LENGTH(sep_obj); 12213 12214 switch (PyUnicode_KIND(str_in)) { 12215 case PyUnicode_1BYTE_KIND: 12216 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12217 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12218 else 12219 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12220 break; 12221 case PyUnicode_2BYTE_KIND: 12222 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12223 break; 12224 case PyUnicode_4BYTE_KIND: 12225 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12226 break; 12227 default: 12228 assert(0); 12229 out = 0; 12230 } 12231 12232 Py_DECREF(sep_obj); 12233 Py_DECREF(str_obj); 12234 if (kind1 != kind) 12235 PyMem_Free(buf1); 12236 if (kind2 != kind) 12237 PyMem_Free(buf2); 12238 12239 return out; 12240 onError: 12241 Py_DECREF(sep_obj); 12242 Py_DECREF(str_obj); 12243 if (kind1 != kind && buf1) 12244 PyMem_Free(buf1); 12245 if (kind2 != kind && buf2) 12246 PyMem_Free(buf2); 12247 return NULL; 12248} 12249 12250PyDoc_STRVAR(partition__doc__, 12251 "S.partition(sep) -> (head, sep, tail)\n\ 12252\n\ 12253Search for the separator sep in S, and return the part before it,\n\ 12254the separator itself, and the part after it. If the separator is not\n\ 12255found, return S and two empty strings."); 12256 12257static PyObject* 12258unicode_partition(PyObject *self, PyObject *separator) 12259{ 12260 return PyUnicode_Partition(self, separator); 12261} 12262 12263PyDoc_STRVAR(rpartition__doc__, 12264 "S.rpartition(sep) -> (head, sep, tail)\n\ 12265\n\ 12266Search for the separator sep in S, starting at the end of S, and return\n\ 12267the part before it, the separator itself, and the part after it. 
If the\n\ 12268separator is not found, return two empty strings and S."); 12269 12270static PyObject* 12271unicode_rpartition(PyObject *self, PyObject *separator) 12272{ 12273 return PyUnicode_RPartition(self, separator); 12274} 12275 12276PyObject * 12277PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12278{ 12279 PyObject *result; 12280 12281 s = PyUnicode_FromObject(s); 12282 if (s == NULL) 12283 return NULL; 12284 if (sep != NULL) { 12285 sep = PyUnicode_FromObject(sep); 12286 if (sep == NULL) { 12287 Py_DECREF(s); 12288 return NULL; 12289 } 12290 } 12291 12292 result = rsplit(s, sep, maxsplit); 12293 12294 Py_DECREF(s); 12295 Py_XDECREF(sep); 12296 return result; 12297} 12298 12299PyDoc_STRVAR(rsplit__doc__, 12300 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12301\n\ 12302Return a list of the words in S, using sep as the\n\ 12303delimiter string, starting at the end of the string and\n\ 12304working to the front. If maxsplit is given, at most maxsplit\n\ 12305splits are done. 
If sep is not specified, any whitespace string\n\ 12306is a separator."); 12307 12308static PyObject* 12309unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12310{ 12311 static char *kwlist[] = {"sep", "maxsplit", 0}; 12312 PyObject *substring = Py_None; 12313 Py_ssize_t maxcount = -1; 12314 12315 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12316 kwlist, &substring, &maxcount)) 12317 return NULL; 12318 12319 if (substring == Py_None) 12320 return rsplit(self, NULL, maxcount); 12321 else if (PyUnicode_Check(substring)) 12322 return rsplit(self, substring, maxcount); 12323 else 12324 return PyUnicode_RSplit(self, substring, maxcount); 12325} 12326 12327PyDoc_STRVAR(splitlines__doc__, 12328 "S.splitlines([keepends]) -> list of strings\n\ 12329\n\ 12330Return a list of the lines in S, breaking at line boundaries.\n\ 12331Line breaks are not included in the resulting list unless keepends\n\ 12332is given and true."); 12333 12334static PyObject* 12335unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12336{ 12337 static char *kwlist[] = {"keepends", 0}; 12338 int keepends = 0; 12339 12340 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12341 kwlist, &keepends)) 12342 return NULL; 12343 12344 return PyUnicode_Splitlines(self, keepends); 12345} 12346 12347static 12348PyObject *unicode_str(PyObject *self) 12349{ 12350 return unicode_result_unchanged(self); 12351} 12352 12353PyDoc_STRVAR(swapcase__doc__, 12354 "S.swapcase() -> str\n\ 12355\n\ 12356Return a copy of S with uppercase characters converted to lowercase\n\ 12357and vice versa."); 12358 12359static PyObject* 12360unicode_swapcase(PyObject *self) 12361{ 12362 if (PyUnicode_READY(self) == -1) 12363 return NULL; 12364 return case_operation(self, do_swapcase); 12365} 12366 12367PyDoc_STRVAR(maketrans__doc__, 12368 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12369\n\ 12370Return a translation table usable for str.translate().\n\ 12371If there is only 
one argument, it must be a dictionary mapping Unicode\n\ 12372ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12373Character keys will be then converted to ordinals.\n\ 12374If there are two arguments, they must be strings of equal length, and\n\ 12375in the resulting dictionary, each character in x will be mapped to the\n\ 12376character at the same position in y. If there is a third argument, it\n\ 12377must be a string, whose characters will be mapped to None in the result."); 12378 12379static PyObject* 12380unicode_maketrans(PyObject *null, PyObject *args) 12381{ 12382 PyObject *x, *y = NULL, *z = NULL; 12383 PyObject *new = NULL, *key, *value; 12384 Py_ssize_t i = 0; 12385 int res; 12386 12387 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12388 return NULL; 12389 new = PyDict_New(); 12390 if (!new) 12391 return NULL; 12392 if (y != NULL) { 12393 int x_kind, y_kind, z_kind; 12394 void *x_data, *y_data, *z_data; 12395 12396 /* x must be a string too, of equal length */ 12397 if (!PyUnicode_Check(x)) { 12398 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12399 "be a string if there is a second argument"); 12400 goto err; 12401 } 12402 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12403 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12404 "arguments must have equal length"); 12405 goto err; 12406 } 12407 /* create entries for translating chars in x to those in y */ 12408 x_kind = PyUnicode_KIND(x); 12409 y_kind = PyUnicode_KIND(y); 12410 x_data = PyUnicode_DATA(x); 12411 y_data = PyUnicode_DATA(y); 12412 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12413 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12414 if (!key) 12415 goto err; 12416 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12417 if (!value) { 12418 Py_DECREF(key); 12419 goto err; 12420 } 12421 res = PyDict_SetItem(new, key, value); 12422 Py_DECREF(key); 12423 Py_DECREF(value); 12424 if (res < 
0) 12425 goto err; 12426 } 12427 /* create entries for deleting chars in z */ 12428 if (z != NULL) { 12429 z_kind = PyUnicode_KIND(z); 12430 z_data = PyUnicode_DATA(z); 12431 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 12432 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12433 if (!key) 12434 goto err; 12435 res = PyDict_SetItem(new, key, Py_None); 12436 Py_DECREF(key); 12437 if (res < 0) 12438 goto err; 12439 } 12440 } 12441 } else { 12442 int kind; 12443 void *data; 12444 12445 /* x must be a dict */ 12446 if (!PyDict_CheckExact(x)) { 12447 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12448 "to maketrans it must be a dict"); 12449 goto err; 12450 } 12451 /* copy entries into the new dict, converting string keys to int keys */ 12452 while (PyDict_Next(x, &i, &key, &value)) { 12453 if (PyUnicode_Check(key)) { 12454 /* convert string keys to integer keys */ 12455 PyObject *newkey; 12456 if (PyUnicode_GET_LENGTH(key) != 1) { 12457 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12458 "table must be of length 1"); 12459 goto err; 12460 } 12461 kind = PyUnicode_KIND(key); 12462 data = PyUnicode_DATA(key); 12463 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12464 if (!newkey) 12465 goto err; 12466 res = PyDict_SetItem(new, newkey, value); 12467 Py_DECREF(newkey); 12468 if (res < 0) 12469 goto err; 12470 } else if (PyLong_Check(key)) { 12471 /* just keep integer keys */ 12472 if (PyDict_SetItem(new, key, value) < 0) 12473 goto err; 12474 } else { 12475 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12476 "be strings or integers"); 12477 goto err; 12478 } 12479 } 12480 } 12481 return new; 12482 err: 12483 Py_DECREF(new); 12484 return NULL; 12485} 12486 12487PyDoc_STRVAR(translate__doc__, 12488 "S.translate(table) -> str\n\ 12489\n\ 12490Return a copy of the string S, where all characters have been mapped\n\ 12491through the given translation table, which must be a mapping of\n\ 12492Unicode 
ordinals to Unicode ordinals, strings, or None.\n\ 12493Unmapped characters are left untouched. Characters mapped to None\n\ 12494are deleted."); 12495 12496static PyObject* 12497unicode_translate(PyObject *self, PyObject *table) 12498{ 12499 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12500} 12501 12502PyDoc_STRVAR(upper__doc__, 12503 "S.upper() -> str\n\ 12504\n\ 12505Return a copy of S converted to uppercase."); 12506 12507static PyObject* 12508unicode_upper(PyObject *self) 12509{ 12510 if (PyUnicode_READY(self) == -1) 12511 return NULL; 12512 if (PyUnicode_IS_ASCII(self)) 12513 return ascii_upper_or_lower(self, 0); 12514 return case_operation(self, do_upper); 12515} 12516 12517PyDoc_STRVAR(zfill__doc__, 12518 "S.zfill(width) -> str\n\ 12519\n\ 12520Pad a numeric string S with zeros on the left, to fill a field\n\ 12521of the specified width. The string S is never truncated."); 12522 12523static PyObject * 12524unicode_zfill(PyObject *self, PyObject *args) 12525{ 12526 Py_ssize_t fill; 12527 PyObject *u; 12528 Py_ssize_t width; 12529 int kind; 12530 void *data; 12531 Py_UCS4 chr; 12532 12533 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12534 return NULL; 12535 12536 if (PyUnicode_READY(self) == -1) 12537 return NULL; 12538 12539 if (PyUnicode_GET_LENGTH(self) >= width) 12540 return unicode_result_unchanged(self); 12541 12542 fill = width - PyUnicode_GET_LENGTH(self); 12543 12544 u = pad(self, fill, 0, '0'); 12545 12546 if (u == NULL) 12547 return NULL; 12548 12549 kind = PyUnicode_KIND(u); 12550 data = PyUnicode_DATA(u); 12551 chr = PyUnicode_READ(kind, data, fill); 12552 12553 if (chr == '+' || chr == '-') { 12554 /* move sign to beginning of string */ 12555 PyUnicode_WRITE(kind, data, 0, chr); 12556 PyUnicode_WRITE(kind, data, fill, '0'); 12557 } 12558 12559 assert(_PyUnicode_CheckConsistency(u, 1)); 12560 return u; 12561} 12562 12563#if 0 12564static PyObject * 12565unicode__decimal2ascii(PyObject *self) 12566{ 12567 return 
PyUnicode_TransformDecimalAndSpaceToASCII(self); 12568} 12569#endif 12570 12571PyDoc_STRVAR(startswith__doc__, 12572 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12573\n\ 12574Return True if S starts with the specified prefix, False otherwise.\n\ 12575With optional start, test S beginning at that position.\n\ 12576With optional end, stop comparing S at that position.\n\ 12577prefix can also be a tuple of strings to try."); 12578 12579static PyObject * 12580unicode_startswith(PyObject *self, 12581 PyObject *args) 12582{ 12583 PyObject *subobj; 12584 PyObject *substring; 12585 Py_ssize_t start = 0; 12586 Py_ssize_t end = PY_SSIZE_T_MAX; 12587 int result; 12588 12589 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12590 return NULL; 12591 if (PyTuple_Check(subobj)) { 12592 Py_ssize_t i; 12593 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12594 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 12595 if (substring == NULL) 12596 return NULL; 12597 result = tailmatch(self, substring, start, end, -1); 12598 Py_DECREF(substring); 12599 if (result) { 12600 Py_RETURN_TRUE; 12601 } 12602 } 12603 /* nothing matched */ 12604 Py_RETURN_FALSE; 12605 } 12606 substring = PyUnicode_FromObject(subobj); 12607 if (substring == NULL) { 12608 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12609 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12610 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12611 return NULL; 12612 } 12613 result = tailmatch(self, substring, start, end, -1); 12614 Py_DECREF(substring); 12615 return PyBool_FromLong(result); 12616} 12617 12618 12619PyDoc_STRVAR(endswith__doc__, 12620 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12621\n\ 12622Return True if S ends with the specified suffix, False otherwise.\n\ 12623With optional start, test S beginning at that position.\n\ 12624With optional end, stop comparing S at that position.\n\ 12625suffix can also be a tuple of strings to try."); 12626 
12627static PyObject * 12628unicode_endswith(PyObject *self, 12629 PyObject *args) 12630{ 12631 PyObject *subobj; 12632 PyObject *substring; 12633 Py_ssize_t start = 0; 12634 Py_ssize_t end = PY_SSIZE_T_MAX; 12635 int result; 12636 12637 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12638 return NULL; 12639 if (PyTuple_Check(subobj)) { 12640 Py_ssize_t i; 12641 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12642 substring = PyUnicode_FromObject( 12643 PyTuple_GET_ITEM(subobj, i)); 12644 if (substring == NULL) 12645 return NULL; 12646 result = tailmatch(self, substring, start, end, +1); 12647 Py_DECREF(substring); 12648 if (result) { 12649 Py_RETURN_TRUE; 12650 } 12651 } 12652 Py_RETURN_FALSE; 12653 } 12654 substring = PyUnicode_FromObject(subobj); 12655 if (substring == NULL) { 12656 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12657 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12658 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12659 return NULL; 12660 } 12661 result = tailmatch(self, substring, start, end, +1); 12662 Py_DECREF(substring); 12663 return PyBool_FromLong(result); 12664} 12665 12666Py_LOCAL_INLINE(void) 12667_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 12668{ 12669 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 12670 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 12671 writer->data = PyUnicode_DATA(writer->buffer); 12672 writer->kind = PyUnicode_KIND(writer->buffer); 12673} 12674 12675void 12676_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) 12677{ 12678 memset(writer, 0, sizeof(*writer)); 12679#ifdef Py_DEBUG 12680 writer->kind = 5; /* invalid kind */ 12681#endif 12682 writer->min_length = Py_MAX(min_length, 100); 12683 writer->overallocate = (min_length > 0); 12684} 12685 12686int 12687_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 12688 Py_ssize_t length, Py_UCS4 maxchar) 12689{ 12690 Py_ssize_t newlen; 12691 PyObject *newbuffer; 
12692 12693 assert(length > 0); 12694 12695 if (length > PY_SSIZE_T_MAX - writer->pos) { 12696 PyErr_NoMemory(); 12697 return -1; 12698 } 12699 newlen = writer->pos + length; 12700 12701 if (writer->buffer == NULL) { 12702 if (writer->overallocate) { 12703 /* overallocate 25% to limit the number of resize */ 12704 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) 12705 newlen += newlen / 4; 12706 if (newlen < writer->min_length) 12707 newlen = writer->min_length; 12708 } 12709 writer->buffer = PyUnicode_New(newlen, maxchar); 12710 if (writer->buffer == NULL) 12711 return -1; 12712 _PyUnicodeWriter_Update(writer); 12713 return 0; 12714 } 12715 12716 if (newlen > writer->size) { 12717 if (writer->overallocate) { 12718 /* overallocate 25% to limit the number of resize */ 12719 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) 12720 newlen += newlen / 4; 12721 if (newlen < writer->min_length) 12722 newlen = writer->min_length; 12723 } 12724 12725 if (maxchar > writer->maxchar || writer->readonly) { 12726 /* resize + widen */ 12727 newbuffer = PyUnicode_New(newlen, maxchar); 12728 if (newbuffer == NULL) 12729 return -1; 12730 _PyUnicode_FastCopyCharacters(newbuffer, 0, 12731 writer->buffer, 0, writer->pos); 12732 Py_DECREF(writer->buffer); 12733 writer->readonly = 0; 12734 } 12735 else { 12736 newbuffer = resize_compact(writer->buffer, newlen); 12737 if (newbuffer == NULL) 12738 return -1; 12739 } 12740 writer->buffer = newbuffer; 12741 _PyUnicodeWriter_Update(writer); 12742 } 12743 else if (maxchar > writer->maxchar) { 12744 assert(!writer->readonly); 12745 newbuffer = PyUnicode_New(writer->size, maxchar); 12746 if (newbuffer == NULL) 12747 return -1; 12748 _PyUnicode_FastCopyCharacters(newbuffer, 0, 12749 writer->buffer, 0, writer->pos); 12750 Py_DECREF(writer->buffer); 12751 writer->buffer = newbuffer; 12752 _PyUnicodeWriter_Update(writer); 12753 } 12754 return 0; 12755} 12756 12757int 12758_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 12759{ 12760 Py_UCS4 
maxchar; 12761 Py_ssize_t len; 12762 12763 if (PyUnicode_READY(str) == -1) 12764 return -1; 12765 len = PyUnicode_GET_LENGTH(str); 12766 if (len == 0) 12767 return 0; 12768 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 12769 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 12770 if (writer->buffer == NULL && !writer->overallocate) { 12771 Py_INCREF(str); 12772 writer->buffer = str; 12773 _PyUnicodeWriter_Update(writer); 12774 writer->readonly = 1; 12775 writer->size = 0; 12776 writer->pos += len; 12777 return 0; 12778 } 12779 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 12780 return -1; 12781 } 12782 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 12783 str, 0, len); 12784 writer->pos += len; 12785 return 0; 12786} 12787 12788int 12789_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len) 12790{ 12791 Py_UCS4 maxchar; 12792 12793 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 12794 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 12795 return -1; 12796 unicode_write_cstr(writer->buffer, writer->pos, str, len); 12797 writer->pos += len; 12798 return 0; 12799} 12800 12801PyObject * 12802_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 12803{ 12804 if (writer->pos == 0) { 12805 Py_XDECREF(writer->buffer); 12806 Py_INCREF(unicode_empty); 12807 return unicode_empty; 12808 } 12809 if (writer->readonly) { 12810 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos); 12811 return writer->buffer; 12812 } 12813 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 12814 PyObject *newbuffer; 12815 newbuffer = resize_compact(writer->buffer, writer->pos); 12816 if (newbuffer == NULL) { 12817 Py_DECREF(writer->buffer); 12818 return NULL; 12819 } 12820 writer->buffer = newbuffer; 12821 } 12822 assert(_PyUnicode_CheckConsistency(writer->buffer, 1)); 12823 return writer->buffer; 12824} 12825 12826void 12827_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 12828{ 
12829 Py_CLEAR(writer->buffer); 12830} 12831 12832#include "stringlib/unicode_format.h" 12833 12834PyDoc_STRVAR(format__doc__, 12835 "S.format(*args, **kwargs) -> str\n\ 12836\n\ 12837Return a formatted version of S, using substitutions from args and kwargs.\n\ 12838The substitutions are identified by braces ('{' and '}')."); 12839 12840PyDoc_STRVAR(format_map__doc__, 12841 "S.format_map(mapping) -> str\n\ 12842\n\ 12843Return a formatted version of S, using substitutions from mapping.\n\ 12844The substitutions are identified by braces ('{' and '}')."); 12845 12846static PyObject * 12847unicode__format__(PyObject* self, PyObject* args) 12848{ 12849 PyObject *format_spec; 12850 _PyUnicodeWriter writer; 12851 int ret; 12852 12853 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12854 return NULL; 12855 12856 if (PyUnicode_READY(self) == -1) 12857 return NULL; 12858 _PyUnicodeWriter_Init(&writer, 0); 12859 ret = _PyUnicode_FormatAdvancedWriter(&writer, 12860 self, format_spec, 0, 12861 PyUnicode_GET_LENGTH(format_spec)); 12862 if (ret == -1) { 12863 _PyUnicodeWriter_Dealloc(&writer); 12864 return NULL; 12865 } 12866 return _PyUnicodeWriter_Finish(&writer); 12867} 12868 12869PyDoc_STRVAR(p_format__doc__, 12870 "S.__format__(format_spec) -> str\n\ 12871\n\ 12872Return a formatted version of S as described by format_spec."); 12873 12874static PyObject * 12875unicode__sizeof__(PyObject *v) 12876{ 12877 Py_ssize_t size; 12878 12879 /* If it's a compact object, account for base structure + 12880 character data. */ 12881 if (PyUnicode_IS_COMPACT_ASCII(v)) 12882 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12883 else if (PyUnicode_IS_COMPACT(v)) 12884 size = sizeof(PyCompactUnicodeObject) + 12885 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 12886 else { 12887 /* If it is a two-block object, account for base object, and 12888 for character block if present. 
*/ 12889 size = sizeof(PyUnicodeObject); 12890 if (_PyUnicode_DATA_ANY(v)) 12891 size += (PyUnicode_GET_LENGTH(v) + 1) * 12892 PyUnicode_KIND(v); 12893 } 12894 /* If the wstr pointer is present, account for it unless it is shared 12895 with the data pointer. Check if the data is not shared. */ 12896 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12897 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12898 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12899 size += PyUnicode_UTF8_LENGTH(v) + 1; 12900 12901 return PyLong_FromSsize_t(size); 12902} 12903 12904PyDoc_STRVAR(sizeof__doc__, 12905 "S.__sizeof__() -> size of S in memory, in bytes"); 12906 12907static PyObject * 12908unicode_getnewargs(PyObject *v) 12909{ 12910 PyObject *copy = _PyUnicode_Copy(v); 12911 if (!copy) 12912 return NULL; 12913 return Py_BuildValue("(N)", copy); 12914} 12915 12916static PyMethodDef unicode_methods[] = { 12917 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 12918 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 12919 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 12920 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 12921 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 12922 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 12923 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 12924 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 12925 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 12926 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 12927 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 12928 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 12929 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 12930 {"index", (PyCFunction) unicode_index, METH_VARARGS, 
index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};

/* nb_remainder slot: implements "format % args".
   Returns NotImplemented when the left operand is not a str. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v))
        return PyUnicode_Format(v, w);
    Py_RETURN_NOTIMPLEMENTED;
}

static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};

static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};

/* mp_subscript slot: self[item] for an index or a slice.

   - index: negative values are offset by the length, then the lookup is
     delegated to unicode_getitem().
   - slice: empty slices return the shared empty string, a full step-1
     slice returns the string unchanged, other step-1 slices go through
     PyUnicode_Substring(), and extended slices are copied character by
     character into a new string sized for the largest character found.
   - anything else raises TypeError.

   Returns a new reference, or NULL with an exception set. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    Py_ssize_t start, stop, step, slicelength;
    Py_ssize_t src_pos, dst_pos;
    PyObject *result;
    void *src_data, *dest_data;
    int src_kind, dest_kind;
    Py_UCS4 max_char;

    if (PyUnicode_READY(self) == -1)
        return NULL;

    if (PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred())
            return NULL;
        if (index < 0)
            index += PyUnicode_GET_LENGTH(self);
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
        return NULL;
    }

    if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
                             &start, &stop, &step, &slicelength) < 0) {
        return NULL;
    }

    if (slicelength <= 0) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }
    if (step == 1) {
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self))
            return unicode_result_unchanged(self);
        return PyUnicode_Substring(self,
                                   start, start + slicelength);
    }

    /* General case: extended slice (step != 1) */
    src_kind = PyUnicode_KIND(self);
    src_data = PyUnicode_DATA(self);
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        /* Scan the selected characters for the largest one; stop early
           once the limit returned by kind_maxchar_limit() is reached. */
        Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        max_char = 0;
        for (src_pos = start, dst_pos = 0;
             dst_pos < slicelength;
             src_pos += step, dst_pos++) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, src_pos);
            if (ch > max_char) {
                max_char = ch;
                if (max_char >= kind_limit)
                    break;
            }
        }
    }
    result = PyUnicode_New(slicelength, max_char);
    if (result == NULL)
        return NULL;
    dest_kind = PyUnicode_KIND(result);
    dest_data = PyUnicode_DATA(result);

    for (src_pos = start, dst_pos = 0;
         dst_pos < slicelength;
         src_pos += step, dst_pos++) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, src_pos);
        PyUnicode_WRITE(dest_kind, dest_data, dst_pos, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}

static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};


/* Helpers for PyUnicode_Format() */

/* State shared by the helpers while one format string is processed. */
struct unicode_formatter_t {
    PyObject *args;         /* current argument source */
    int args_owned;         /* non-zero if 'args' must be DECREF'ed */
    Py_ssize_t arglen, argidx;
    PyObject *dict;         /* mapping for "%(name)s", or NULL */

    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    _PyUnicodeWriter writer;
};

/* One parsed conversion specification. */
struct unicode_format_arg_t {
    Py_UCS4 ch;             /* conversion character */
    int flags;              /* F_* bit mask */
    Py_ssize_t width;       /* -1 if not specified */
    int prec;               /* -1 if not specified */
    int sign;
};

/* Fetch the next argument to format.

   When arglen is negative, ctx->args itself is handed out (once);
   otherwise the next tuple item is returned via PyTuple_GetItem().
   Raises TypeError and returns NULL when no argument is left. */
static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t *ctx)
{
    Py_ssize_t argidx = ctx->argidx;

    if (argidx >= ctx->arglen) {
        PyErr_SetString(PyExc_TypeError,
                        "not enough arguments for format string");
        return NULL;
    }

    ctx->argidx++;
    if (ctx->arglen < 0)
        return ctx->args;
    return PyTuple_GetItem(ctx->args, argidx);
}

/* Format a float into the writer if the writer is not NULL, or into *p_output
   otherwise.

   Return 0 on success, raise an exception and return -1 on error.
*/ 13123static int 13124formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 13125 PyObject **p_output, 13126 _PyUnicodeWriter *writer) 13127{ 13128 char *p; 13129 double x; 13130 Py_ssize_t len; 13131 int prec; 13132 int dtoa_flags; 13133 13134 x = PyFloat_AsDouble(v); 13135 if (x == -1.0 && PyErr_Occurred()) 13136 return -1; 13137 13138 prec = arg->prec; 13139 if (prec < 0) 13140 prec = 6; 13141 13142 if (arg->flags & F_ALT) 13143 dtoa_flags = Py_DTSF_ALT; 13144 else 13145 dtoa_flags = 0; 13146 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 13147 if (p == NULL) 13148 return -1; 13149 len = strlen(p); 13150 if (writer) { 13151 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) { 13152 PyMem_Free(p); 13153 return -1; 13154 } 13155 unicode_write_cstr(writer->buffer, writer->pos, p, len); 13156 writer->pos += len; 13157 } 13158 else 13159 *p_output = _PyUnicode_FromASCII(p, len); 13160 PyMem_Free(p); 13161 return 0; 13162} 13163 13164/* formatlong() emulates the format codes d, u, o, x and X, and 13165 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13166 * Python's regular ints. 13167 * Return value: a new PyUnicodeObject*, or NULL if error. 13168 * The output string is of the form 13169 * "-"? ("0x" | "0X")? digit+ 13170 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13171 * set in flags. The case of hex digits will be correct, 13172 * There will be at least prec digits, zero-filled on the left if 13173 * necessary to get that many. 13174 * val object to be converted 13175 * flags bitmask of format flags; only F_ALT is looked at 13176 * prec minimum number of digits; 0-fill on left if needed 13177 * type a character in [duoxX]; u acts the same as d 13178 * 13179 * CAUTION: o, x and X conversions on regular ints can never 13180 * produce a '-' sign, but can for Python's unbounded ints. 
13181 */ 13182static PyObject* 13183formatlong(PyObject *val, struct unicode_format_arg_t *arg) 13184{ 13185 PyObject *result = NULL; 13186 char *buf; 13187 Py_ssize_t i; 13188 int sign; /* 1 if '-', else 0 */ 13189 int len; /* number of characters */ 13190 Py_ssize_t llen; 13191 int numdigits; /* len == numnondigits + numdigits */ 13192 int numnondigits = 0; 13193 int prec = arg->prec; 13194 int type = arg->ch; 13195 13196 /* Avoid exceeding SSIZE_T_MAX */ 13197 if (prec > INT_MAX-3) { 13198 PyErr_SetString(PyExc_OverflowError, 13199 "precision too large"); 13200 return NULL; 13201 } 13202 13203 assert(PyLong_Check(val)); 13204 13205 switch (type) { 13206 default: 13207 assert(!"'type' not in [diuoxX]"); 13208 case 'd': 13209 case 'i': 13210 case 'u': 13211 /* Special-case boolean: we want 0/1 */ 13212 if (PyBool_Check(val)) 13213 result = PyNumber_ToBase(val, 10); 13214 else 13215 result = Py_TYPE(val)->tp_str(val); 13216 break; 13217 case 'o': 13218 numnondigits = 2; 13219 result = PyNumber_ToBase(val, 8); 13220 break; 13221 case 'x': 13222 case 'X': 13223 numnondigits = 2; 13224 result = PyNumber_ToBase(val, 16); 13225 break; 13226 } 13227 if (!result) 13228 return NULL; 13229 13230 assert(unicode_modifiable(result)); 13231 assert(PyUnicode_IS_READY(result)); 13232 assert(PyUnicode_IS_ASCII(result)); 13233 13234 /* To modify the string in-place, there can only be one reference. 
*/ 13235 if (Py_REFCNT(result) != 1) { 13236 PyErr_BadInternalCall(); 13237 return NULL; 13238 } 13239 buf = PyUnicode_DATA(result); 13240 llen = PyUnicode_GET_LENGTH(result); 13241 if (llen > INT_MAX) { 13242 PyErr_SetString(PyExc_ValueError, 13243 "string too large in _PyBytes_FormatLong"); 13244 return NULL; 13245 } 13246 len = (int)llen; 13247 sign = buf[0] == '-'; 13248 numnondigits += sign; 13249 numdigits = len - numnondigits; 13250 assert(numdigits > 0); 13251 13252 /* Get rid of base marker unless F_ALT */ 13253 if (((arg->flags & F_ALT) == 0 && 13254 (type == 'o' || type == 'x' || type == 'X'))) { 13255 assert(buf[sign] == '0'); 13256 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 13257 buf[sign+1] == 'o'); 13258 numnondigits -= 2; 13259 buf += 2; 13260 len -= 2; 13261 if (sign) 13262 buf[0] = '-'; 13263 assert(len == numnondigits + numdigits); 13264 assert(numdigits > 0); 13265 } 13266 13267 /* Fill with leading zeroes to meet minimum width. */ 13268 if (prec > numdigits) { 13269 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 13270 numnondigits + prec); 13271 char *b1; 13272 if (!r1) { 13273 Py_DECREF(result); 13274 return NULL; 13275 } 13276 b1 = PyBytes_AS_STRING(r1); 13277 for (i = 0; i < numnondigits; ++i) 13278 *b1++ = *buf++; 13279 for (i = 0; i < prec - numdigits; i++) 13280 *b1++ = '0'; 13281 for (i = 0; i < numdigits; i++) 13282 *b1++ = *buf++; 13283 *b1 = '\0'; 13284 Py_DECREF(result); 13285 result = r1; 13286 buf = PyBytes_AS_STRING(result); 13287 len = numnondigits + prec; 13288 } 13289 13290 /* Fix up case for hex conversions. */ 13291 if (type == 'X') { 13292 /* Need to convert all lower case letters to upper case. 13293 and need to convert 0x to 0X (and -0x to -0X). 
*/ 13294 for (i = 0; i < len; i++) 13295 if (buf[i] >= 'a' && buf[i] <= 'x') 13296 buf[i] -= 'a'-'A'; 13297 } 13298 if (!PyUnicode_Check(result) 13299 || buf != PyUnicode_DATA(result)) { 13300 PyObject *unicode; 13301 unicode = _PyUnicode_FromASCII(buf, len); 13302 Py_DECREF(result); 13303 result = unicode; 13304 } 13305 else if (len != PyUnicode_GET_LENGTH(result)) { 13306 if (PyUnicode_Resize(&result, len) < 0) 13307 Py_CLEAR(result); 13308 } 13309 return result; 13310} 13311 13312/* Format an integer. 13313 * Return 1 if the number has been formatted into the writer, 13314 * 0 if the number has been formatted into *p_output 13315 * -1 and raise an exception on error */ 13316static int 13317mainformatlong(PyObject *v, 13318 struct unicode_format_arg_t *arg, 13319 PyObject **p_output, 13320 _PyUnicodeWriter *writer) 13321{ 13322 PyObject *iobj, *res; 13323 char type = (char)arg->ch; 13324 13325 if (!PyNumber_Check(v)) 13326 goto wrongtype; 13327 13328 if (!PyLong_Check(v)) { 13329 iobj = PyNumber_Long(v); 13330 if (iobj == NULL) { 13331 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13332 goto wrongtype; 13333 return -1; 13334 } 13335 assert(PyLong_Check(iobj)); 13336 } 13337 else { 13338 iobj = v; 13339 Py_INCREF(iobj); 13340 } 13341 13342 if (PyLong_CheckExact(v) 13343 && arg->width == -1 && arg->prec == -1 13344 && !(arg->flags & (F_SIGN | F_BLANK)) 13345 && type != 'X') 13346 { 13347 /* Fast path */ 13348 int alternate = arg->flags & F_ALT; 13349 int base; 13350 13351 switch(type) 13352 { 13353 default: 13354 assert(0 && "'type' not in [diuoxX]"); 13355 case 'd': 13356 case 'i': 13357 case 'u': 13358 base = 10; 13359 break; 13360 case 'o': 13361 base = 8; 13362 break; 13363 case 'x': 13364 case 'X': 13365 base = 16; 13366 break; 13367 } 13368 13369 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 13370 Py_DECREF(iobj); 13371 return -1; 13372 } 13373 Py_DECREF(iobj); 13374 return 1; 13375 } 13376 13377 res = formatlong(iobj, arg); 13378 
Py_DECREF(iobj); 13379 if (res == NULL) 13380 return -1; 13381 *p_output = res; 13382 return 0; 13383 13384wrongtype: 13385 PyErr_Format(PyExc_TypeError, 13386 "%%%c format: a number is required, " 13387 "not %.200s", 13388 type, Py_TYPE(v)->tp_name); 13389 return -1; 13390} 13391 13392static Py_UCS4 13393formatchar(PyObject *v) 13394{ 13395 /* presume that the buffer is at least 3 characters long */ 13396 if (PyUnicode_Check(v)) { 13397 if (PyUnicode_GET_LENGTH(v) == 1) { 13398 return PyUnicode_READ_CHAR(v, 0); 13399 } 13400 goto onError; 13401 } 13402 else { 13403 /* Integer input truncated to a character */ 13404 long x; 13405 x = PyLong_AsLong(v); 13406 if (x == -1 && PyErr_Occurred()) 13407 goto onError; 13408 13409 if (x < 0 || x > MAX_UNICODE) { 13410 PyErr_SetString(PyExc_OverflowError, 13411 "%c arg not in range(0x110000)"); 13412 return (Py_UCS4) -1; 13413 } 13414 13415 return (Py_UCS4) x; 13416 } 13417 13418 onError: 13419 PyErr_SetString(PyExc_TypeError, 13420 "%c requires int or char"); 13421 return (Py_UCS4) -1; 13422} 13423 13424/* Parse options of an argument: flags, width, precision. 13425 Handle also "%(name)" syntax. 13426 13427 Return 0 if the argument has been formatted into arg->str. 13428 Return 1 if the argument has been written into ctx->writer, 13429 Raise an exception and return -1 on error. */ 13430static int 13431unicode_format_arg_parse(struct unicode_formatter_t *ctx, 13432 struct unicode_format_arg_t *arg) 13433{ 13434#define FORMAT_READ(ctx) \ 13435 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 13436 13437 PyObject *v; 13438 13439 arg->ch = FORMAT_READ(ctx); 13440 if (arg->ch == '(') { 13441 /* Get argument value from a dictionary. Example: "%(name)s". 
*/ 13442 Py_ssize_t keystart; 13443 Py_ssize_t keylen; 13444 PyObject *key; 13445 int pcount = 1; 13446 13447 if (ctx->dict == NULL) { 13448 PyErr_SetString(PyExc_TypeError, 13449 "format requires a mapping"); 13450 return -1; 13451 } 13452 ++ctx->fmtpos; 13453 --ctx->fmtcnt; 13454 keystart = ctx->fmtpos; 13455 /* Skip over balanced parentheses */ 13456 while (pcount > 0 && --ctx->fmtcnt >= 0) { 13457 arg->ch = FORMAT_READ(ctx); 13458 if (arg->ch == ')') 13459 --pcount; 13460 else if (arg->ch == '(') 13461 ++pcount; 13462 ctx->fmtpos++; 13463 } 13464 keylen = ctx->fmtpos - keystart - 1; 13465 if (ctx->fmtcnt < 0 || pcount > 0) { 13466 PyErr_SetString(PyExc_ValueError, 13467 "incomplete format key"); 13468 return -1; 13469 } 13470 key = PyUnicode_Substring(ctx->fmtstr, 13471 keystart, keystart + keylen); 13472 if (key == NULL) 13473 return -1; 13474 if (ctx->args_owned) { 13475 Py_DECREF(ctx->args); 13476 ctx->args_owned = 0; 13477 } 13478 ctx->args = PyObject_GetItem(ctx->dict, key); 13479 Py_DECREF(key); 13480 if (ctx->args == NULL) 13481 return -1; 13482 ctx->args_owned = 1; 13483 ctx->arglen = -1; 13484 ctx->argidx = -2; 13485 } 13486 13487 /* Parse flags. Example: "%+i" => flags=F_SIGN. */ 13488 arg->flags = 0; 13489 while (--ctx->fmtcnt >= 0) { 13490 arg->ch = FORMAT_READ(ctx); 13491 ctx->fmtpos++; 13492 switch (arg->ch) { 13493 case '-': arg->flags |= F_LJUST; continue; 13494 case '+': arg->flags |= F_SIGN; continue; 13495 case ' ': arg->flags |= F_BLANK; continue; 13496 case '#': arg->flags |= F_ALT; continue; 13497 case '0': arg->flags |= F_ZERO; continue; 13498 } 13499 break; 13500 } 13501 13502 /* Parse width. 
Example: "%10s" => width=10 */ 13503 arg->width = -1; 13504 if (arg->ch == '*') { 13505 v = unicode_format_getnextarg(ctx); 13506 if (v == NULL) 13507 return -1; 13508 if (!PyLong_Check(v)) { 13509 PyErr_SetString(PyExc_TypeError, 13510 "* wants int"); 13511 return -1; 13512 } 13513 arg->width = PyLong_AsLong(v); 13514 if (arg->width == -1 && PyErr_Occurred()) 13515 return -1; 13516 if (arg->width < 0) { 13517 arg->flags |= F_LJUST; 13518 arg->width = -arg->width; 13519 } 13520 if (--ctx->fmtcnt >= 0) { 13521 arg->ch = FORMAT_READ(ctx); 13522 ctx->fmtpos++; 13523 } 13524 } 13525 else if (arg->ch >= '0' && arg->ch <= '9') { 13526 arg->width = arg->ch - '0'; 13527 while (--ctx->fmtcnt >= 0) { 13528 arg->ch = FORMAT_READ(ctx); 13529 ctx->fmtpos++; 13530 if (arg->ch < '0' || arg->ch > '9') 13531 break; 13532 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 13533 mixing signed and unsigned comparison. Since arg->ch is between 13534 '0' and '9', casting to int is safe. */ 13535 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 13536 PyErr_SetString(PyExc_ValueError, 13537 "width too big"); 13538 return -1; 13539 } 13540 arg->width = arg->width*10 + (arg->ch - '0'); 13541 } 13542 } 13543 13544 /* Parse precision. 
Example: "%.3f" => prec=3 */ 13545 arg->prec = -1; 13546 if (arg->ch == '.') { 13547 arg->prec = 0; 13548 if (--ctx->fmtcnt >= 0) { 13549 arg->ch = FORMAT_READ(ctx); 13550 ctx->fmtpos++; 13551 } 13552 if (arg->ch == '*') { 13553 v = unicode_format_getnextarg(ctx); 13554 if (v == NULL) 13555 return -1; 13556 if (!PyLong_Check(v)) { 13557 PyErr_SetString(PyExc_TypeError, 13558 "* wants int"); 13559 return -1; 13560 } 13561 arg->prec = PyLong_AsLong(v); 13562 if (arg->prec == -1 && PyErr_Occurred()) 13563 return -1; 13564 if (arg->prec < 0) 13565 arg->prec = 0; 13566 if (--ctx->fmtcnt >= 0) { 13567 arg->ch = FORMAT_READ(ctx); 13568 ctx->fmtpos++; 13569 } 13570 } 13571 else if (arg->ch >= '0' && arg->ch <= '9') { 13572 arg->prec = arg->ch - '0'; 13573 while (--ctx->fmtcnt >= 0) { 13574 arg->ch = FORMAT_READ(ctx); 13575 ctx->fmtpos++; 13576 if (arg->ch < '0' || arg->ch > '9') 13577 break; 13578 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 13579 PyErr_SetString(PyExc_ValueError, 13580 "precision too big"); 13581 return -1; 13582 } 13583 arg->prec = arg->prec*10 + (arg->ch - '0'); 13584 } 13585 } 13586 } 13587 13588 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 13589 if (ctx->fmtcnt >= 0) { 13590 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 13591 if (--ctx->fmtcnt >= 0) { 13592 arg->ch = FORMAT_READ(ctx); 13593 ctx->fmtpos++; 13594 } 13595 } 13596 } 13597 if (ctx->fmtcnt < 0) { 13598 PyErr_SetString(PyExc_ValueError, 13599 "incomplete format"); 13600 return -1; 13601 } 13602 return 0; 13603 13604#undef FORMAT_READ 13605} 13606 13607/* Format one argument. Supported conversion specifiers: 13608 13609 - "s", "r", "a": any type 13610 - "i", "d", "u", "o", "x", "X": int 13611 - "e", "E", "f", "F", "g", "G": float 13612 - "c": int or str (1 character) 13613 13614 Return 0 if the argument has been formatted into *p_str, 13615 1 if the argument has been written into ctx->writer, 13616 -1 on error. 
*/ 13617static int 13618unicode_format_arg_format(struct unicode_formatter_t *ctx, 13619 struct unicode_format_arg_t *arg, 13620 PyObject **p_str) 13621{ 13622 PyObject *v; 13623 _PyUnicodeWriter *writer = &ctx->writer; 13624 13625 if (ctx->fmtcnt == 0) 13626 ctx->writer.overallocate = 0; 13627 13628 if (arg->ch == '%') { 13629 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1) 13630 return -1; 13631 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%'); 13632 writer->pos += 1; 13633 return 1; 13634 } 13635 13636 v = unicode_format_getnextarg(ctx); 13637 if (v == NULL) 13638 return -1; 13639 13640 arg->sign = 0; 13641 13642 switch (arg->ch) { 13643 13644 case 's': 13645 case 'r': 13646 case 'a': 13647 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { 13648 /* Fast path */ 13649 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) 13650 return -1; 13651 return 1; 13652 } 13653 13654 if (PyUnicode_CheckExact(v) && arg->ch == 's') { 13655 *p_str = v; 13656 Py_INCREF(*p_str); 13657 } 13658 else { 13659 if (arg->ch == 's') 13660 *p_str = PyObject_Str(v); 13661 else if (arg->ch == 'r') 13662 *p_str = PyObject_Repr(v); 13663 else 13664 *p_str = PyObject_ASCII(v); 13665 } 13666 break; 13667 13668 case 'i': 13669 case 'd': 13670 case 'u': 13671 case 'o': 13672 case 'x': 13673 case 'X': 13674 { 13675 int ret = mainformatlong(v, arg, p_str, writer); 13676 if (ret != 0) 13677 return ret; 13678 arg->sign = 1; 13679 break; 13680 } 13681 13682 case 'e': 13683 case 'E': 13684 case 'f': 13685 case 'F': 13686 case 'g': 13687 case 'G': 13688 if (arg->width == -1 && arg->prec == -1 13689 && !(arg->flags & (F_SIGN | F_BLANK))) 13690 { 13691 /* Fast path */ 13692 if (formatfloat(v, arg, NULL, writer) == -1) 13693 return -1; 13694 return 1; 13695 } 13696 13697 arg->sign = 1; 13698 if (formatfloat(v, arg, p_str, NULL) == -1) 13699 return -1; 13700 break; 13701 13702 case 'c': 13703 { 13704 Py_UCS4 ch = formatchar(v); 13705 if (ch == (Py_UCS4) -1) 
13706 return -1; 13707 if (arg->width == -1 && arg->prec == -1) { 13708 /* Fast path */ 13709 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1) 13710 return -1; 13711 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13712 writer->pos += 1; 13713 return 1; 13714 } 13715 *p_str = PyUnicode_FromOrdinal(ch); 13716 break; 13717 } 13718 13719 default: 13720 PyErr_Format(PyExc_ValueError, 13721 "unsupported format character '%c' (0x%x) " 13722 "at index %zd", 13723 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', 13724 (int)arg->ch, 13725 ctx->fmtpos - 1); 13726 return -1; 13727 } 13728 if (*p_str == NULL) 13729 return -1; 13730 assert (PyUnicode_Check(*p_str)); 13731 return 0; 13732} 13733 13734static int 13735unicode_format_arg_output(struct unicode_formatter_t *ctx, 13736 struct unicode_format_arg_t *arg, 13737 PyObject *str) 13738{ 13739 Py_ssize_t len; 13740 enum PyUnicode_Kind kind; 13741 void *pbuf; 13742 Py_ssize_t pindex; 13743 Py_UCS4 signchar; 13744 Py_ssize_t buflen; 13745 Py_UCS4 maxchar, bufmaxchar; 13746 Py_ssize_t sublen; 13747 _PyUnicodeWriter *writer = &ctx->writer; 13748 Py_UCS4 fill; 13749 13750 fill = ' '; 13751 if (arg->sign && arg->flags & F_ZERO) 13752 fill = '0'; 13753 13754 if (PyUnicode_READY(str) == -1) 13755 return -1; 13756 13757 len = PyUnicode_GET_LENGTH(str); 13758 if ((arg->width == -1 || arg->width <= len) 13759 && (arg->prec == -1 || arg->prec >= len) 13760 && !(arg->flags & (F_SIGN | F_BLANK))) 13761 { 13762 /* Fast path */ 13763 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) 13764 return -1; 13765 return 0; 13766 } 13767 13768 /* Truncate the string for "s", "r" and "a" formats 13769 if the precision is set */ 13770 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { 13771 if (arg->prec >= 0 && len > arg->prec) 13772 len = arg->prec; 13773 } 13774 13775 /* Adjust sign and width */ 13776 kind = PyUnicode_KIND(str); 13777 pbuf = PyUnicode_DATA(str); 13778 pindex = 0; 13779 signchar = '\0'; 13780 if (arg->sign) 
{ 13781 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 13782 if (ch == '-' || ch == '+') { 13783 signchar = ch; 13784 len--; 13785 pindex++; 13786 } 13787 else if (arg->flags & F_SIGN) 13788 signchar = '+'; 13789 else if (arg->flags & F_BLANK) 13790 signchar = ' '; 13791 else 13792 arg->sign = 0; 13793 } 13794 if (arg->width < len) 13795 arg->width = len; 13796 13797 /* Prepare the writer */ 13798 bufmaxchar = 127; 13799 if (!(arg->flags & F_LJUST)) { 13800 if (arg->sign) { 13801 if ((arg->width-1) > len) 13802 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); 13803 } 13804 else { 13805 if (arg->width > len) 13806 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); 13807 } 13808 } 13809 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); 13810 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar); 13811 buflen = arg->width; 13812 if (arg->sign && len == arg->width) 13813 buflen++; 13814 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1) 13815 return -1; 13816 13817 /* Write the sign if needed */ 13818 if (arg->sign) { 13819 if (fill != ' ') { 13820 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 13821 writer->pos += 1; 13822 } 13823 if (arg->width > len) 13824 arg->width--; 13825 } 13826 13827 /* Write the numeric prefix for "x", "X" and "o" formats 13828 if the alternate form is used. 13829 For example, write "0x" for the "%#x" format. 
*/ 13830 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 13831 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13832 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); 13833 if (fill != ' ') { 13834 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 13835 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 13836 writer->pos += 2; 13837 pindex += 2; 13838 } 13839 arg->width -= 2; 13840 if (arg->width < 0) 13841 arg->width = 0; 13842 len -= 2; 13843 } 13844 13845 /* Pad left with the fill character if needed */ 13846 if (arg->width > len && !(arg->flags & F_LJUST)) { 13847 sublen = arg->width - len; 13848 FILL(writer->kind, writer->data, fill, writer->pos, sublen); 13849 writer->pos += sublen; 13850 arg->width = len; 13851 } 13852 13853 /* If padding with spaces: write sign if needed and/or numeric prefix if 13854 the alternate form is used */ 13855 if (fill == ' ') { 13856 if (arg->sign) { 13857 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 13858 writer->pos += 1; 13859 } 13860 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 13861 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13862 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); 13863 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 13864 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 13865 writer->pos += 2; 13866 pindex += 2; 13867 } 13868 } 13869 13870 /* Write characters */ 13871 if (len) { 13872 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13873 str, pindex, len); 13874 writer->pos += len; 13875 } 13876 13877 /* Pad right with the fill character if needed */ 13878 if (arg->width > len) { 13879 sublen = arg->width - len; 13880 FILL(writer->kind, writer->data, ' ', writer->pos, sublen); 13881 writer->pos += sublen; 13882 } 13883 return 0; 13884} 13885 13886/* Helper of PyUnicode_Format(): format one arg. 
13887 Return 0 on success, raise an exception and return -1 on error. */ 13888static int 13889unicode_format_arg(struct unicode_formatter_t *ctx) 13890{ 13891 struct unicode_format_arg_t arg; 13892 PyObject *str; 13893 int ret; 13894 13895 ret = unicode_format_arg_parse(ctx, &arg); 13896 if (ret == -1) 13897 return -1; 13898 13899 ret = unicode_format_arg_format(ctx, &arg, &str); 13900 if (ret == -1) 13901 return -1; 13902 13903 if (ret != 1) { 13904 ret = unicode_format_arg_output(ctx, &arg, str); 13905 Py_DECREF(str); 13906 if (ret == -1) 13907 return -1; 13908 } 13909 13910 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') { 13911 PyErr_SetString(PyExc_TypeError, 13912 "not all arguments converted during string formatting"); 13913 return -1; 13914 } 13915 return 0; 13916} 13917 13918PyObject * 13919PyUnicode_Format(PyObject *format, PyObject *args) 13920{ 13921 struct unicode_formatter_t ctx; 13922 13923 if (format == NULL || args == NULL) { 13924 PyErr_BadInternalCall(); 13925 return NULL; 13926 } 13927 13928 ctx.fmtstr = PyUnicode_FromObject(format); 13929 if (ctx.fmtstr == NULL) 13930 return NULL; 13931 if (PyUnicode_READY(ctx.fmtstr) == -1) { 13932 Py_DECREF(ctx.fmtstr); 13933 return NULL; 13934 } 13935 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 13936 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 13937 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 13938 ctx.fmtpos = 0; 13939 13940 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100); 13941 13942 if (PyTuple_Check(args)) { 13943 ctx.arglen = PyTuple_Size(args); 13944 ctx.argidx = 0; 13945 } 13946 else { 13947 ctx.arglen = -1; 13948 ctx.argidx = -2; 13949 } 13950 ctx.args_owned = 0; 13951 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) 13952 ctx.dict = args; 13953 else 13954 ctx.dict = NULL; 13955 ctx.args = args; 13956 13957 while (--ctx.fmtcnt >= 0) { 13958 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 13959 Py_ssize_t nonfmtpos, sublen; 13960 
Py_UCS4 maxchar; 13961 13962 nonfmtpos = ctx.fmtpos++; 13963 while (ctx.fmtcnt >= 0 && 13964 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 13965 ctx.fmtpos++; 13966 ctx.fmtcnt--; 13967 } 13968 if (ctx.fmtcnt < 0) { 13969 ctx.fmtpos--; 13970 ctx.writer.overallocate = 0; 13971 } 13972 sublen = ctx.fmtpos - nonfmtpos; 13973 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr, 13974 nonfmtpos, nonfmtpos + sublen); 13975 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1) 13976 goto onError; 13977 13978 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos, 13979 ctx.fmtstr, nonfmtpos, sublen); 13980 ctx.writer.pos += sublen; 13981 } 13982 else { 13983 ctx.fmtpos++; 13984 if (unicode_format_arg(&ctx) == -1) 13985 goto onError; 13986 } 13987 } 13988 13989 if (ctx.argidx < ctx.arglen && !ctx.dict) { 13990 PyErr_SetString(PyExc_TypeError, 13991 "not all arguments converted during string formatting"); 13992 goto onError; 13993 } 13994 13995 if (ctx.args_owned) { 13996 Py_DECREF(ctx.args); 13997 } 13998 Py_DECREF(ctx.fmtstr); 13999 return _PyUnicodeWriter_Finish(&ctx.writer); 14000 14001 onError: 14002 Py_DECREF(ctx.fmtstr); 14003 _PyUnicodeWriter_Dealloc(&ctx.writer); 14004 if (ctx.args_owned) { 14005 Py_DECREF(ctx.args); 14006 } 14007 return NULL; 14008} 14009 14010static PyObject * 14011unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 14012 14013static PyObject * 14014unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14015{ 14016 PyObject *x = NULL; 14017 static char *kwlist[] = {"object", "encoding", "errors", 0}; 14018 char *encoding = NULL; 14019 char *errors = NULL; 14020 14021 if (type != &PyUnicode_Type) 14022 return unicode_subtype_new(type, args, kwds); 14023 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 14024 kwlist, &x, &encoding, &errors)) 14025 return NULL; 14026 if (x == NULL) { 14027 Py_INCREF(unicode_empty); 14028 return unicode_empty; 14029 } 14030 if (encoding == NULL && 
errors == NULL) 14031 return PyObject_Str(x); 14032 else 14033 return PyUnicode_FromEncodedObject(x, encoding, errors); 14034} 14035 14036static PyObject * 14037unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14038{ 14039 PyObject *unicode, *self; 14040 Py_ssize_t length, char_size; 14041 int share_wstr, share_utf8; 14042 unsigned int kind; 14043 void *data; 14044 14045 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 14046 14047 unicode = unicode_new(&PyUnicode_Type, args, kwds); 14048 if (unicode == NULL) 14049 return NULL; 14050 assert(_PyUnicode_CHECK(unicode)); 14051 if (PyUnicode_READY(unicode) == -1) { 14052 Py_DECREF(unicode); 14053 return NULL; 14054 } 14055 14056 self = type->tp_alloc(type, 0); 14057 if (self == NULL) { 14058 Py_DECREF(unicode); 14059 return NULL; 14060 } 14061 kind = PyUnicode_KIND(unicode); 14062 length = PyUnicode_GET_LENGTH(unicode); 14063 14064 _PyUnicode_LENGTH(self) = length; 14065#ifdef Py_DEBUG 14066 _PyUnicode_HASH(self) = -1; 14067#else 14068 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14069#endif 14070 _PyUnicode_STATE(self).interned = 0; 14071 _PyUnicode_STATE(self).kind = kind; 14072 _PyUnicode_STATE(self).compact = 0; 14073 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14074 _PyUnicode_STATE(self).ready = 1; 14075 _PyUnicode_WSTR(self) = NULL; 14076 _PyUnicode_UTF8_LENGTH(self) = 0; 14077 _PyUnicode_UTF8(self) = NULL; 14078 _PyUnicode_WSTR_LENGTH(self) = 0; 14079 _PyUnicode_DATA_ANY(self) = NULL; 14080 14081 share_utf8 = 0; 14082 share_wstr = 0; 14083 if (kind == PyUnicode_1BYTE_KIND) { 14084 char_size = 1; 14085 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14086 share_utf8 = 1; 14087 } 14088 else if (kind == PyUnicode_2BYTE_KIND) { 14089 char_size = 2; 14090 if (sizeof(wchar_t) == 2) 14091 share_wstr = 1; 14092 } 14093 else { 14094 assert(kind == PyUnicode_4BYTE_KIND); 14095 char_size = 4; 14096 if (sizeof(wchar_t) == 4) 14097 share_wstr = 1; 14098 } 14099 14100 /* Ensure we 
won't overflow the length. */ 14101 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14102 PyErr_NoMemory(); 14103 goto onError; 14104 } 14105 data = PyObject_MALLOC((length + 1) * char_size); 14106 if (data == NULL) { 14107 PyErr_NoMemory(); 14108 goto onError; 14109 } 14110 14111 _PyUnicode_DATA_ANY(self) = data; 14112 if (share_utf8) { 14113 _PyUnicode_UTF8_LENGTH(self) = length; 14114 _PyUnicode_UTF8(self) = data; 14115 } 14116 if (share_wstr) { 14117 _PyUnicode_WSTR_LENGTH(self) = length; 14118 _PyUnicode_WSTR(self) = (wchar_t *)data; 14119 } 14120 14121 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14122 kind * (length + 1)); 14123 assert(_PyUnicode_CheckConsistency(self, 1)); 14124#ifdef Py_DEBUG 14125 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14126#endif 14127 Py_DECREF(unicode); 14128 return self; 14129 14130onError: 14131 Py_DECREF(unicode); 14132 Py_DECREF(self); 14133 return NULL; 14134} 14135 14136PyDoc_STRVAR(unicode_doc, 14137"str(object='') -> str\n\ 14138str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 14139\n\ 14140Create a new string object from the given object. 
If encoding or\n\ 14141errors is specified, then the object must expose a data buffer\n\ 14142that will be decoded using the given encoding and error handler.\n\ 14143Otherwise, returns the result of object.__str__() (if defined)\n\ 14144or repr(object).\n\ 14145encoding defaults to sys.getdefaultencoding().\n\ 14146errors defaults to 'strict'."); 14147 14148static PyObject *unicode_iter(PyObject *seq); 14149 14150PyTypeObject PyUnicode_Type = { 14151 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14152 "str", /* tp_name */ 14153 sizeof(PyUnicodeObject), /* tp_size */ 14154 0, /* tp_itemsize */ 14155 /* Slots */ 14156 (destructor)unicode_dealloc, /* tp_dealloc */ 14157 0, /* tp_print */ 14158 0, /* tp_getattr */ 14159 0, /* tp_setattr */ 14160 0, /* tp_reserved */ 14161 unicode_repr, /* tp_repr */ 14162 &unicode_as_number, /* tp_as_number */ 14163 &unicode_as_sequence, /* tp_as_sequence */ 14164 &unicode_as_mapping, /* tp_as_mapping */ 14165 (hashfunc) unicode_hash, /* tp_hash*/ 14166 0, /* tp_call*/ 14167 (reprfunc) unicode_str, /* tp_str */ 14168 PyObject_GenericGetAttr, /* tp_getattro */ 14169 0, /* tp_setattro */ 14170 0, /* tp_as_buffer */ 14171 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14172 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14173 unicode_doc, /* tp_doc */ 14174 0, /* tp_traverse */ 14175 0, /* tp_clear */ 14176 PyUnicode_RichCompare, /* tp_richcompare */ 14177 0, /* tp_weaklistoffset */ 14178 unicode_iter, /* tp_iter */ 14179 0, /* tp_iternext */ 14180 unicode_methods, /* tp_methods */ 14181 0, /* tp_members */ 14182 0, /* tp_getset */ 14183 &PyBaseObject_Type, /* tp_base */ 14184 0, /* tp_dict */ 14185 0, /* tp_descr_get */ 14186 0, /* tp_descr_set */ 14187 0, /* tp_dictoffset */ 14188 0, /* tp_init */ 14189 0, /* tp_alloc */ 14190 unicode_new, /* tp_new */ 14191 PyObject_Del, /* tp_free */ 14192}; 14193 14194/* Initialize the Unicode implementation */ 14195 14196int _PyUnicode_Init(void) 14197{ 14198 int i; 14199 14200 /* XXX - move this array to 
unicodectype.c ? */ 14201 Py_UCS2 linebreak[] = { 14202 0x000A, /* LINE FEED */ 14203 0x000D, /* CARRIAGE RETURN */ 14204 0x001C, /* FILE SEPARATOR */ 14205 0x001D, /* GROUP SEPARATOR */ 14206 0x001E, /* RECORD SEPARATOR */ 14207 0x0085, /* NEXT LINE */ 14208 0x2028, /* LINE SEPARATOR */ 14209 0x2029, /* PARAGRAPH SEPARATOR */ 14210 }; 14211 14212 /* Init the implementation */ 14213 unicode_empty = PyUnicode_New(0, 0); 14214 if (!unicode_empty) 14215 Py_FatalError("Can't create empty string"); 14216 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); 14217 14218 for (i = 0; i < 256; i++) 14219 unicode_latin1[i] = NULL; 14220 if (PyType_Ready(&PyUnicode_Type) < 0) 14221 Py_FatalError("Can't initialize 'unicode'"); 14222 14223 /* initialize the linebreak bloom filter */ 14224 bloom_linebreak = make_bloom_mask( 14225 PyUnicode_2BYTE_KIND, linebreak, 14226 Py_ARRAY_LENGTH(linebreak)); 14227 14228 PyType_Ready(&EncodingMapType); 14229 14230#ifdef HAVE_MBCS 14231 winver.dwOSVersionInfoSize = sizeof(winver); 14232 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 14233 PyErr_SetFromWindowsErr(0); 14234 return -1; 14235 } 14236#endif 14237 return 0; 14238} 14239 14240/* Finalize the Unicode implementation */ 14241 14242int 14243PyUnicode_ClearFreeList(void) 14244{ 14245 return 0; 14246} 14247 14248void 14249_PyUnicode_Fini(void) 14250{ 14251 int i; 14252 14253 Py_XDECREF(unicode_empty); 14254 unicode_empty = NULL; 14255 14256 for (i = 0; i < 256; i++) { 14257 if (unicode_latin1[i]) { 14258 Py_DECREF(unicode_latin1[i]); 14259 unicode_latin1[i] = NULL; 14260 } 14261 } 14262 _PyUnicode_ClearStaticStrings(); 14263 (void)PyUnicode_ClearFreeList(); 14264} 14265 14266void 14267PyUnicode_InternInPlace(PyObject **p) 14268{ 14269 register PyObject *s = *p; 14270 PyObject *t; 14271#ifdef Py_DEBUG 14272 assert(s != NULL); 14273 assert(_PyUnicode_CHECK(s)); 14274#else 14275 if (s == NULL || !PyUnicode_Check(s)) 14276 return; 14277#endif 14278 /* If it's a subclass, we don't really know 
what putting 14279 it in the interned dict might do. */ 14280 if (!PyUnicode_CheckExact(s)) 14281 return; 14282 if (PyUnicode_CHECK_INTERNED(s)) 14283 return; 14284 if (interned == NULL) { 14285 interned = PyDict_New(); 14286 if (interned == NULL) { 14287 PyErr_Clear(); /* Don't leave an exception */ 14288 return; 14289 } 14290 } 14291 /* It might be that the GetItem call fails even 14292 though the key is present in the dictionary, 14293 namely when this happens during a stack overflow. */ 14294 Py_ALLOW_RECURSION 14295 t = PyDict_GetItem(interned, s); 14296 Py_END_ALLOW_RECURSION 14297 14298 if (t) { 14299 Py_INCREF(t); 14300 Py_DECREF(*p); 14301 *p = t; 14302 return; 14303 } 14304 14305 PyThreadState_GET()->recursion_critical = 1; 14306 if (PyDict_SetItem(interned, s, s) < 0) { 14307 PyErr_Clear(); 14308 PyThreadState_GET()->recursion_critical = 0; 14309 return; 14310 } 14311 PyThreadState_GET()->recursion_critical = 0; 14312 /* The two references in interned are not counted by refcnt. 
14313 The deallocator will take care of this */ 14314 Py_REFCNT(s) -= 2; 14315 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 14316} 14317 14318void 14319PyUnicode_InternImmortal(PyObject **p) 14320{ 14321 PyUnicode_InternInPlace(p); 14322 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 14323 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 14324 Py_INCREF(*p); 14325 } 14326} 14327 14328PyObject * 14329PyUnicode_InternFromString(const char *cp) 14330{ 14331 PyObject *s = PyUnicode_FromString(cp); 14332 if (s == NULL) 14333 return NULL; 14334 PyUnicode_InternInPlace(&s); 14335 return s; 14336} 14337 14338void 14339_Py_ReleaseInternedUnicodeStrings(void) 14340{ 14341 PyObject *keys; 14342 PyObject *s; 14343 Py_ssize_t i, n; 14344 Py_ssize_t immortal_size = 0, mortal_size = 0; 14345 14346 if (interned == NULL || !PyDict_Check(interned)) 14347 return; 14348 keys = PyDict_Keys(interned); 14349 if (keys == NULL || !PyList_Check(keys)) { 14350 PyErr_Clear(); 14351 return; 14352 } 14353 14354 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 14355 detector, interned unicode strings are not forcibly deallocated; 14356 rather, we give them their stolen references back, and then clear 14357 and DECREF the interned dict. 
*/ 14358 14359 n = PyList_GET_SIZE(keys); 14360 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 14361 n); 14362 for (i = 0; i < n; i++) { 14363 s = PyList_GET_ITEM(keys, i); 14364 if (PyUnicode_READY(s) == -1) { 14365 assert(0 && "could not ready string"); 14366 fprintf(stderr, "could not ready string\n"); 14367 } 14368 switch (PyUnicode_CHECK_INTERNED(s)) { 14369 case SSTATE_NOT_INTERNED: 14370 /* XXX Shouldn't happen */ 14371 break; 14372 case SSTATE_INTERNED_IMMORTAL: 14373 Py_REFCNT(s) += 1; 14374 immortal_size += PyUnicode_GET_LENGTH(s); 14375 break; 14376 case SSTATE_INTERNED_MORTAL: 14377 Py_REFCNT(s) += 2; 14378 mortal_size += PyUnicode_GET_LENGTH(s); 14379 break; 14380 default: 14381 Py_FatalError("Inconsistent interned string state."); 14382 } 14383 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 14384 } 14385 fprintf(stderr, "total size of all interned strings: " 14386 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 14387 "mortal/immortal\n", mortal_size, immortal_size); 14388 Py_DECREF(keys); 14389 PyDict_Clear(interned); 14390 Py_DECREF(interned); 14391 interned = NULL; 14392} 14393 14394 14395/********************* Unicode Iterator **************************/ 14396 14397typedef struct { 14398 PyObject_HEAD 14399 Py_ssize_t it_index; 14400 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 14401} unicodeiterobject; 14402 14403static void 14404unicodeiter_dealloc(unicodeiterobject *it) 14405{ 14406 _PyObject_GC_UNTRACK(it); 14407 Py_XDECREF(it->it_seq); 14408 PyObject_GC_Del(it); 14409} 14410 14411static int 14412unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 14413{ 14414 Py_VISIT(it->it_seq); 14415 return 0; 14416} 14417 14418static PyObject * 14419unicodeiter_next(unicodeiterobject *it) 14420{ 14421 PyObject *seq, *item; 14422 14423 assert(it != NULL); 14424 seq = it->it_seq; 14425 if (seq == NULL) 14426 return NULL; 14427 assert(_PyUnicode_CHECK(seq)); 14428 14429 if (it->it_index < 
PyUnicode_GET_LENGTH(seq)) { 14430 int kind = PyUnicode_KIND(seq); 14431 void *data = PyUnicode_DATA(seq); 14432 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 14433 item = PyUnicode_FromOrdinal(chr); 14434 if (item != NULL) 14435 ++it->it_index; 14436 return item; 14437 } 14438 14439 Py_DECREF(seq); 14440 it->it_seq = NULL; 14441 return NULL; 14442} 14443 14444static PyObject * 14445unicodeiter_len(unicodeiterobject *it) 14446{ 14447 Py_ssize_t len = 0; 14448 if (it->it_seq) 14449 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 14450 return PyLong_FromSsize_t(len); 14451} 14452 14453PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 14454 14455static PyObject * 14456unicodeiter_reduce(unicodeiterobject *it) 14457{ 14458 if (it->it_seq != NULL) { 14459 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 14460 it->it_seq, it->it_index); 14461 } else { 14462 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 14463 if (u == NULL) 14464 return NULL; 14465 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 14466 } 14467} 14468 14469PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 14470 14471static PyObject * 14472unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 14473{ 14474 Py_ssize_t index = PyLong_AsSsize_t(state); 14475 if (index == -1 && PyErr_Occurred()) 14476 return NULL; 14477 if (index < 0) 14478 index = 0; 14479 it->it_index = index; 14480 Py_RETURN_NONE; 14481} 14482 14483PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 14484 14485static PyMethodDef unicodeiter_methods[] = { 14486 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 14487 length_hint_doc}, 14488 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 14489 reduce_doc}, 14490 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 14491 setstate_doc}, 14492 {NULL, NULL} /* sentinel */ 14493}; 14494 14495PyTypeObject PyUnicodeIter_Type = { 14496 
PyVarObject_HEAD_INIT(&PyType_Type, 0) 14497 "str_iterator", /* tp_name */ 14498 sizeof(unicodeiterobject), /* tp_basicsize */ 14499 0, /* tp_itemsize */ 14500 /* methods */ 14501 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 14502 0, /* tp_print */ 14503 0, /* tp_getattr */ 14504 0, /* tp_setattr */ 14505 0, /* tp_reserved */ 14506 0, /* tp_repr */ 14507 0, /* tp_as_number */ 14508 0, /* tp_as_sequence */ 14509 0, /* tp_as_mapping */ 14510 0, /* tp_hash */ 14511 0, /* tp_call */ 14512 0, /* tp_str */ 14513 PyObject_GenericGetAttr, /* tp_getattro */ 14514 0, /* tp_setattro */ 14515 0, /* tp_as_buffer */ 14516 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 14517 0, /* tp_doc */ 14518 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 14519 0, /* tp_clear */ 14520 0, /* tp_richcompare */ 14521 0, /* tp_weaklistoffset */ 14522 PyObject_SelfIter, /* tp_iter */ 14523 (iternextfunc)unicodeiter_next, /* tp_iternext */ 14524 unicodeiter_methods, /* tp_methods */ 14525 0, 14526}; 14527 14528static PyObject * 14529unicode_iter(PyObject *seq) 14530{ 14531 unicodeiterobject *it; 14532 14533 if (!PyUnicode_Check(seq)) { 14534 PyErr_BadInternalCall(); 14535 return NULL; 14536 } 14537 if (PyUnicode_READY(seq) == -1) 14538 return NULL; 14539 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 14540 if (it == NULL) 14541 return NULL; 14542 it->it_index = 0; 14543 Py_INCREF(seq); 14544 it->it_seq = seq; 14545 _PyObject_GC_TRACK(it); 14546 return (PyObject *)it; 14547} 14548 14549 14550size_t 14551Py_UNICODE_strlen(const Py_UNICODE *u) 14552{ 14553 int res = 0; 14554 while(*u++) 14555 res++; 14556 return res; 14557} 14558 14559Py_UNICODE* 14560Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 14561{ 14562 Py_UNICODE *u = s1; 14563 while ((*u++ = *s2++)); 14564 return s1; 14565} 14566 14567Py_UNICODE* 14568Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14569{ 14570 Py_UNICODE *u = s1; 14571 while ((*u++ = *s2++)) 14572 if (n-- == 0) 
14573 break; 14574 return s1; 14575} 14576 14577Py_UNICODE* 14578Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 14579{ 14580 Py_UNICODE *u1 = s1; 14581 u1 += Py_UNICODE_strlen(u1); 14582 Py_UNICODE_strcpy(u1, s2); 14583 return s1; 14584} 14585 14586int 14587Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 14588{ 14589 while (*s1 && *s2 && *s1 == *s2) 14590 s1++, s2++; 14591 if (*s1 && *s2) 14592 return (*s1 < *s2) ? -1 : +1; 14593 if (*s1) 14594 return 1; 14595 if (*s2) 14596 return -1; 14597 return 0; 14598} 14599 14600int 14601Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14602{ 14603 register Py_UNICODE u1, u2; 14604 for (; n != 0; n--) { 14605 u1 = *s1; 14606 u2 = *s2; 14607 if (u1 != u2) 14608 return (u1 < u2) ? -1 : +1; 14609 if (u1 == '\0') 14610 return 0; 14611 s1++; 14612 s2++; 14613 } 14614 return 0; 14615} 14616 14617Py_UNICODE* 14618Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 14619{ 14620 const Py_UNICODE *p; 14621 for (p = s; *p; p++) 14622 if (*p == c) 14623 return (Py_UNICODE*)p; 14624 return NULL; 14625} 14626 14627Py_UNICODE* 14628Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 14629{ 14630 const Py_UNICODE *p; 14631 p = s + Py_UNICODE_strlen(s); 14632 while (p != s) { 14633 p--; 14634 if (*p == c) 14635 return (Py_UNICODE*)p; 14636 } 14637 return NULL; 14638} 14639 14640Py_UNICODE* 14641PyUnicode_AsUnicodeCopy(PyObject *unicode) 14642{ 14643 Py_UNICODE *u, *copy; 14644 Py_ssize_t len, size; 14645 14646 if (!PyUnicode_Check(unicode)) { 14647 PyErr_BadArgument(); 14648 return NULL; 14649 } 14650 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 14651 if (u == NULL) 14652 return NULL; 14653 /* Ensure we won't overflow the size. 
*/ 14654 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 14655 PyErr_NoMemory(); 14656 return NULL; 14657 } 14658 size = len + 1; /* copy the null character */ 14659 size *= sizeof(Py_UNICODE); 14660 copy = PyMem_Malloc(size); 14661 if (copy == NULL) { 14662 PyErr_NoMemory(); 14663 return NULL; 14664 } 14665 memcpy(copy, u, size); 14666 return copy; 14667} 14668 14669/* A _string module, to export formatter_parser and formatter_field_name_split 14670 to the string.Formatter class implemented in Python. */ 14671 14672static PyMethodDef _string_methods[] = { 14673 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 14674 METH_O, PyDoc_STR("split the argument as a field name")}, 14675 {"formatter_parser", (PyCFunction) formatter_parser, 14676 METH_O, PyDoc_STR("parse the argument as a format string")}, 14677 {NULL, NULL} 14678}; 14679 14680static struct PyModuleDef _string_module = { 14681 PyModuleDef_HEAD_INIT, 14682 "_string", 14683 PyDoc_STR("string helper module"), 14684 0, 14685 _string_methods, 14686 NULL, 14687 NULL, 14688 NULL, 14689 NULL 14690}; 14691 14692PyMODINIT_FUNC 14693PyInit__string(void) 14694{ 14695 return PyModule_Create(&_string_module); 14696} 14697 14698 14699#ifdef __cplusplus 14700} 14701#endif 14702