unicodeobject.c revision 0d92c4f667518c7a24abda885e10c0c8e72cae57
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/* --- Globals ------------------------------------------------------------ 51 52 The globals are initialized by the _PyUnicode_Init() API and should 53 not be used before calling that API. 54 55*/ 56 57 58#ifdef __cplusplus 59extern "C" { 60#endif 61 62/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 63#define MAX_UNICODE 0x10ffff 64 65#ifdef Py_DEBUG 66# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 67#else 68# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 69#endif 70 71#define _PyUnicode_UTF8(op) \ 72 (((PyCompactUnicodeObject*)(op))->utf8) 73#define PyUnicode_UTF8(op) \ 74 (assert(_PyUnicode_CHECK(op)), \ 75 assert(PyUnicode_IS_READY(op)), \ 76 PyUnicode_IS_COMPACT_ASCII(op) ? \ 77 ((char*)((PyASCIIObject*)(op) + 1)) : \ 78 _PyUnicode_UTF8(op)) 79#define _PyUnicode_UTF8_LENGTH(op) \ 80 (((PyCompactUnicodeObject*)(op))->utf8_length) 81#define PyUnicode_UTF8_LENGTH(op) \ 82 (assert(_PyUnicode_CHECK(op)), \ 83 assert(PyUnicode_IS_READY(op)), \ 84 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 85 ((PyASCIIObject*)(op))->length : \ 86 _PyUnicode_UTF8_LENGTH(op)) 87#define _PyUnicode_WSTR(op) \ 88 (((PyASCIIObject*)(op))->wstr) 89#define _PyUnicode_WSTR_LENGTH(op) \ 90 (((PyCompactUnicodeObject*)(op))->wstr_length) 91#define _PyUnicode_LENGTH(op) \ 92 (((PyASCIIObject *)(op))->length) 93#define _PyUnicode_STATE(op) \ 94 (((PyASCIIObject *)(op))->state) 95#define _PyUnicode_HASH(op) \ 96 (((PyASCIIObject *)(op))->hash) 97#define _PyUnicode_KIND(op) \ 98 (assert(_PyUnicode_CHECK(op)), \ 99 ((PyASCIIObject *)(op))->state.kind) 100#define _PyUnicode_GET_LENGTH(op) \ 101 (assert(_PyUnicode_CHECK(op)), \ 102 ((PyASCIIObject *)(op))->length) 103#define _PyUnicode_DATA_ANY(op) \ 104 (((PyUnicodeObject*)(op))->data.any) 105 106/* Optimized version of Py_MAX() to compute the maximum character: 107 use it when your are computing the second argument of PyUnicode_New() */ 108#define MAX_MAXCHAR(maxchar1, maxchar2) \ 109 ((maxchar1) | (maxchar2)) 110 111#undef PyUnicode_READY 112#define PyUnicode_READY(op) \ 113 (assert(_PyUnicode_CHECK(op)), \ 114 (PyUnicode_IS_READY(op) ? 
\ 115 0 : \ 116 _PyUnicode_Ready(op))) 117 118#define _PyUnicode_SHARE_UTF8(op) \ 119 (assert(_PyUnicode_CHECK(op)), \ 120 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 121 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 122#define _PyUnicode_SHARE_WSTR(op) \ 123 (assert(_PyUnicode_CHECK(op)), \ 124 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 125 126/* true if the Unicode object has an allocated UTF-8 memory block 127 (not shared with other data) */ 128#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 129 (assert(_PyUnicode_CHECK(op)), \ 130 (!PyUnicode_IS_COMPACT_ASCII(op) \ 131 && _PyUnicode_UTF8(op) \ 132 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 133 134/* true if the Unicode object has an allocated wstr memory block 135 (not shared with other data) */ 136#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 137 (assert(_PyUnicode_CHECK(op)), \ 138 (_PyUnicode_WSTR(op) && \ 139 (!PyUnicode_IS_READY(op) || \ 140 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 141 142/* Generic helper macro to convert characters of different types. 143 from_type and to_type have to be valid type names, begin and end 144 are pointers to the source characters which should be of type 145 "from_type *". to is a pointer of type "to_type *" and points to the 146 buffer where the result characters are written to. */ 147#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 148 do { \ 149 to_type *_to = (to_type *) to; \ 150 const from_type *_iter = (begin); \ 151 const from_type *_end = (end); \ 152 Py_ssize_t n = (_end) - (_iter); \ 153 const from_type *_unrolled_end = \ 154 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 155 while (_iter < (_unrolled_end)) { \ 156 _to[0] = (to_type) _iter[0]; \ 157 _to[1] = (to_type) _iter[1]; \ 158 _to[2] = (to_type) _iter[2]; \ 159 _to[3] = (to_type) _iter[3]; \ 160 _iter += 4; _to += 4; \ 161 } \ 162 while (_iter < (_end)) \ 163 *_to++ = (to_type) *_iter++; \ 164 } while (0) 165 166/* This dictionary holds all interned unicode strings. 
Note that references 167 to strings in this dictionary are *not* counted in the string's ob_refcnt. 168 When the interned string reaches a refcnt of 0 the string deallocation 169 function will delete the reference from this dictionary. 170 171 Another way to look at this is that to say that the actual reference 172 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 173*/ 174static PyObject *interned; 175 176/* The empty Unicode object is shared to improve performance. */ 177static PyObject *unicode_empty; 178 179/* List of static strings. */ 180static _Py_Identifier *static_strings; 181 182/* Single character Unicode strings in the Latin-1 range are being 183 shared as well. */ 184static PyObject *unicode_latin1[256]; 185 186/* Fast detection of the most frequent whitespace characters */ 187const unsigned char _Py_ascii_whitespace[] = { 188 0, 0, 0, 0, 0, 0, 0, 0, 189/* case 0x0009: * CHARACTER TABULATION */ 190/* case 0x000A: * LINE FEED */ 191/* case 0x000B: * LINE TABULATION */ 192/* case 0x000C: * FORM FEED */ 193/* case 0x000D: * CARRIAGE RETURN */ 194 0, 1, 1, 1, 1, 1, 0, 0, 195 0, 0, 0, 0, 0, 0, 0, 0, 196/* case 0x001C: * FILE SEPARATOR */ 197/* case 0x001D: * GROUP SEPARATOR */ 198/* case 0x001E: * RECORD SEPARATOR */ 199/* case 0x001F: * UNIT SEPARATOR */ 200 0, 0, 0, 0, 1, 1, 1, 1, 201/* case 0x0020: * SPACE */ 202 1, 0, 0, 0, 0, 0, 0, 0, 203 0, 0, 0, 0, 0, 0, 0, 0, 204 0, 0, 0, 0, 0, 0, 0, 0, 205 0, 0, 0, 0, 0, 0, 0, 0, 206 207 0, 0, 0, 0, 0, 0, 0, 0, 208 0, 0, 0, 0, 0, 0, 0, 0, 209 0, 0, 0, 0, 0, 0, 0, 0, 210 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0, 212 0, 0, 0, 0, 0, 0, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0 215}; 216 217/* forward */ 218static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 219static PyObject* get_latin1_char(unsigned char ch); 220static int unicode_modifiable(PyObject *unicode); 221 222 223static PyObject * 224_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); 225static PyObject * 
226_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 227static PyObject * 228_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 229 230static PyObject * 231unicode_encode_call_errorhandler(const char *errors, 232 PyObject **errorHandler,const char *encoding, const char *reason, 233 PyObject *unicode, PyObject **exceptionObject, 234 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 235 236static void 237raise_encode_exception(PyObject **exceptionObject, 238 const char *encoding, 239 PyObject *unicode, 240 Py_ssize_t startpos, Py_ssize_t endpos, 241 const char *reason); 242 243/* Same for linebreaks */ 244static unsigned char ascii_linebreak[] = { 245 0, 0, 0, 0, 0, 0, 0, 0, 246/* 0x000A, * LINE FEED */ 247/* 0x000B, * LINE TABULATION */ 248/* 0x000C, * FORM FEED */ 249/* 0x000D, * CARRIAGE RETURN */ 250 0, 0, 1, 1, 1, 1, 0, 0, 251 0, 0, 0, 0, 0, 0, 0, 0, 252/* 0x001C, * FILE SEPARATOR */ 253/* 0x001D, * GROUP SEPARATOR */ 254/* 0x001E, * RECORD SEPARATOR */ 255 0, 0, 0, 0, 1, 1, 1, 0, 256 0, 0, 0, 0, 0, 0, 0, 0, 257 0, 0, 0, 0, 0, 0, 0, 0, 258 0, 0, 0, 0, 0, 0, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 260 261 0, 0, 0, 0, 0, 0, 0, 0, 262 0, 0, 0, 0, 0, 0, 0, 0, 263 0, 0, 0, 0, 0, 0, 0, 0, 264 0, 0, 0, 0, 0, 0, 0, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 0, 0, 0, 0, 0, 0, 0, 0 269}; 270 271/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 272 This function is kept for backward compatibility with the old API. */ 273Py_UNICODE 274PyUnicode_GetMax(void) 275{ 276#ifdef Py_UNICODE_WIDE 277 return 0x10FFFF; 278#else 279 /* This is actually an illegal character, so it should 280 not be passed to unichr. 
*/ 281 return 0xFFFF; 282#endif 283} 284 285#ifdef Py_DEBUG 286int 287_PyUnicode_CheckConsistency(PyObject *op, int check_content) 288{ 289 PyASCIIObject *ascii; 290 unsigned int kind; 291 292 assert(PyUnicode_Check(op)); 293 294 ascii = (PyASCIIObject *)op; 295 kind = ascii->state.kind; 296 297 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 298 assert(kind == PyUnicode_1BYTE_KIND); 299 assert(ascii->state.ready == 1); 300 } 301 else { 302 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 303 void *data; 304 305 if (ascii->state.compact == 1) { 306 data = compact + 1; 307 assert(kind == PyUnicode_1BYTE_KIND 308 || kind == PyUnicode_2BYTE_KIND 309 || kind == PyUnicode_4BYTE_KIND); 310 assert(ascii->state.ascii == 0); 311 assert(ascii->state.ready == 1); 312 assert (compact->utf8 != data); 313 } 314 else { 315 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 316 317 data = unicode->data.any; 318 if (kind == PyUnicode_WCHAR_KIND) { 319 assert(ascii->length == 0); 320 assert(ascii->hash == -1); 321 assert(ascii->state.compact == 0); 322 assert(ascii->state.ascii == 0); 323 assert(ascii->state.ready == 0); 324 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 325 assert(ascii->wstr != NULL); 326 assert(data == NULL); 327 assert(compact->utf8 == NULL); 328 } 329 else { 330 assert(kind == PyUnicode_1BYTE_KIND 331 || kind == PyUnicode_2BYTE_KIND 332 || kind == PyUnicode_4BYTE_KIND); 333 assert(ascii->state.compact == 0); 334 assert(ascii->state.ready == 1); 335 assert(data != NULL); 336 if (ascii->state.ascii) { 337 assert (compact->utf8 == data); 338 assert (compact->utf8_length == ascii->length); 339 } 340 else 341 assert (compact->utf8 != data); 342 } 343 } 344 if (kind != PyUnicode_WCHAR_KIND) { 345 if ( 346#if SIZEOF_WCHAR_T == 2 347 kind == PyUnicode_2BYTE_KIND 348#else 349 kind == PyUnicode_4BYTE_KIND 350#endif 351 ) 352 { 353 assert(ascii->wstr == data); 354 assert(compact->wstr_length == ascii->length); 355 } else 356 
assert(ascii->wstr != data); 357 } 358 359 if (compact->utf8 == NULL) 360 assert(compact->utf8_length == 0); 361 if (ascii->wstr == NULL) 362 assert(compact->wstr_length == 0); 363 } 364 /* check that the best kind is used */ 365 if (check_content && kind != PyUnicode_WCHAR_KIND) 366 { 367 Py_ssize_t i; 368 Py_UCS4 maxchar = 0; 369 void *data; 370 Py_UCS4 ch; 371 372 data = PyUnicode_DATA(ascii); 373 for (i=0; i < ascii->length; i++) 374 { 375 ch = PyUnicode_READ(kind, data, i); 376 if (ch > maxchar) 377 maxchar = ch; 378 } 379 if (kind == PyUnicode_1BYTE_KIND) { 380 if (ascii->state.ascii == 0) { 381 assert(maxchar >= 128); 382 assert(maxchar <= 255); 383 } 384 else 385 assert(maxchar < 128); 386 } 387 else if (kind == PyUnicode_2BYTE_KIND) { 388 assert(maxchar >= 0x100); 389 assert(maxchar <= 0xFFFF); 390 } 391 else { 392 assert(maxchar >= 0x10000); 393 assert(maxchar <= MAX_UNICODE); 394 } 395 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 396 } 397 return 1; 398} 399#endif 400 401static PyObject* 402unicode_result_wchar(PyObject *unicode) 403{ 404#ifndef Py_DEBUG 405 Py_ssize_t len; 406 407 len = _PyUnicode_WSTR_LENGTH(unicode); 408 if (len == 0) { 409 Py_INCREF(unicode_empty); 410 Py_DECREF(unicode); 411 return unicode_empty; 412 } 413 414 if (len == 1) { 415 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 416 if (ch < 256) { 417 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 418 Py_DECREF(unicode); 419 return latin1_char; 420 } 421 } 422 423 if (_PyUnicode_Ready(unicode) < 0) { 424 Py_DECREF(unicode); 425 return NULL; 426 } 427#else 428 assert(Py_REFCNT(unicode) == 1); 429 430 /* don't make the result ready in debug mode to ensure that the caller 431 makes the string ready before using it */ 432 assert(_PyUnicode_CheckConsistency(unicode, 1)); 433#endif 434 return unicode; 435} 436 437static PyObject* 438unicode_result_ready(PyObject *unicode) 439{ 440 Py_ssize_t length; 441 442 length = PyUnicode_GET_LENGTH(unicode); 443 if (length == 0) { 
444 if (unicode != unicode_empty) { 445 Py_INCREF(unicode_empty); 446 Py_DECREF(unicode); 447 } 448 return unicode_empty; 449 } 450 451 if (length == 1) { 452 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 453 if (ch < 256) { 454 PyObject *latin1_char = unicode_latin1[ch]; 455 if (latin1_char != NULL) { 456 if (unicode != latin1_char) { 457 Py_INCREF(latin1_char); 458 Py_DECREF(unicode); 459 } 460 return latin1_char; 461 } 462 else { 463 assert(_PyUnicode_CheckConsistency(unicode, 1)); 464 Py_INCREF(unicode); 465 unicode_latin1[ch] = unicode; 466 return unicode; 467 } 468 } 469 } 470 471 assert(_PyUnicode_CheckConsistency(unicode, 1)); 472 return unicode; 473} 474 475static PyObject* 476unicode_result(PyObject *unicode) 477{ 478 assert(_PyUnicode_CHECK(unicode)); 479 if (PyUnicode_IS_READY(unicode)) 480 return unicode_result_ready(unicode); 481 else 482 return unicode_result_wchar(unicode); 483} 484 485static PyObject* 486unicode_result_unchanged(PyObject *unicode) 487{ 488 if (PyUnicode_CheckExact(unicode)) { 489 if (PyUnicode_READY(unicode) == -1) 490 return NULL; 491 Py_INCREF(unicode); 492 return unicode; 493 } 494 else 495 /* Subtype -- return genuine unicode string with the same value. */ 496 return _PyUnicode_Copy(unicode); 497} 498 499#ifdef HAVE_MBCS 500static OSVERSIONINFOEX winver; 501#endif 502 503/* --- Bloom Filters ----------------------------------------------------- */ 504 505/* stuff to implement simple "bloom filters" for Unicode characters. 506 to keep things simple, we use a single bitmask, using the least 5 507 bits from each unicode characters as the bit index. 
*/ 508 509/* the linebreak mask is set up by Unicode_Init below */ 510 511#if LONG_BIT >= 128 512#define BLOOM_WIDTH 128 513#elif LONG_BIT >= 64 514#define BLOOM_WIDTH 64 515#elif LONG_BIT >= 32 516#define BLOOM_WIDTH 32 517#else 518#error "LONG_BIT is smaller than 32" 519#endif 520 521#define BLOOM_MASK unsigned long 522 523static BLOOM_MASK bloom_linebreak; 524 525#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 526#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 527 528#define BLOOM_LINEBREAK(ch) \ 529 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 530 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 531 532Py_LOCAL_INLINE(BLOOM_MASK) 533make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 534{ 535 /* calculate simple bloom-style bitmask for a given unicode string */ 536 537 BLOOM_MASK mask; 538 Py_ssize_t i; 539 540 mask = 0; 541 for (i = 0; i < len; i++) 542 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 543 544 return mask; 545} 546 547#define BLOOM_MEMBER(mask, chr, str) \ 548 (BLOOM(mask, chr) \ 549 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 550 551/* Compilation of templated routines */ 552 553#include "stringlib/asciilib.h" 554#include "stringlib/fastsearch.h" 555#include "stringlib/partition.h" 556#include "stringlib/split.h" 557#include "stringlib/count.h" 558#include "stringlib/find.h" 559#include "stringlib/find_max_char.h" 560#include "stringlib/localeutil.h" 561#include "stringlib/undef.h" 562 563#include "stringlib/ucs1lib.h" 564#include "stringlib/fastsearch.h" 565#include "stringlib/partition.h" 566#include "stringlib/split.h" 567#include "stringlib/count.h" 568#include "stringlib/find.h" 569#include "stringlib/find_max_char.h" 570#include "stringlib/localeutil.h" 571#include "stringlib/undef.h" 572 573#include "stringlib/ucs2lib.h" 574#include "stringlib/fastsearch.h" 575#include "stringlib/partition.h" 576#include "stringlib/split.h" 577#include 
"stringlib/count.h" 578#include "stringlib/find.h" 579#include "stringlib/find_max_char.h" 580#include "stringlib/localeutil.h" 581#include "stringlib/undef.h" 582 583#include "stringlib/ucs4lib.h" 584#include "stringlib/fastsearch.h" 585#include "stringlib/partition.h" 586#include "stringlib/split.h" 587#include "stringlib/count.h" 588#include "stringlib/find.h" 589#include "stringlib/find_max_char.h" 590#include "stringlib/localeutil.h" 591#include "stringlib/undef.h" 592 593#include "stringlib/unicodedefs.h" 594#include "stringlib/fastsearch.h" 595#include "stringlib/count.h" 596#include "stringlib/find.h" 597#include "stringlib/undef.h" 598 599/* --- Unicode Object ----------------------------------------------------- */ 600 601static PyObject * 602fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 603 604Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 605 Py_ssize_t size, Py_UCS4 ch, 606 int direction) 607{ 608 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; 609 610 switch (kind) { 611 case PyUnicode_1BYTE_KIND: 612 { 613 Py_UCS1 ch1 = (Py_UCS1) ch; 614 if (ch1 == ch) 615 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 616 else 617 return -1; 618 } 619 case PyUnicode_2BYTE_KIND: 620 { 621 Py_UCS2 ch2 = (Py_UCS2) ch; 622 if (ch2 == ch) 623 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 624 else 625 return -1; 626 } 627 case PyUnicode_4BYTE_KIND: 628 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 629 default: 630 assert(0); 631 return -1; 632 } 633} 634 635#ifdef Py_DEBUG 636/* Fill the data of an Unicode string with invalid characters to detect bugs 637 earlier. 638 639 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 640 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 641 invalid character in Unicode 6.0. 
*/ 642static void 643unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 644{ 645 int kind = PyUnicode_KIND(unicode); 646 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 647 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 648 if (length <= old_length) 649 return; 650 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 651} 652#endif 653 654static PyObject* 655resize_compact(PyObject *unicode, Py_ssize_t length) 656{ 657 Py_ssize_t char_size; 658 Py_ssize_t struct_size; 659 Py_ssize_t new_size; 660 int share_wstr; 661 PyObject *new_unicode; 662#ifdef Py_DEBUG 663 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 664#endif 665 666 assert(unicode_modifiable(unicode)); 667 assert(PyUnicode_IS_READY(unicode)); 668 assert(PyUnicode_IS_COMPACT(unicode)); 669 670 char_size = PyUnicode_KIND(unicode); 671 if (PyUnicode_IS_ASCII(unicode)) 672 struct_size = sizeof(PyASCIIObject); 673 else 674 struct_size = sizeof(PyCompactUnicodeObject); 675 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 676 677 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 678 PyErr_NoMemory(); 679 return NULL; 680 } 681 new_size = (struct_size + (length + 1) * char_size); 682 683 _Py_DEC_REFTOTAL; 684 _Py_ForgetReference(unicode); 685 686 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 687 if (new_unicode == NULL) { 688 _Py_NewReference(unicode); 689 PyErr_NoMemory(); 690 return NULL; 691 } 692 unicode = new_unicode; 693 _Py_NewReference(unicode); 694 695 _PyUnicode_LENGTH(unicode) = length; 696 if (share_wstr) { 697 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 698 if (!PyUnicode_IS_ASCII(unicode)) 699 _PyUnicode_WSTR_LENGTH(unicode) = length; 700 } 701#ifdef Py_DEBUG 702 unicode_fill_invalid(unicode, old_length); 703#endif 704 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 705 length, 0); 706 assert(_PyUnicode_CheckConsistency(unicode, 0)); 707 return unicode; 708} 709 710static int 711resize_inplace(PyObject 
*unicode, Py_ssize_t length) 712{ 713 wchar_t *wstr; 714 Py_ssize_t new_size; 715 assert(!PyUnicode_IS_COMPACT(unicode)); 716 assert(Py_REFCNT(unicode) == 1); 717 718 if (PyUnicode_IS_READY(unicode)) { 719 Py_ssize_t char_size; 720 int share_wstr, share_utf8; 721 void *data; 722#ifdef Py_DEBUG 723 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 724#endif 725 726 data = _PyUnicode_DATA_ANY(unicode); 727 char_size = PyUnicode_KIND(unicode); 728 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 729 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 730 731 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 732 PyErr_NoMemory(); 733 return -1; 734 } 735 new_size = (length + 1) * char_size; 736 737 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 738 { 739 PyObject_DEL(_PyUnicode_UTF8(unicode)); 740 _PyUnicode_UTF8(unicode) = NULL; 741 _PyUnicode_UTF8_LENGTH(unicode) = 0; 742 } 743 744 data = (PyObject *)PyObject_REALLOC(data, new_size); 745 if (data == NULL) { 746 PyErr_NoMemory(); 747 return -1; 748 } 749 _PyUnicode_DATA_ANY(unicode) = data; 750 if (share_wstr) { 751 _PyUnicode_WSTR(unicode) = data; 752 _PyUnicode_WSTR_LENGTH(unicode) = length; 753 } 754 if (share_utf8) { 755 _PyUnicode_UTF8(unicode) = data; 756 _PyUnicode_UTF8_LENGTH(unicode) = length; 757 } 758 _PyUnicode_LENGTH(unicode) = length; 759 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 760#ifdef Py_DEBUG 761 unicode_fill_invalid(unicode, old_length); 762#endif 763 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 764 assert(_PyUnicode_CheckConsistency(unicode, 0)); 765 return 0; 766 } 767 } 768 assert(_PyUnicode_WSTR(unicode) != NULL); 769 770 /* check for integer overflow */ 771 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 772 PyErr_NoMemory(); 773 return -1; 774 } 775 new_size = sizeof(wchar_t) * (length + 1); 776 wstr = _PyUnicode_WSTR(unicode); 777 wstr = PyObject_REALLOC(wstr, new_size); 778 if (!wstr) { 779 PyErr_NoMemory(); 780 return -1; 781 } 782 _PyUnicode_WSTR(unicode) = 
wstr; 783 _PyUnicode_WSTR(unicode)[length] = 0; 784 _PyUnicode_WSTR_LENGTH(unicode) = length; 785 assert(_PyUnicode_CheckConsistency(unicode, 0)); 786 return 0; 787} 788 789static PyObject* 790resize_copy(PyObject *unicode, Py_ssize_t length) 791{ 792 Py_ssize_t copy_length; 793 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 794 PyObject *copy; 795 796 if (PyUnicode_READY(unicode) == -1) 797 return NULL; 798 799 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 800 if (copy == NULL) 801 return NULL; 802 803 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 804 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 805 return copy; 806 } 807 else { 808 PyObject *w; 809 810 w = (PyObject*)_PyUnicode_New(length); 811 if (w == NULL) 812 return NULL; 813 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 814 copy_length = Py_MIN(copy_length, length); 815 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 816 copy_length * sizeof(wchar_t)); 817 return w; 818 } 819} 820 821/* We allocate one more byte to make sure the string is 822 Ux0000 terminated; some code (e.g. new_identifier) 823 relies on that. 824 825 XXX This allocator could further be enhanced by assuring that the 826 free list never reduces its size below 1. 827 828*/ 829 830static PyUnicodeObject * 831_PyUnicode_New(Py_ssize_t length) 832{ 833 register PyUnicodeObject *unicode; 834 size_t new_size; 835 836 /* Optimization for empty strings */ 837 if (length == 0 && unicode_empty != NULL) { 838 Py_INCREF(unicode_empty); 839 return (PyUnicodeObject*)unicode_empty; 840 } 841 842 /* Ensure we won't overflow the size. 
*/ 843 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 844 return (PyUnicodeObject *)PyErr_NoMemory(); 845 } 846 if (length < 0) { 847 PyErr_SetString(PyExc_SystemError, 848 "Negative size passed to _PyUnicode_New"); 849 return NULL; 850 } 851 852 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 853 if (unicode == NULL) 854 return NULL; 855 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 856 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 857 if (!_PyUnicode_WSTR(unicode)) { 858 Py_DECREF(unicode); 859 PyErr_NoMemory(); 860 return NULL; 861 } 862 863 /* Initialize the first element to guard against cases where 864 * the caller fails before initializing str -- unicode_resize() 865 * reads str[0], and the Keep-Alive optimization can keep memory 866 * allocated for str alive across a call to unicode_dealloc(unicode). 867 * We don't want unicode_resize to read uninitialized memory in 868 * that case. 869 */ 870 _PyUnicode_WSTR(unicode)[0] = 0; 871 _PyUnicode_WSTR(unicode)[length] = 0; 872 _PyUnicode_WSTR_LENGTH(unicode) = length; 873 _PyUnicode_HASH(unicode) = -1; 874 _PyUnicode_STATE(unicode).interned = 0; 875 _PyUnicode_STATE(unicode).kind = 0; 876 _PyUnicode_STATE(unicode).compact = 0; 877 _PyUnicode_STATE(unicode).ready = 0; 878 _PyUnicode_STATE(unicode).ascii = 0; 879 _PyUnicode_DATA_ANY(unicode) = NULL; 880 _PyUnicode_LENGTH(unicode) = 0; 881 _PyUnicode_UTF8(unicode) = NULL; 882 _PyUnicode_UTF8_LENGTH(unicode) = 0; 883 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 884 return unicode; 885} 886 887static const char* 888unicode_kind_name(PyObject *unicode) 889{ 890 /* don't check consistency: unicode_kind_name() is called from 891 _PyUnicode_Dump() */ 892 if (!PyUnicode_IS_COMPACT(unicode)) 893 { 894 if (!PyUnicode_IS_READY(unicode)) 895 return "wstr"; 896 switch (PyUnicode_KIND(unicode)) 897 { 898 case PyUnicode_1BYTE_KIND: 899 if (PyUnicode_IS_ASCII(unicode)) 900 return "legacy ascii"; 901 else 902 
return "legacy latin1"; 903 case PyUnicode_2BYTE_KIND: 904 return "legacy UCS2"; 905 case PyUnicode_4BYTE_KIND: 906 return "legacy UCS4"; 907 default: 908 return "<legacy invalid kind>"; 909 } 910 } 911 assert(PyUnicode_IS_READY(unicode)); 912 switch (PyUnicode_KIND(unicode)) { 913 case PyUnicode_1BYTE_KIND: 914 if (PyUnicode_IS_ASCII(unicode)) 915 return "ascii"; 916 else 917 return "latin1"; 918 case PyUnicode_2BYTE_KIND: 919 return "UCS2"; 920 case PyUnicode_4BYTE_KIND: 921 return "UCS4"; 922 default: 923 return "<invalid compact kind>"; 924 } 925} 926 927#ifdef Py_DEBUG 928/* Functions wrapping macros for use in debugger */ 929char *_PyUnicode_utf8(void *unicode){ 930 return PyUnicode_UTF8(unicode); 931} 932 933void *_PyUnicode_compact_data(void *unicode) { 934 return _PyUnicode_COMPACT_DATA(unicode); 935} 936void *_PyUnicode_data(void *unicode){ 937 printf("obj %p\n", unicode); 938 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 939 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 940 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 941 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 942 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 943 return PyUnicode_DATA(unicode); 944} 945 946void 947_PyUnicode_Dump(PyObject *op) 948{ 949 PyASCIIObject *ascii = (PyASCIIObject *)op; 950 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 951 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 952 void *data; 953 954 if (ascii->state.compact) 955 { 956 if (ascii->state.ascii) 957 data = (ascii + 1); 958 else 959 data = (compact + 1); 960 } 961 else 962 data = unicode->data.any; 963 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 964 965 if (ascii->wstr == data) 966 printf("shared "); 967 printf("wstr=%p", ascii->wstr); 968 969 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 970 printf(" (%zu), ", compact->wstr_length); 971 if (!ascii->state.compact 
&& compact->utf8 == unicode->data.any) 972 printf("shared "); 973 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 974 } 975 printf(", data=%p\n", data); 976} 977#endif 978 979PyObject * 980PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 981{ 982 PyObject *obj; 983 PyCompactUnicodeObject *unicode; 984 void *data; 985 enum PyUnicode_Kind kind; 986 int is_sharing, is_ascii; 987 Py_ssize_t char_size; 988 Py_ssize_t struct_size; 989 990 /* Optimization for empty strings */ 991 if (size == 0 && unicode_empty != NULL) { 992 Py_INCREF(unicode_empty); 993 return unicode_empty; 994 } 995 996 is_ascii = 0; 997 is_sharing = 0; 998 struct_size = sizeof(PyCompactUnicodeObject); 999 if (maxchar < 128) { 1000 kind = PyUnicode_1BYTE_KIND; 1001 char_size = 1; 1002 is_ascii = 1; 1003 struct_size = sizeof(PyASCIIObject); 1004 } 1005 else if (maxchar < 256) { 1006 kind = PyUnicode_1BYTE_KIND; 1007 char_size = 1; 1008 } 1009 else if (maxchar < 65536) { 1010 kind = PyUnicode_2BYTE_KIND; 1011 char_size = 2; 1012 if (sizeof(wchar_t) == 2) 1013 is_sharing = 1; 1014 } 1015 else { 1016 if (maxchar > MAX_UNICODE) { 1017 PyErr_SetString(PyExc_SystemError, 1018 "invalid maximum character passed to PyUnicode_New"); 1019 return NULL; 1020 } 1021 kind = PyUnicode_4BYTE_KIND; 1022 char_size = 4; 1023 if (sizeof(wchar_t) == 4) 1024 is_sharing = 1; 1025 } 1026 1027 /* Ensure we won't overflow the size. */ 1028 if (size < 0) { 1029 PyErr_SetString(PyExc_SystemError, 1030 "Negative size passed to PyUnicode_New"); 1031 return NULL; 1032 } 1033 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1034 return PyErr_NoMemory(); 1035 1036 /* Duplicated allocation code from _PyObject_New() instead of a call to 1037 * PyObject_New() so we are able to allocate space for the object and 1038 * it's data buffer. 
1039 */ 1040 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1041 if (obj == NULL) 1042 return PyErr_NoMemory(); 1043 obj = PyObject_INIT(obj, &PyUnicode_Type); 1044 if (obj == NULL) 1045 return NULL; 1046 1047 unicode = (PyCompactUnicodeObject *)obj; 1048 if (is_ascii) 1049 data = ((PyASCIIObject*)obj) + 1; 1050 else 1051 data = unicode + 1; 1052 _PyUnicode_LENGTH(unicode) = size; 1053 _PyUnicode_HASH(unicode) = -1; 1054 _PyUnicode_STATE(unicode).interned = 0; 1055 _PyUnicode_STATE(unicode).kind = kind; 1056 _PyUnicode_STATE(unicode).compact = 1; 1057 _PyUnicode_STATE(unicode).ready = 1; 1058 _PyUnicode_STATE(unicode).ascii = is_ascii; 1059 if (is_ascii) { 1060 ((char*)data)[size] = 0; 1061 _PyUnicode_WSTR(unicode) = NULL; 1062 } 1063 else if (kind == PyUnicode_1BYTE_KIND) { 1064 ((char*)data)[size] = 0; 1065 _PyUnicode_WSTR(unicode) = NULL; 1066 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1067 unicode->utf8 = NULL; 1068 unicode->utf8_length = 0; 1069 } 1070 else { 1071 unicode->utf8 = NULL; 1072 unicode->utf8_length = 0; 1073 if (kind == PyUnicode_2BYTE_KIND) 1074 ((Py_UCS2*)data)[size] = 0; 1075 else /* kind == PyUnicode_4BYTE_KIND */ 1076 ((Py_UCS4*)data)[size] = 0; 1077 if (is_sharing) { 1078 _PyUnicode_WSTR_LENGTH(unicode) = size; 1079 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1080 } 1081 else { 1082 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1083 _PyUnicode_WSTR(unicode) = NULL; 1084 } 1085 } 1086#ifdef Py_DEBUG 1087 unicode_fill_invalid((PyObject*)unicode, 0); 1088#endif 1089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1090 return obj; 1091} 1092 1093#if SIZEOF_WCHAR_T == 2 1094/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1095 will decode surrogate pairs, the other conversions are implemented as macros 1096 for efficiency. 1097 1098 This function assumes that unicode can hold one more code point than wstr 1099 characters for a terminating null character. 
*/ 1100static void 1101unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1102 PyObject *unicode) 1103{ 1104 const wchar_t *iter; 1105 Py_UCS4 *ucs4_out; 1106 1107 assert(unicode != NULL); 1108 assert(_PyUnicode_CHECK(unicode)); 1109 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1110 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1111 1112 for (iter = begin; iter < end; ) { 1113 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1114 _PyUnicode_GET_LENGTH(unicode))); 1115 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1116 && (iter+1) < end 1117 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1118 { 1119 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1120 iter += 2; 1121 } 1122 else { 1123 *ucs4_out++ = *iter; 1124 iter++; 1125 } 1126 } 1127 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1128 _PyUnicode_GET_LENGTH(unicode))); 1129 1130} 1131#endif 1132 1133static int 1134unicode_check_modifiable(PyObject *unicode) 1135{ 1136 if (!unicode_modifiable(unicode)) { 1137 PyErr_SetString(PyExc_SystemError, 1138 "Cannot modify a string currently used"); 1139 return -1; 1140 } 1141 return 0; 1142} 1143 1144static int 1145_copy_characters(PyObject *to, Py_ssize_t to_start, 1146 PyObject *from, Py_ssize_t from_start, 1147 Py_ssize_t how_many, int check_maxchar) 1148{ 1149 unsigned int from_kind, to_kind; 1150 void *from_data, *to_data; 1151 1152 assert(0 <= how_many); 1153 assert(0 <= from_start); 1154 assert(0 <= to_start); 1155 assert(PyUnicode_Check(from)); 1156 assert(PyUnicode_IS_READY(from)); 1157 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1158 1159 assert(PyUnicode_Check(to)); 1160 assert(PyUnicode_IS_READY(to)); 1161 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1162 1163 if (how_many == 0) 1164 return 0; 1165 1166 from_kind = PyUnicode_KIND(from); 1167 from_data = PyUnicode_DATA(from); 1168 to_kind = PyUnicode_KIND(to); 1169 to_data = PyUnicode_DATA(to); 1170 1171#ifdef Py_DEBUG 1172 if (!check_maxchar 
1173 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1174 { 1175 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1176 Py_UCS4 ch; 1177 Py_ssize_t i; 1178 for (i=0; i < how_many; i++) { 1179 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1180 assert(ch <= to_maxchar); 1181 } 1182 } 1183#endif 1184 1185 if (from_kind == to_kind) { 1186 if (check_maxchar 1187 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1188 { 1189 /* Writing Latin-1 characters into an ASCII string requires to 1190 check that all written characters are pure ASCII */ 1191 Py_UCS4 max_char; 1192 max_char = ucs1lib_find_max_char(from_data, 1193 (Py_UCS1*)from_data + how_many); 1194 if (max_char >= 128) 1195 return -1; 1196 } 1197 Py_MEMCPY((char*)to_data + to_kind * to_start, 1198 (char*)from_data + from_kind * from_start, 1199 to_kind * how_many); 1200 } 1201 else if (from_kind == PyUnicode_1BYTE_KIND 1202 && to_kind == PyUnicode_2BYTE_KIND) 1203 { 1204 _PyUnicode_CONVERT_BYTES( 1205 Py_UCS1, Py_UCS2, 1206 PyUnicode_1BYTE_DATA(from) + from_start, 1207 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1208 PyUnicode_2BYTE_DATA(to) + to_start 1209 ); 1210 } 1211 else if (from_kind == PyUnicode_1BYTE_KIND 1212 && to_kind == PyUnicode_4BYTE_KIND) 1213 { 1214 _PyUnicode_CONVERT_BYTES( 1215 Py_UCS1, Py_UCS4, 1216 PyUnicode_1BYTE_DATA(from) + from_start, 1217 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1218 PyUnicode_4BYTE_DATA(to) + to_start 1219 ); 1220 } 1221 else if (from_kind == PyUnicode_2BYTE_KIND 1222 && to_kind == PyUnicode_4BYTE_KIND) 1223 { 1224 _PyUnicode_CONVERT_BYTES( 1225 Py_UCS2, Py_UCS4, 1226 PyUnicode_2BYTE_DATA(from) + from_start, 1227 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1228 PyUnicode_4BYTE_DATA(to) + to_start 1229 ); 1230 } 1231 else { 1232 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1233 1234 if (!check_maxchar) { 1235 if (from_kind == PyUnicode_2BYTE_KIND 1236 && to_kind == 
PyUnicode_1BYTE_KIND) 1237 { 1238 _PyUnicode_CONVERT_BYTES( 1239 Py_UCS2, Py_UCS1, 1240 PyUnicode_2BYTE_DATA(from) + from_start, 1241 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1242 PyUnicode_1BYTE_DATA(to) + to_start 1243 ); 1244 } 1245 else if (from_kind == PyUnicode_4BYTE_KIND 1246 && to_kind == PyUnicode_1BYTE_KIND) 1247 { 1248 _PyUnicode_CONVERT_BYTES( 1249 Py_UCS4, Py_UCS1, 1250 PyUnicode_4BYTE_DATA(from) + from_start, 1251 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1252 PyUnicode_1BYTE_DATA(to) + to_start 1253 ); 1254 } 1255 else if (from_kind == PyUnicode_4BYTE_KIND 1256 && to_kind == PyUnicode_2BYTE_KIND) 1257 { 1258 _PyUnicode_CONVERT_BYTES( 1259 Py_UCS4, Py_UCS2, 1260 PyUnicode_4BYTE_DATA(from) + from_start, 1261 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1262 PyUnicode_2BYTE_DATA(to) + to_start 1263 ); 1264 } 1265 else { 1266 assert(0); 1267 return -1; 1268 } 1269 } 1270 else { 1271 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1272 Py_UCS4 ch; 1273 Py_ssize_t i; 1274 1275 for (i=0; i < how_many; i++) { 1276 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1277 if (ch > to_maxchar) 1278 return -1; 1279 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1280 } 1281 } 1282 } 1283 return 0; 1284} 1285 1286void 1287_PyUnicode_FastCopyCharacters( 1288 PyObject *to, Py_ssize_t to_start, 1289 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1290{ 1291 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1292} 1293 1294Py_ssize_t 1295PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1296 PyObject *from, Py_ssize_t from_start, 1297 Py_ssize_t how_many) 1298{ 1299 int err; 1300 1301 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1302 PyErr_BadInternalCall(); 1303 return -1; 1304 } 1305 1306 if (PyUnicode_READY(from) == -1) 1307 return -1; 1308 if (PyUnicode_READY(to) == -1) 1309 return -1; 1310 1311 if (from_start < 0) { 1312 PyErr_SetString(PyExc_IndexError, "string 
index out of range"); 1313 return -1; 1314 } 1315 if (to_start < 0) { 1316 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1317 return -1; 1318 } 1319 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1320 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1321 PyErr_Format(PyExc_SystemError, 1322 "Cannot write %zi characters at %zi " 1323 "in a string of %zi characters", 1324 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1325 return -1; 1326 } 1327 1328 if (how_many == 0) 1329 return 0; 1330 1331 if (unicode_check_modifiable(to)) 1332 return -1; 1333 1334 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1335 if (err) { 1336 PyErr_Format(PyExc_SystemError, 1337 "Cannot copy %s characters " 1338 "into a string of %s characters", 1339 unicode_kind_name(from), 1340 unicode_kind_name(to)); 1341 return -1; 1342 } 1343 return how_many; 1344} 1345 1346/* Find the maximum code point and count the number of surrogate pairs so a 1347 correct string length can be computed before converting a string to UCS4. 1348 This function counts single surrogates as a character and not as a pair. 1349 1350 Return 0 on success, or -1 on error. 
*/ 1351static int 1352find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1353 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1354{ 1355 const wchar_t *iter; 1356 Py_UCS4 ch; 1357 1358 assert(num_surrogates != NULL && maxchar != NULL); 1359 *num_surrogates = 0; 1360 *maxchar = 0; 1361 1362 for (iter = begin; iter < end; ) { 1363#if SIZEOF_WCHAR_T == 2 1364 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1365 && (iter+1) < end 1366 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1367 { 1368 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1369 ++(*num_surrogates); 1370 iter += 2; 1371 } 1372 else 1373#endif 1374 { 1375 ch = *iter; 1376 iter++; 1377 } 1378 if (ch > *maxchar) { 1379 *maxchar = ch; 1380 if (*maxchar > MAX_UNICODE) { 1381 PyErr_Format(PyExc_ValueError, 1382 "character U+%x is not in range [U+0000; U+10ffff]", 1383 ch); 1384 return -1; 1385 } 1386 } 1387 } 1388 return 0; 1389} 1390 1391int 1392_PyUnicode_Ready(PyObject *unicode) 1393{ 1394 wchar_t *end; 1395 Py_UCS4 maxchar = 0; 1396 Py_ssize_t num_surrogates; 1397#if SIZEOF_WCHAR_T == 2 1398 Py_ssize_t length_wo_surrogates; 1399#endif 1400 1401 /* _PyUnicode_Ready() is only intended for old-style API usage where 1402 strings were created using _PyObject_New() and where no canonical 1403 representation (the str field) has been set yet aka strings 1404 which are not yet ready. 
*/ 1405 assert(_PyUnicode_CHECK(unicode)); 1406 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1407 assert(_PyUnicode_WSTR(unicode) != NULL); 1408 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1409 assert(_PyUnicode_UTF8(unicode) == NULL); 1410 /* Actually, it should neither be interned nor be anything else: */ 1411 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1412 1413 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1414 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1415 &maxchar, &num_surrogates) == -1) 1416 return -1; 1417 1418 if (maxchar < 256) { 1419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1420 if (!_PyUnicode_DATA_ANY(unicode)) { 1421 PyErr_NoMemory(); 1422 return -1; 1423 } 1424 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1425 _PyUnicode_WSTR(unicode), end, 1426 PyUnicode_1BYTE_DATA(unicode)); 1427 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1428 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1429 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1430 if (maxchar < 128) { 1431 _PyUnicode_STATE(unicode).ascii = 1; 1432 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1433 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1434 } 1435 else { 1436 _PyUnicode_STATE(unicode).ascii = 0; 1437 _PyUnicode_UTF8(unicode) = NULL; 1438 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1439 } 1440 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1441 _PyUnicode_WSTR(unicode) = NULL; 1442 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1443 } 1444 /* In this case we might have to convert down from 4-byte native 1445 wchar_t to 2-byte unicode. */ 1446 else if (maxchar < 65536) { 1447 assert(num_surrogates == 0 && 1448 "FindMaxCharAndNumSurrogatePairs() messed up"); 1449 1450#if SIZEOF_WCHAR_T == 2 1451 /* We can share representations and are done. 
*/ 1452 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1453 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1454 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1455 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1456 _PyUnicode_UTF8(unicode) = NULL; 1457 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1458#else 1459 /* sizeof(wchar_t) == 4 */ 1460 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1461 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1462 if (!_PyUnicode_DATA_ANY(unicode)) { 1463 PyErr_NoMemory(); 1464 return -1; 1465 } 1466 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1467 _PyUnicode_WSTR(unicode), end, 1468 PyUnicode_2BYTE_DATA(unicode)); 1469 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1470 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1471 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1472 _PyUnicode_UTF8(unicode) = NULL; 1473 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1474 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1475 _PyUnicode_WSTR(unicode) = NULL; 1476 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1477#endif 1478 } 1479 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1480 else { 1481#if SIZEOF_WCHAR_T == 2 1482 /* in case the native representation is 2-bytes, we need to allocate a 1483 new normalized 4-byte version. 
*/ 1484 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1485 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1486 if (!_PyUnicode_DATA_ANY(unicode)) { 1487 PyErr_NoMemory(); 1488 return -1; 1489 } 1490 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1491 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1492 _PyUnicode_UTF8(unicode) = NULL; 1493 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1494 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1495 _PyUnicode_STATE(unicode).ready = 1; 1496 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1497 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1498 _PyUnicode_WSTR(unicode) = NULL; 1499 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1500#else 1501 assert(num_surrogates == 0); 1502 1503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1504 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1505 _PyUnicode_UTF8(unicode) = NULL; 1506 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1507 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1508#endif 1509 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1510 } 1511 _PyUnicode_STATE(unicode).ready = 1; 1512 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1513 return 0; 1514} 1515 1516static void 1517unicode_dealloc(register PyObject *unicode) 1518{ 1519 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1520 case SSTATE_NOT_INTERNED: 1521 break; 1522 1523 case SSTATE_INTERNED_MORTAL: 1524 /* revive dead object temporarily for DelItem */ 1525 Py_REFCNT(unicode) = 3; 1526 if (PyDict_DelItem(interned, unicode) != 0) 1527 Py_FatalError( 1528 "deletion of interned string failed"); 1529 break; 1530 1531 case SSTATE_INTERNED_IMMORTAL: 1532 Py_FatalError("Immortal interned string died."); 1533 1534 default: 1535 Py_FatalError("Inconsistent interned string state."); 1536 } 1537 1538 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1539 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1540 if 
(_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1541 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1542 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1543 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1544 1545 Py_TYPE(unicode)->tp_free(unicode); 1546} 1547 1548#ifdef Py_DEBUG 1549static int 1550unicode_is_singleton(PyObject *unicode) 1551{ 1552 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1553 if (unicode == unicode_empty) 1554 return 1; 1555 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1556 { 1557 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1558 if (ch < 256 && unicode_latin1[ch] == unicode) 1559 return 1; 1560 } 1561 return 0; 1562} 1563#endif 1564 1565static int 1566unicode_modifiable(PyObject *unicode) 1567{ 1568 assert(_PyUnicode_CHECK(unicode)); 1569 if (Py_REFCNT(unicode) != 1) 1570 return 0; 1571 if (_PyUnicode_HASH(unicode) != -1) 1572 return 0; 1573 if (PyUnicode_CHECK_INTERNED(unicode)) 1574 return 0; 1575 if (!PyUnicode_CheckExact(unicode)) 1576 return 0; 1577#ifdef Py_DEBUG 1578 /* singleton refcount is greater than 1 */ 1579 assert(!unicode_is_singleton(unicode)); 1580#endif 1581 return 1; 1582} 1583 1584static int 1585unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1586{ 1587 PyObject *unicode; 1588 Py_ssize_t old_length; 1589 1590 assert(p_unicode != NULL); 1591 unicode = *p_unicode; 1592 1593 assert(unicode != NULL); 1594 assert(PyUnicode_Check(unicode)); 1595 assert(0 <= length); 1596 1597 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1598 old_length = PyUnicode_WSTR_LENGTH(unicode); 1599 else 1600 old_length = PyUnicode_GET_LENGTH(unicode); 1601 if (old_length == length) 1602 return 0; 1603 1604 if (length == 0) { 1605 Py_DECREF(*p_unicode); 1606 *p_unicode = unicode_empty; 1607 Py_INCREF(*p_unicode); 1608 return 0; 1609 } 1610 1611 if (!unicode_modifiable(unicode)) { 1612 PyObject *copy = resize_copy(unicode, length); 1613 if (copy == NULL) 1614 return -1; 1615 Py_DECREF(*p_unicode); 1616 
*p_unicode = copy; 1617 return 0; 1618 } 1619 1620 if (PyUnicode_IS_COMPACT(unicode)) { 1621 PyObject *new_unicode = resize_compact(unicode, length); 1622 if (new_unicode == NULL) 1623 return -1; 1624 *p_unicode = new_unicode; 1625 return 0; 1626 } 1627 return resize_inplace(unicode, length); 1628} 1629 1630int 1631PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1632{ 1633 PyObject *unicode; 1634 if (p_unicode == NULL) { 1635 PyErr_BadInternalCall(); 1636 return -1; 1637 } 1638 unicode = *p_unicode; 1639 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1640 { 1641 PyErr_BadInternalCall(); 1642 return -1; 1643 } 1644 return unicode_resize(p_unicode, length); 1645} 1646 1647/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1648 1649 WARNING: The function doesn't copy the terminating null character and 1650 doesn't check the maximum character (may write a latin1 character in an 1651 ASCII string). */ 1652static void 1653unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1654 const char *str, Py_ssize_t len) 1655{ 1656 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1657 void *data = PyUnicode_DATA(unicode); 1658 const char *end = str + len; 1659 1660 switch (kind) { 1661 case PyUnicode_1BYTE_KIND: { 1662 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1663#ifdef Py_DEBUG 1664 if (PyUnicode_IS_ASCII(unicode)) { 1665 Py_UCS4 maxchar = ucs1lib_find_max_char( 1666 (const Py_UCS1*)str, 1667 (const Py_UCS1*)str + len); 1668 assert(maxchar < 128); 1669 } 1670#endif 1671 memcpy((char *) data + index, str, len); 1672 break; 1673 } 1674 case PyUnicode_2BYTE_KIND: { 1675 Py_UCS2 *start = (Py_UCS2 *)data + index; 1676 Py_UCS2 *ucs2 = start; 1677 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1678 1679 for (; str < end; ++ucs2, ++str) 1680 *ucs2 = (Py_UCS2)*str; 1681 1682 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1683 break; 1684 } 1685 default: { 1686 Py_UCS4 *start = (Py_UCS4 *)data + index; 1687 Py_UCS4 
*ucs4 = start; 1688 assert(kind == PyUnicode_4BYTE_KIND); 1689 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1690 1691 for (; str < end; ++ucs4, ++str) 1692 *ucs4 = (Py_UCS4)*str; 1693 1694 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1695 } 1696 } 1697} 1698 1699 1700static PyObject* 1701get_latin1_char(unsigned char ch) 1702{ 1703 PyObject *unicode = unicode_latin1[ch]; 1704 if (!unicode) { 1705 unicode = PyUnicode_New(1, ch); 1706 if (!unicode) 1707 return NULL; 1708 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1709 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1710 unicode_latin1[ch] = unicode; 1711 } 1712 Py_INCREF(unicode); 1713 return unicode; 1714} 1715 1716PyObject * 1717PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1718{ 1719 PyObject *unicode; 1720 Py_UCS4 maxchar = 0; 1721 Py_ssize_t num_surrogates; 1722 1723 if (u == NULL) 1724 return (PyObject*)_PyUnicode_New(size); 1725 1726 /* If the Unicode data is known at construction time, we can apply 1727 some optimizations which share commonly used objects. 
*/ 1728 1729 /* Optimization for empty strings */ 1730 if (size == 0 && unicode_empty != NULL) { 1731 Py_INCREF(unicode_empty); 1732 return unicode_empty; 1733 } 1734 1735 /* Single character Unicode objects in the Latin-1 range are 1736 shared when using this constructor */ 1737 if (size == 1 && *u < 256) 1738 return get_latin1_char((unsigned char)*u); 1739 1740 /* If not empty and not single character, copy the Unicode data 1741 into the new object */ 1742 if (find_maxchar_surrogates(u, u + size, 1743 &maxchar, &num_surrogates) == -1) 1744 return NULL; 1745 1746 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1747 if (!unicode) 1748 return NULL; 1749 1750 switch (PyUnicode_KIND(unicode)) { 1751 case PyUnicode_1BYTE_KIND: 1752 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1753 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1754 break; 1755 case PyUnicode_2BYTE_KIND: 1756#if Py_UNICODE_SIZE == 2 1757 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1758#else 1759 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1760 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1761#endif 1762 break; 1763 case PyUnicode_4BYTE_KIND: 1764#if SIZEOF_WCHAR_T == 2 1765 /* This is the only case which has to process surrogates, thus 1766 a simple copy loop is not enough and we need a function. 
*/ 1767 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1768#else 1769 assert(num_surrogates == 0); 1770 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1771#endif 1772 break; 1773 default: 1774 assert(0 && "Impossible state"); 1775 } 1776 1777 return unicode_result(unicode); 1778} 1779 1780PyObject * 1781PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1782{ 1783 if (size < 0) { 1784 PyErr_SetString(PyExc_SystemError, 1785 "Negative size passed to PyUnicode_FromStringAndSize"); 1786 return NULL; 1787 } 1788 if (u != NULL) 1789 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1790 else 1791 return (PyObject *)_PyUnicode_New(size); 1792} 1793 1794PyObject * 1795PyUnicode_FromString(const char *u) 1796{ 1797 size_t size = strlen(u); 1798 if (size > PY_SSIZE_T_MAX) { 1799 PyErr_SetString(PyExc_OverflowError, "input too long"); 1800 return NULL; 1801 } 1802 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1803} 1804 1805PyObject * 1806_PyUnicode_FromId(_Py_Identifier *id) 1807{ 1808 if (!id->object) { 1809 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1810 strlen(id->string), 1811 NULL, NULL); 1812 if (!id->object) 1813 return NULL; 1814 PyUnicode_InternInPlace(&id->object); 1815 assert(!id->next); 1816 id->next = static_strings; 1817 static_strings = id; 1818 } 1819 return id->object; 1820} 1821 1822void 1823_PyUnicode_ClearStaticStrings() 1824{ 1825 _Py_Identifier *i; 1826 for (i = static_strings; i; i = i->next) { 1827 Py_DECREF(i->object); 1828 i->object = NULL; 1829 i->next = NULL; 1830 } 1831} 1832 1833/* Internal function, doesn't check maximum character */ 1834 1835PyObject* 1836_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1837{ 1838 const unsigned char *s = (const unsigned char *)buffer; 1839 PyObject *unicode; 1840 if (size == 1) { 1841#ifdef Py_DEBUG 1842 assert(s[0] < 128); 1843#endif 1844 return get_latin1_char(s[0]); 1845 } 1846 unicode = PyUnicode_New(size, 127); 1847 if (!unicode) 
1848 return NULL; 1849 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1850 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1851 return unicode; 1852} 1853 1854static Py_UCS4 1855kind_maxchar_limit(unsigned int kind) 1856{ 1857 switch (kind) { 1858 case PyUnicode_1BYTE_KIND: 1859 return 0x80; 1860 case PyUnicode_2BYTE_KIND: 1861 return 0x100; 1862 case PyUnicode_4BYTE_KIND: 1863 return 0x10000; 1864 default: 1865 assert(0 && "invalid kind"); 1866 return MAX_UNICODE; 1867 } 1868} 1869 1870Py_LOCAL_INLINE(Py_UCS4) 1871align_maxchar(Py_UCS4 maxchar) 1872{ 1873 if (maxchar <= 127) 1874 return 127; 1875 else if (maxchar <= 255) 1876 return 255; 1877 else if (maxchar <= 65535) 1878 return 65535; 1879 else 1880 return MAX_UNICODE; 1881} 1882 1883static PyObject* 1884_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1885{ 1886 PyObject *res; 1887 unsigned char max_char; 1888 1889 if (size == 0) { 1890 Py_INCREF(unicode_empty); 1891 return unicode_empty; 1892 } 1893 assert(size > 0); 1894 if (size == 1) 1895 return get_latin1_char(u[0]); 1896 1897 max_char = ucs1lib_find_max_char(u, u + size); 1898 res = PyUnicode_New(size, max_char); 1899 if (!res) 1900 return NULL; 1901 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1902 assert(_PyUnicode_CheckConsistency(res, 1)); 1903 return res; 1904} 1905 1906static PyObject* 1907_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1908{ 1909 PyObject *res; 1910 Py_UCS2 max_char; 1911 1912 if (size == 0) { 1913 Py_INCREF(unicode_empty); 1914 return unicode_empty; 1915 } 1916 assert(size > 0); 1917 if (size == 1) { 1918 Py_UCS4 ch = u[0]; 1919 if (ch < 256) 1920 return get_latin1_char((unsigned char)ch); 1921 1922 res = PyUnicode_New(1, ch); 1923 if (res == NULL) 1924 return NULL; 1925 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1926 assert(_PyUnicode_CheckConsistency(res, 1)); 1927 return res; 1928 } 1929 1930 max_char = ucs2lib_find_max_char(u, u + size); 1931 res = PyUnicode_New(size, max_char); 1932 
if (!res) 1933 return NULL; 1934 if (max_char >= 256) 1935 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1936 else { 1937 _PyUnicode_CONVERT_BYTES( 1938 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1939 } 1940 assert(_PyUnicode_CheckConsistency(res, 1)); 1941 return res; 1942} 1943 1944static PyObject* 1945_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1946{ 1947 PyObject *res; 1948 Py_UCS4 max_char; 1949 1950 if (size == 0) { 1951 Py_INCREF(unicode_empty); 1952 return unicode_empty; 1953 } 1954 assert(size > 0); 1955 if (size == 1) { 1956 Py_UCS4 ch = u[0]; 1957 if (ch < 256) 1958 return get_latin1_char((unsigned char)ch); 1959 1960 res = PyUnicode_New(1, ch); 1961 if (res == NULL) 1962 return NULL; 1963 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1964 assert(_PyUnicode_CheckConsistency(res, 1)); 1965 return res; 1966 } 1967 1968 max_char = ucs4lib_find_max_char(u, u + size); 1969 res = PyUnicode_New(size, max_char); 1970 if (!res) 1971 return NULL; 1972 if (max_char < 256) 1973 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 1974 PyUnicode_1BYTE_DATA(res)); 1975 else if (max_char < 0x10000) 1976 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 1977 PyUnicode_2BYTE_DATA(res)); 1978 else 1979 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1980 assert(_PyUnicode_CheckConsistency(res, 1)); 1981 return res; 1982} 1983 1984PyObject* 1985PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1986{ 1987 if (size < 0) { 1988 PyErr_SetString(PyExc_ValueError, "size must be positive"); 1989 return NULL; 1990 } 1991 switch (kind) { 1992 case PyUnicode_1BYTE_KIND: 1993 return _PyUnicode_FromUCS1(buffer, size); 1994 case PyUnicode_2BYTE_KIND: 1995 return _PyUnicode_FromUCS2(buffer, size); 1996 case PyUnicode_4BYTE_KIND: 1997 return _PyUnicode_FromUCS4(buffer, size); 1998 default: 1999 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2000 return NULL; 2001 } 2002} 2003 
2004Py_UCS4 2005_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2006{ 2007 enum PyUnicode_Kind kind; 2008 void *startptr, *endptr; 2009 2010 assert(PyUnicode_IS_READY(unicode)); 2011 assert(0 <= start); 2012 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2013 assert(start <= end); 2014 2015 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2016 return PyUnicode_MAX_CHAR_VALUE(unicode); 2017 2018 if (start == end) 2019 return 127; 2020 2021 if (PyUnicode_IS_ASCII(unicode)) 2022 return 127; 2023 2024 kind = PyUnicode_KIND(unicode); 2025 startptr = PyUnicode_DATA(unicode); 2026 endptr = (char *)startptr + end * kind; 2027 startptr = (char *)startptr + start * kind; 2028 switch(kind) { 2029 case PyUnicode_1BYTE_KIND: 2030 return ucs1lib_find_max_char(startptr, endptr); 2031 case PyUnicode_2BYTE_KIND: 2032 return ucs2lib_find_max_char(startptr, endptr); 2033 case PyUnicode_4BYTE_KIND: 2034 return ucs4lib_find_max_char(startptr, endptr); 2035 default: 2036 assert(0); 2037 return 0; 2038 } 2039} 2040 2041/* Ensure that a string uses the most efficient storage, if it is not the 2042 case: create a new string with of the right kind. Write NULL into *p_unicode 2043 on error. 
*/ 2044static void 2045unicode_adjust_maxchar(PyObject **p_unicode) 2046{ 2047 PyObject *unicode, *copy; 2048 Py_UCS4 max_char; 2049 Py_ssize_t len; 2050 unsigned int kind; 2051 2052 assert(p_unicode != NULL); 2053 unicode = *p_unicode; 2054 assert(PyUnicode_IS_READY(unicode)); 2055 if (PyUnicode_IS_ASCII(unicode)) 2056 return; 2057 2058 len = PyUnicode_GET_LENGTH(unicode); 2059 kind = PyUnicode_KIND(unicode); 2060 if (kind == PyUnicode_1BYTE_KIND) { 2061 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2062 max_char = ucs1lib_find_max_char(u, u + len); 2063 if (max_char >= 128) 2064 return; 2065 } 2066 else if (kind == PyUnicode_2BYTE_KIND) { 2067 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2068 max_char = ucs2lib_find_max_char(u, u + len); 2069 if (max_char >= 256) 2070 return; 2071 } 2072 else { 2073 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2074 assert(kind == PyUnicode_4BYTE_KIND); 2075 max_char = ucs4lib_find_max_char(u, u + len); 2076 if (max_char >= 0x10000) 2077 return; 2078 } 2079 copy = PyUnicode_New(len, max_char); 2080 if (copy != NULL) 2081 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2082 Py_DECREF(unicode); 2083 *p_unicode = copy; 2084} 2085 2086PyObject* 2087_PyUnicode_Copy(PyObject *unicode) 2088{ 2089 Py_ssize_t length; 2090 PyObject *copy; 2091 2092 if (!PyUnicode_Check(unicode)) { 2093 PyErr_BadInternalCall(); 2094 return NULL; 2095 } 2096 if (PyUnicode_READY(unicode) == -1) 2097 return NULL; 2098 2099 length = PyUnicode_GET_LENGTH(unicode); 2100 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2101 if (!copy) 2102 return NULL; 2103 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2104 2105 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2106 length * PyUnicode_KIND(unicode)); 2107 assert(_PyUnicode_CheckConsistency(copy, 1)); 2108 return copy; 2109} 2110 2111 2112/* Widen Unicode objects to larger buffers. Don't write terminating null 2113 character. Return NULL on error. 
*/ 2114 2115void* 2116_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2117{ 2118 Py_ssize_t len; 2119 void *result; 2120 unsigned int skind; 2121 2122 if (PyUnicode_READY(s) == -1) 2123 return NULL; 2124 2125 len = PyUnicode_GET_LENGTH(s); 2126 skind = PyUnicode_KIND(s); 2127 if (skind >= kind) { 2128 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2129 return NULL; 2130 } 2131 switch (kind) { 2132 case PyUnicode_2BYTE_KIND: 2133 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 2134 if (!result) 2135 return PyErr_NoMemory(); 2136 assert(skind == PyUnicode_1BYTE_KIND); 2137 _PyUnicode_CONVERT_BYTES( 2138 Py_UCS1, Py_UCS2, 2139 PyUnicode_1BYTE_DATA(s), 2140 PyUnicode_1BYTE_DATA(s) + len, 2141 result); 2142 return result; 2143 case PyUnicode_4BYTE_KIND: 2144 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 2145 if (!result) 2146 return PyErr_NoMemory(); 2147 if (skind == PyUnicode_2BYTE_KIND) { 2148 _PyUnicode_CONVERT_BYTES( 2149 Py_UCS2, Py_UCS4, 2150 PyUnicode_2BYTE_DATA(s), 2151 PyUnicode_2BYTE_DATA(s) + len, 2152 result); 2153 } 2154 else { 2155 assert(skind == PyUnicode_1BYTE_KIND); 2156 _PyUnicode_CONVERT_BYTES( 2157 Py_UCS1, Py_UCS4, 2158 PyUnicode_1BYTE_DATA(s), 2159 PyUnicode_1BYTE_DATA(s) + len, 2160 result); 2161 } 2162 return result; 2163 default: 2164 break; 2165 } 2166 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2167 return NULL; 2168} 2169 2170static Py_UCS4* 2171as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2172 int copy_null) 2173{ 2174 int kind; 2175 void *data; 2176 Py_ssize_t len, targetlen; 2177 if (PyUnicode_READY(string) == -1) 2178 return NULL; 2179 kind = PyUnicode_KIND(string); 2180 data = PyUnicode_DATA(string); 2181 len = PyUnicode_GET_LENGTH(string); 2182 targetlen = len; 2183 if (copy_null) 2184 targetlen++; 2185 if (!target) { 2186 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 2187 PyErr_NoMemory(); 2188 return NULL; 2189 } 2190 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 2191 if 
(!target) { 2192 PyErr_NoMemory(); 2193 return NULL; 2194 } 2195 } 2196 else { 2197 if (targetsize < targetlen) { 2198 PyErr_Format(PyExc_SystemError, 2199 "string is longer than the buffer"); 2200 if (copy_null && 0 < targetsize) 2201 target[0] = 0; 2202 return NULL; 2203 } 2204 } 2205 if (kind == PyUnicode_1BYTE_KIND) { 2206 Py_UCS1 *start = (Py_UCS1 *) data; 2207 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2208 } 2209 else if (kind == PyUnicode_2BYTE_KIND) { 2210 Py_UCS2 *start = (Py_UCS2 *) data; 2211 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2212 } 2213 else { 2214 assert(kind == PyUnicode_4BYTE_KIND); 2215 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2216 } 2217 if (copy_null) 2218 target[len] = 0; 2219 return target; 2220} 2221 2222Py_UCS4* 2223PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2224 int copy_null) 2225{ 2226 if (target == NULL || targetsize < 0) { 2227 PyErr_BadInternalCall(); 2228 return NULL; 2229 } 2230 return as_ucs4(string, target, targetsize, copy_null); 2231} 2232 2233Py_UCS4* 2234PyUnicode_AsUCS4Copy(PyObject *string) 2235{ 2236 return as_ucs4(string, NULL, 0, 1); 2237} 2238 2239#ifdef HAVE_WCHAR_H 2240 2241PyObject * 2242PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2243{ 2244 if (w == NULL) { 2245 if (size == 0) { 2246 Py_INCREF(unicode_empty); 2247 return unicode_empty; 2248 } 2249 PyErr_BadInternalCall(); 2250 return NULL; 2251 } 2252 2253 if (size == -1) { 2254 size = wcslen(w); 2255 } 2256 2257 return PyUnicode_FromUnicode(w, size); 2258} 2259 2260#endif /* HAVE_WCHAR_H */ 2261 2262static void 2263makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2264 char c) 2265{ 2266 *fmt++ = '%'; 2267 if (longflag) 2268 *fmt++ = 'l'; 2269 else if (longlongflag) { 2270 /* longlongflag should only ever be nonzero on machines with 2271 HAVE_LONG_LONG defined */ 2272#ifdef HAVE_LONG_LONG 2273 char *f = PY_FORMAT_LONG_LONG; 
2274 while (*f) 2275 *fmt++ = *f++; 2276#else 2277 /* we shouldn't ever get here */ 2278 assert(0); 2279 *fmt++ = 'l'; 2280#endif 2281 } 2282 else if (size_tflag) { 2283 char *f = PY_FORMAT_SIZE_T; 2284 while (*f) 2285 *fmt++ = *f++; 2286 } 2287 *fmt++ = c; 2288 *fmt = '\0'; 2289} 2290 2291/* maximum number of characters required for output of %lld or %p. 2292 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2293 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2294#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2295 2296static const char* 2297unicode_fromformat_arg(_PyUnicodeWriter *writer, 2298 const char *f, va_list *vargs) 2299{ 2300 const char *p; 2301 Py_ssize_t len; 2302 int zeropad; 2303 int width; 2304 int precision; 2305 int longflag; 2306 int longlongflag; 2307 int size_tflag; 2308 int fill; 2309 2310 p = f; 2311 f++; 2312 zeropad = 0; 2313 if (*f == '0') { 2314 zeropad = 1; 2315 f++; 2316 } 2317 2318 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2319 width = 0; 2320 while (Py_ISDIGIT((unsigned)*f)) { 2321 if (width > (INT_MAX - ((int)*f - '0')) / 10) { 2322 PyErr_SetString(PyExc_ValueError, 2323 "width too big"); 2324 return NULL; 2325 } 2326 width = (width*10) + (*f - '0'); 2327 f++; 2328 } 2329 precision = 0; 2330 if (*f == '.') { 2331 f++; 2332 while (Py_ISDIGIT((unsigned)*f)) { 2333 if (precision > (INT_MAX - ((int)*f - '0')) / 10) { 2334 PyErr_SetString(PyExc_ValueError, 2335 "precision too big"); 2336 return NULL; 2337 } 2338 precision = (precision*10) + (*f - '0'); 2339 f++; 2340 } 2341 if (*f == '%') { 2342 /* "%.3%s" => f points to "3" */ 2343 f--; 2344 } 2345 } 2346 if (*f == '\0') { 2347 /* bogus format "%.123" => go backward, f points to "3" */ 2348 f--; 2349 } 2350 2351 /* Handle %ld, %lu, %lld and %llu. 
*/ 2352 longflag = 0; 2353 longlongflag = 0; 2354 size_tflag = 0; 2355 if (*f == 'l') { 2356 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2357 longflag = 1; 2358 ++f; 2359 } 2360#ifdef HAVE_LONG_LONG 2361 else if (f[1] == 'l' && 2362 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2363 longlongflag = 1; 2364 f += 2; 2365 } 2366#endif 2367 } 2368 /* handle the size_t flag. */ 2369 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2370 size_tflag = 1; 2371 ++f; 2372 } 2373 2374 if (f[1] == '\0') 2375 writer->overallocate = 0; 2376 2377 switch (*f) { 2378 case 'c': 2379 { 2380 int ordinal = va_arg(*vargs, int); 2381 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2382 PyErr_SetString(PyExc_ValueError, 2383 "character argument not in range(0x110000)"); 2384 return NULL; 2385 } 2386 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1) 2387 return NULL; 2388 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal); 2389 writer->pos++; 2390 break; 2391 } 2392 2393 case 'i': 2394 case 'd': 2395 case 'u': 2396 case 'x': 2397 { 2398 /* used by sprintf */ 2399 char fmt[10]; /* should be enough for "%0lld\0" */ 2400 char buffer[MAX_LONG_LONG_CHARS]; 2401 2402 if (*f == 'u') { 2403 makefmt(fmt, longflag, longlongflag, size_tflag, *f); 2404 2405 if (longflag) 2406 len = sprintf(buffer, fmt, 2407 va_arg(*vargs, unsigned long)); 2408#ifdef HAVE_LONG_LONG 2409 else if (longlongflag) 2410 len = sprintf(buffer, fmt, 2411 va_arg(*vargs, unsigned PY_LONG_LONG)); 2412#endif 2413 else if (size_tflag) 2414 len = sprintf(buffer, fmt, 2415 va_arg(*vargs, size_t)); 2416 else 2417 len = sprintf(buffer, fmt, 2418 va_arg(*vargs, unsigned int)); 2419 } 2420 else if (*f == 'x') { 2421 makefmt(fmt, 0, 0, 0, 'x'); 2422 len = sprintf(buffer, fmt, va_arg(*vargs, int)); 2423 } 2424 else { 2425 makefmt(fmt, longflag, longlongflag, size_tflag, *f); 2426 2427 if (longflag) 2428 len = sprintf(buffer, fmt, 2429 va_arg(*vargs, long)); 2430#ifdef HAVE_LONG_LONG 2431 else if 
(longlongflag) 2432 len = sprintf(buffer, fmt, 2433 va_arg(*vargs, PY_LONG_LONG)); 2434#endif 2435 else if (size_tflag) 2436 len = sprintf(buffer, fmt, 2437 va_arg(*vargs, Py_ssize_t)); 2438 else 2439 len = sprintf(buffer, fmt, 2440 va_arg(*vargs, int)); 2441 } 2442 assert(len >= 0); 2443 2444 if (precision < len) 2445 precision = len; 2446 if (width > precision) { 2447 Py_UCS4 fillchar; 2448 fill = width - precision; 2449 fillchar = zeropad?'0':' '; 2450 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1) 2451 return NULL; 2452 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) 2453 return NULL; 2454 writer->pos += fill; 2455 } 2456 if (precision > len) { 2457 fill = precision - len; 2458 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1) 2459 return NULL; 2460 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) 2461 return NULL; 2462 writer->pos += fill; 2463 } 2464 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1) 2465 return NULL; 2466 break; 2467 } 2468 2469 case 'p': 2470 { 2471 char number[MAX_LONG_LONG_CHARS]; 2472 2473 len = sprintf(number, "%p", va_arg(*vargs, void*)); 2474 assert(len >= 0); 2475 2476 /* %p is ill-defined: ensure leading 0x. 
*/ 2477 if (number[1] == 'X') 2478 number[1] = 'x'; 2479 else if (number[1] != 'x') { 2480 memmove(number + 2, number, 2481 strlen(number) + 1); 2482 number[0] = '0'; 2483 number[1] = 'x'; 2484 len += 2; 2485 } 2486 2487 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1) 2488 return NULL; 2489 break; 2490 } 2491 2492 case 's': 2493 { 2494 /* UTF-8 */ 2495 const char *s = va_arg(*vargs, const char*); 2496 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); 2497 if (!str) 2498 return NULL; 2499 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { 2500 Py_DECREF(str); 2501 return NULL; 2502 } 2503 Py_DECREF(str); 2504 break; 2505 } 2506 2507 case 'U': 2508 { 2509 PyObject *obj = va_arg(*vargs, PyObject *); 2510 assert(obj && _PyUnicode_CHECK(obj)); 2511 2512 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) 2513 return NULL; 2514 break; 2515 } 2516 2517 case 'V': 2518 { 2519 PyObject *obj = va_arg(*vargs, PyObject *); 2520 const char *str = va_arg(*vargs, const char *); 2521 PyObject *str_obj; 2522 assert(obj || str); 2523 if (obj) { 2524 assert(_PyUnicode_CHECK(obj)); 2525 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) 2526 return NULL; 2527 } 2528 else { 2529 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); 2530 if (!str_obj) 2531 return NULL; 2532 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) { 2533 Py_DECREF(str_obj); 2534 return NULL; 2535 } 2536 Py_DECREF(str_obj); 2537 } 2538 break; 2539 } 2540 2541 case 'S': 2542 { 2543 PyObject *obj = va_arg(*vargs, PyObject *); 2544 PyObject *str; 2545 assert(obj); 2546 str = PyObject_Str(obj); 2547 if (!str) 2548 return NULL; 2549 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { 2550 Py_DECREF(str); 2551 return NULL; 2552 } 2553 Py_DECREF(str); 2554 break; 2555 } 2556 2557 case 'R': 2558 { 2559 PyObject *obj = va_arg(*vargs, PyObject *); 2560 PyObject *repr; 2561 assert(obj); 2562 repr = PyObject_Repr(obj); 2563 if (!repr) 2564 return NULL; 2565 if 
(_PyUnicodeWriter_WriteStr(writer, repr) == -1) { 2566 Py_DECREF(repr); 2567 return NULL; 2568 } 2569 Py_DECREF(repr); 2570 break; 2571 } 2572 2573 case 'A': 2574 { 2575 PyObject *obj = va_arg(*vargs, PyObject *); 2576 PyObject *ascii; 2577 assert(obj); 2578 ascii = PyObject_ASCII(obj); 2579 if (!ascii) 2580 return NULL; 2581 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) { 2582 Py_DECREF(ascii); 2583 return NULL; 2584 } 2585 Py_DECREF(ascii); 2586 break; 2587 } 2588 2589 case '%': 2590 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1) 2591 return NULL; 2592 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%'); 2593 writer->pos++; 2594 break; 2595 2596 default: 2597 /* if we stumble upon an unknown formatting code, copy the rest 2598 of the format string to the output string. (we cannot just 2599 skip the code, since there's no way to know what's in the 2600 argument list) */ 2601 len = strlen(p); 2602 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1) 2603 return NULL; 2604 f = p+len; 2605 return f; 2606 } 2607 2608 f++; 2609 return f; 2610} 2611 2612PyObject * 2613PyUnicode_FromFormatV(const char *format, va_list vargs) 2614{ 2615 va_list vargs2; 2616 const char *f; 2617 _PyUnicodeWriter writer; 2618 2619 _PyUnicodeWriter_Init(&writer, strlen(format) + 100); 2620 2621 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). 2622 Copy it to be able to pass a reference to a subfunction. 
*/ 2623 Py_VA_COPY(vargs2, vargs); 2624 2625 for (f = format; *f; ) { 2626 if (*f == '%') { 2627 f = unicode_fromformat_arg(&writer, f, &vargs2); 2628 if (f == NULL) 2629 goto fail; 2630 } 2631 else { 2632 const char *p; 2633 Py_ssize_t len; 2634 2635 p = f; 2636 do 2637 { 2638 if ((unsigned char)*p > 127) { 2639 PyErr_Format(PyExc_ValueError, 2640 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2641 "string, got a non-ASCII byte: 0x%02x", 2642 (unsigned char)*p); 2643 return NULL; 2644 } 2645 p++; 2646 } 2647 while (*p != '\0' && *p != '%'); 2648 len = p - f; 2649 2650 if (*p == '\0') 2651 writer.overallocate = 0; 2652 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) 2653 goto fail; 2654 unicode_write_cstr(writer.buffer, writer.pos, f, len); 2655 writer.pos += len; 2656 2657 f = p; 2658 } 2659 } 2660 return _PyUnicodeWriter_Finish(&writer); 2661 2662 fail: 2663 _PyUnicodeWriter_Dealloc(&writer); 2664 return NULL; 2665} 2666 2667PyObject * 2668PyUnicode_FromFormat(const char *format, ...) 2669{ 2670 PyObject* ret; 2671 va_list vargs; 2672 2673#ifdef HAVE_STDARG_PROTOTYPES 2674 va_start(vargs, format); 2675#else 2676 va_start(vargs); 2677#endif 2678 ret = PyUnicode_FromFormatV(format, vargs); 2679 va_end(vargs); 2680 return ret; 2681} 2682 2683#ifdef HAVE_WCHAR_H 2684 2685/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2686 convert a Unicode object to a wide character string. 2687 2688 - If w is NULL: return the number of wide characters (including the null 2689 character) required to convert the unicode object. Ignore size argument. 2690 2691 - Otherwise: return the number of wide characters (excluding the null 2692 character) written into w. Write at most size wide characters (including 2693 the null character). 
*/ 2694static Py_ssize_t 2695unicode_aswidechar(PyObject *unicode, 2696 wchar_t *w, 2697 Py_ssize_t size) 2698{ 2699 Py_ssize_t res; 2700 const wchar_t *wstr; 2701 2702 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2703 if (wstr == NULL) 2704 return -1; 2705 2706 if (w != NULL) { 2707 if (size > res) 2708 size = res + 1; 2709 else 2710 res = size; 2711 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2712 return res; 2713 } 2714 else 2715 return res + 1; 2716} 2717 2718Py_ssize_t 2719PyUnicode_AsWideChar(PyObject *unicode, 2720 wchar_t *w, 2721 Py_ssize_t size) 2722{ 2723 if (unicode == NULL) { 2724 PyErr_BadInternalCall(); 2725 return -1; 2726 } 2727 return unicode_aswidechar(unicode, w, size); 2728} 2729 2730wchar_t* 2731PyUnicode_AsWideCharString(PyObject *unicode, 2732 Py_ssize_t *size) 2733{ 2734 wchar_t* buffer; 2735 Py_ssize_t buflen; 2736 2737 if (unicode == NULL) { 2738 PyErr_BadInternalCall(); 2739 return NULL; 2740 } 2741 2742 buflen = unicode_aswidechar(unicode, NULL, 0); 2743 if (buflen == -1) 2744 return NULL; 2745 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2746 PyErr_NoMemory(); 2747 return NULL; 2748 } 2749 2750 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2751 if (buffer == NULL) { 2752 PyErr_NoMemory(); 2753 return NULL; 2754 } 2755 buflen = unicode_aswidechar(unicode, buffer, buflen); 2756 if (buflen == -1) { 2757 PyMem_FREE(buffer); 2758 return NULL; 2759 } 2760 if (size != NULL) 2761 *size = buflen; 2762 return buffer; 2763} 2764 2765#endif /* HAVE_WCHAR_H */ 2766 2767PyObject * 2768PyUnicode_FromOrdinal(int ordinal) 2769{ 2770 PyObject *v; 2771 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2772 PyErr_SetString(PyExc_ValueError, 2773 "chr() arg not in range(0x110000)"); 2774 return NULL; 2775 } 2776 2777 if (ordinal < 256) 2778 return get_latin1_char(ordinal); 2779 2780 v = PyUnicode_New(1, ordinal); 2781 if (v == NULL) 2782 return NULL; 2783 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2784 
assert(_PyUnicode_CheckConsistency(v, 1)); 2785 return v; 2786} 2787 2788PyObject * 2789PyUnicode_FromObject(register PyObject *obj) 2790{ 2791 /* XXX Perhaps we should make this API an alias of 2792 PyObject_Str() instead ?! */ 2793 if (PyUnicode_CheckExact(obj)) { 2794 if (PyUnicode_READY(obj) == -1) 2795 return NULL; 2796 Py_INCREF(obj); 2797 return obj; 2798 } 2799 if (PyUnicode_Check(obj)) { 2800 /* For a Unicode subtype that's not a Unicode object, 2801 return a true Unicode object with the same data. */ 2802 return _PyUnicode_Copy(obj); 2803 } 2804 PyErr_Format(PyExc_TypeError, 2805 "Can't convert '%.100s' object to str implicitly", 2806 Py_TYPE(obj)->tp_name); 2807 return NULL; 2808} 2809 2810PyObject * 2811PyUnicode_FromEncodedObject(register PyObject *obj, 2812 const char *encoding, 2813 const char *errors) 2814{ 2815 Py_buffer buffer; 2816 PyObject *v; 2817 2818 if (obj == NULL) { 2819 PyErr_BadInternalCall(); 2820 return NULL; 2821 } 2822 2823 /* Decoding bytes objects is the most common case and should be fast */ 2824 if (PyBytes_Check(obj)) { 2825 if (PyBytes_GET_SIZE(obj) == 0) { 2826 Py_INCREF(unicode_empty); 2827 v = unicode_empty; 2828 } 2829 else { 2830 v = PyUnicode_Decode( 2831 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2832 encoding, errors); 2833 } 2834 return v; 2835 } 2836 2837 if (PyUnicode_Check(obj)) { 2838 PyErr_SetString(PyExc_TypeError, 2839 "decoding str is not supported"); 2840 return NULL; 2841 } 2842 2843 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2844 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2845 PyErr_Format(PyExc_TypeError, 2846 "coercing to str: need bytes, bytearray " 2847 "or buffer-like object, %.80s found", 2848 Py_TYPE(obj)->tp_name); 2849 return NULL; 2850 } 2851 2852 if (buffer.len == 0) { 2853 Py_INCREF(unicode_empty); 2854 v = unicode_empty; 2855 } 2856 else 2857 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2858 2859 
PyBuffer_Release(&buffer); 2860 return v; 2861} 2862 2863/* Convert encoding to lower case and replace '_' with '-' in order to 2864 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2865 1 on success. */ 2866static int 2867normalize_encoding(const char *encoding, 2868 char *lower, 2869 size_t lower_len) 2870{ 2871 const char *e; 2872 char *l; 2873 char *l_end; 2874 2875 if (encoding == NULL) { 2876 strcpy(lower, "utf-8"); 2877 return 1; 2878 } 2879 e = encoding; 2880 l = lower; 2881 l_end = &lower[lower_len - 1]; 2882 while (*e) { 2883 if (l == l_end) 2884 return 0; 2885 if (Py_ISUPPER(*e)) { 2886 *l++ = Py_TOLOWER(*e++); 2887 } 2888 else if (*e == '_') { 2889 *l++ = '-'; 2890 e++; 2891 } 2892 else { 2893 *l++ = *e++; 2894 } 2895 } 2896 *l = '\0'; 2897 return 1; 2898} 2899 2900PyObject * 2901PyUnicode_Decode(const char *s, 2902 Py_ssize_t size, 2903 const char *encoding, 2904 const char *errors) 2905{ 2906 PyObject *buffer = NULL, *unicode; 2907 Py_buffer info; 2908 char lower[11]; /* Enough for any encoding shortcut */ 2909 2910 /* Shortcuts for common default encodings */ 2911 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2912 if ((strcmp(lower, "utf-8") == 0) || 2913 (strcmp(lower, "utf8") == 0)) 2914 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 2915 else if ((strcmp(lower, "latin-1") == 0) || 2916 (strcmp(lower, "latin1") == 0) || 2917 (strcmp(lower, "iso-8859-1") == 0)) 2918 return PyUnicode_DecodeLatin1(s, size, errors); 2919#ifdef HAVE_MBCS 2920 else if (strcmp(lower, "mbcs") == 0) 2921 return PyUnicode_DecodeMBCS(s, size, errors); 2922#endif 2923 else if (strcmp(lower, "ascii") == 0) 2924 return PyUnicode_DecodeASCII(s, size, errors); 2925 else if (strcmp(lower, "utf-16") == 0) 2926 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2927 else if (strcmp(lower, "utf-32") == 0) 2928 return PyUnicode_DecodeUTF32(s, size, errors, 0); 2929 } 2930 2931 /* Decode via the codec registry */ 2932 buffer = NULL; 2933 
if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2934 goto onError; 2935 buffer = PyMemoryView_FromBuffer(&info); 2936 if (buffer == NULL) 2937 goto onError; 2938 unicode = PyCodec_Decode(buffer, encoding, errors); 2939 if (unicode == NULL) 2940 goto onError; 2941 if (!PyUnicode_Check(unicode)) { 2942 PyErr_Format(PyExc_TypeError, 2943 "decoder did not return a str object (type=%.400s)", 2944 Py_TYPE(unicode)->tp_name); 2945 Py_DECREF(unicode); 2946 goto onError; 2947 } 2948 Py_DECREF(buffer); 2949 return unicode_result(unicode); 2950 2951 onError: 2952 Py_XDECREF(buffer); 2953 return NULL; 2954} 2955 2956PyObject * 2957PyUnicode_AsDecodedObject(PyObject *unicode, 2958 const char *encoding, 2959 const char *errors) 2960{ 2961 PyObject *v; 2962 2963 if (!PyUnicode_Check(unicode)) { 2964 PyErr_BadArgument(); 2965 goto onError; 2966 } 2967 2968 if (encoding == NULL) 2969 encoding = PyUnicode_GetDefaultEncoding(); 2970 2971 /* Decode via the codec registry */ 2972 v = PyCodec_Decode(unicode, encoding, errors); 2973 if (v == NULL) 2974 goto onError; 2975 return unicode_result(v); 2976 2977 onError: 2978 return NULL; 2979} 2980 2981PyObject * 2982PyUnicode_AsDecodedUnicode(PyObject *unicode, 2983 const char *encoding, 2984 const char *errors) 2985{ 2986 PyObject *v; 2987 2988 if (!PyUnicode_Check(unicode)) { 2989 PyErr_BadArgument(); 2990 goto onError; 2991 } 2992 2993 if (encoding == NULL) 2994 encoding = PyUnicode_GetDefaultEncoding(); 2995 2996 /* Decode via the codec registry */ 2997 v = PyCodec_Decode(unicode, encoding, errors); 2998 if (v == NULL) 2999 goto onError; 3000 if (!PyUnicode_Check(v)) { 3001 PyErr_Format(PyExc_TypeError, 3002 "decoder did not return a str object (type=%.400s)", 3003 Py_TYPE(v)->tp_name); 3004 Py_DECREF(v); 3005 goto onError; 3006 } 3007 return unicode_result(v); 3008 3009 onError: 3010 return NULL; 3011} 3012 3013PyObject * 3014PyUnicode_Encode(const Py_UNICODE *s, 3015 Py_ssize_t size, 3016 const char 
*encoding, 3017 const char *errors) 3018{ 3019 PyObject *v, *unicode; 3020 3021 unicode = PyUnicode_FromUnicode(s, size); 3022 if (unicode == NULL) 3023 return NULL; 3024 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3025 Py_DECREF(unicode); 3026 return v; 3027} 3028 3029PyObject * 3030PyUnicode_AsEncodedObject(PyObject *unicode, 3031 const char *encoding, 3032 const char *errors) 3033{ 3034 PyObject *v; 3035 3036 if (!PyUnicode_Check(unicode)) { 3037 PyErr_BadArgument(); 3038 goto onError; 3039 } 3040 3041 if (encoding == NULL) 3042 encoding = PyUnicode_GetDefaultEncoding(); 3043 3044 /* Encode via the codec registry */ 3045 v = PyCodec_Encode(unicode, encoding, errors); 3046 if (v == NULL) 3047 goto onError; 3048 return v; 3049 3050 onError: 3051 return NULL; 3052} 3053 3054static size_t 3055wcstombs_errorpos(const wchar_t *wstr) 3056{ 3057 size_t len; 3058#if SIZEOF_WCHAR_T == 2 3059 wchar_t buf[3]; 3060#else 3061 wchar_t buf[2]; 3062#endif 3063 char outbuf[MB_LEN_MAX]; 3064 const wchar_t *start, *previous; 3065 3066#if SIZEOF_WCHAR_T == 2 3067 buf[2] = 0; 3068#else 3069 buf[1] = 0; 3070#endif 3071 start = wstr; 3072 while (*wstr != L'\0') 3073 { 3074 previous = wstr; 3075#if SIZEOF_WCHAR_T == 2 3076 if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) 3077 && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) 3078 { 3079 buf[0] = wstr[0]; 3080 buf[1] = wstr[1]; 3081 wstr += 2; 3082 } 3083 else { 3084 buf[0] = *wstr; 3085 buf[1] = 0; 3086 wstr++; 3087 } 3088#else 3089 buf[0] = *wstr; 3090 wstr++; 3091#endif 3092 len = wcstombs(outbuf, buf, sizeof(outbuf)); 3093 if (len == (size_t)-1) 3094 return previous - start; 3095 } 3096 3097 /* failed to find the unencodable character */ 3098 return 0; 3099} 3100 3101static int 3102locale_error_handler(const char *errors, int *surrogateescape) 3103{ 3104 if (errors == NULL) { 3105 *surrogateescape = 0; 3106 return 0; 3107 } 3108 3109 if (strcmp(errors, "strict") == 0) { 3110 *surrogateescape = 0; 3111 return 0; 3112 } 3113 if 
(strcmp(errors, "surrogateescape") == 0) { 3114 *surrogateescape = 1; 3115 return 0; 3116 } 3117 PyErr_Format(PyExc_ValueError, 3118 "only 'strict' and 'surrogateescape' error handlers " 3119 "are supported, not '%s'", 3120 errors); 3121 return -1; 3122} 3123 3124PyObject * 3125PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3126{ 3127 Py_ssize_t wlen, wlen2; 3128 wchar_t *wstr; 3129 PyObject *bytes = NULL; 3130 char *errmsg; 3131 PyObject *reason; 3132 PyObject *exc; 3133 size_t error_pos; 3134 int surrogateescape; 3135 3136 if (locale_error_handler(errors, &surrogateescape) < 0) 3137 return NULL; 3138 3139 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3140 if (wstr == NULL) 3141 return NULL; 3142 3143 wlen2 = wcslen(wstr); 3144 if (wlen2 != wlen) { 3145 PyMem_Free(wstr); 3146 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3147 return NULL; 3148 } 3149 3150 if (surrogateescape) { 3151 /* locale encoding with surrogateescape */ 3152 char *str; 3153 3154 str = _Py_wchar2char(wstr, &error_pos); 3155 if (str == NULL) { 3156 if (error_pos == (size_t)-1) { 3157 PyErr_NoMemory(); 3158 PyMem_Free(wstr); 3159 return NULL; 3160 } 3161 else { 3162 goto encode_error; 3163 } 3164 } 3165 PyMem_Free(wstr); 3166 3167 bytes = PyBytes_FromString(str); 3168 PyMem_Free(str); 3169 } 3170 else { 3171 size_t len, len2; 3172 3173 len = wcstombs(NULL, wstr, 0); 3174 if (len == (size_t)-1) { 3175 error_pos = (size_t)-1; 3176 goto encode_error; 3177 } 3178 3179 bytes = PyBytes_FromStringAndSize(NULL, len); 3180 if (bytes == NULL) { 3181 PyMem_Free(wstr); 3182 return NULL; 3183 } 3184 3185 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3186 if (len2 == (size_t)-1 || len2 > len) { 3187 error_pos = (size_t)-1; 3188 goto encode_error; 3189 } 3190 PyMem_Free(wstr); 3191 } 3192 return bytes; 3193 3194encode_error: 3195 errmsg = strerror(errno); 3196 assert(errmsg != NULL); 3197 3198 if (error_pos == (size_t)-1) 3199 error_pos = wcstombs_errorpos(wstr); 
3200 3201 PyMem_Free(wstr); 3202 Py_XDECREF(bytes); 3203 3204 if (errmsg != NULL) { 3205 size_t errlen; 3206 wstr = _Py_char2wchar(errmsg, &errlen); 3207 if (wstr != NULL) { 3208 reason = PyUnicode_FromWideChar(wstr, errlen); 3209 PyMem_Free(wstr); 3210 } else 3211 errmsg = NULL; 3212 } 3213 if (errmsg == NULL) 3214 reason = PyUnicode_FromString( 3215 "wcstombs() encountered an unencodable " 3216 "wide character"); 3217 if (reason == NULL) 3218 return NULL; 3219 3220 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3221 "locale", unicode, 3222 (Py_ssize_t)error_pos, 3223 (Py_ssize_t)(error_pos+1), 3224 reason); 3225 Py_DECREF(reason); 3226 if (exc != NULL) { 3227 PyCodec_StrictErrors(exc); 3228 Py_XDECREF(exc); 3229 } 3230 return NULL; 3231} 3232 3233PyObject * 3234PyUnicode_EncodeFSDefault(PyObject *unicode) 3235{ 3236#ifdef HAVE_MBCS 3237 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 3238#elif defined(__APPLE__) 3239 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3240#else 3241 PyInterpreterState *interp = PyThreadState_GET()->interp; 3242 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3243 cannot use it to encode and decode filenames before it is loaded. Load 3244 the Python codec requires to encode at least its own filename. Use the C 3245 version of the locale codec until the codec registry is initialized and 3246 the Python codec is loaded. 3247 3248 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3249 cannot only rely on it: check also interp->fscodec_initialized for 3250 subinterpreters. 
*/ 3251 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3252 return PyUnicode_AsEncodedString(unicode, 3253 Py_FileSystemDefaultEncoding, 3254 "surrogateescape"); 3255 } 3256 else { 3257 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3258 } 3259#endif 3260} 3261 3262PyObject * 3263PyUnicode_AsEncodedString(PyObject *unicode, 3264 const char *encoding, 3265 const char *errors) 3266{ 3267 PyObject *v; 3268 char lower[11]; /* Enough for any encoding shortcut */ 3269 3270 if (!PyUnicode_Check(unicode)) { 3271 PyErr_BadArgument(); 3272 return NULL; 3273 } 3274 3275 /* Shortcuts for common default encodings */ 3276 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3277 if ((strcmp(lower, "utf-8") == 0) || 3278 (strcmp(lower, "utf8") == 0)) 3279 { 3280 if (errors == NULL || strcmp(errors, "strict") == 0) 3281 return _PyUnicode_AsUTF8String(unicode, NULL); 3282 else 3283 return _PyUnicode_AsUTF8String(unicode, errors); 3284 } 3285 else if ((strcmp(lower, "latin-1") == 0) || 3286 (strcmp(lower, "latin1") == 0) || 3287 (strcmp(lower, "iso-8859-1") == 0)) 3288 return _PyUnicode_AsLatin1String(unicode, errors); 3289#ifdef HAVE_MBCS 3290 else if (strcmp(lower, "mbcs") == 0) 3291 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3292#endif 3293 else if (strcmp(lower, "ascii") == 0) 3294 return _PyUnicode_AsASCIIString(unicode, errors); 3295 } 3296 3297 /* Encode via the codec registry */ 3298 v = PyCodec_Encode(unicode, encoding, errors); 3299 if (v == NULL) 3300 return NULL; 3301 3302 /* The normal path */ 3303 if (PyBytes_Check(v)) 3304 return v; 3305 3306 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3307 if (PyByteArray_Check(v)) { 3308 int error; 3309 PyObject *b; 3310 3311 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3312 "encoder %s returned bytearray instead of bytes", 3313 encoding); 3314 if (error) { 3315 Py_DECREF(v); 3316 return NULL; 3317 } 3318 3319 b = 
PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3320 Py_DECREF(v); 3321 return b; 3322 } 3323 3324 PyErr_Format(PyExc_TypeError, 3325 "encoder did not return a bytes object (type=%.400s)", 3326 Py_TYPE(v)->tp_name); 3327 Py_DECREF(v); 3328 return NULL; 3329} 3330 3331PyObject * 3332PyUnicode_AsEncodedUnicode(PyObject *unicode, 3333 const char *encoding, 3334 const char *errors) 3335{ 3336 PyObject *v; 3337 3338 if (!PyUnicode_Check(unicode)) { 3339 PyErr_BadArgument(); 3340 goto onError; 3341 } 3342 3343 if (encoding == NULL) 3344 encoding = PyUnicode_GetDefaultEncoding(); 3345 3346 /* Encode via the codec registry */ 3347 v = PyCodec_Encode(unicode, encoding, errors); 3348 if (v == NULL) 3349 goto onError; 3350 if (!PyUnicode_Check(v)) { 3351 PyErr_Format(PyExc_TypeError, 3352 "encoder did not return an str object (type=%.400s)", 3353 Py_TYPE(v)->tp_name); 3354 Py_DECREF(v); 3355 goto onError; 3356 } 3357 return v; 3358 3359 onError: 3360 return NULL; 3361} 3362 3363static size_t 3364mbstowcs_errorpos(const char *str, size_t len) 3365{ 3366#ifdef HAVE_MBRTOWC 3367 const char *start = str; 3368 mbstate_t mbs; 3369 size_t converted; 3370 wchar_t ch; 3371 3372 memset(&mbs, 0, sizeof mbs); 3373 while (len) 3374 { 3375 converted = mbrtowc(&ch, (char*)str, len, &mbs); 3376 if (converted == 0) 3377 /* Reached end of string */ 3378 break; 3379 if (converted == (size_t)-1 || converted == (size_t)-2) { 3380 /* Conversion error or incomplete character */ 3381 return str - start; 3382 } 3383 else { 3384 str += converted; 3385 len -= converted; 3386 } 3387 } 3388 /* failed to find the undecodable byte sequence */ 3389 return 0; 3390#endif 3391 return 0; 3392} 3393 3394PyObject* 3395PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3396 const char *errors) 3397{ 3398 wchar_t smallbuf[256]; 3399 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3400 wchar_t *wstr; 3401 size_t wlen, wlen2; 3402 PyObject *unicode; 3403 int surrogateescape; 3404 size_t 
error_pos; 3405 char *errmsg; 3406 PyObject *reason, *exc; 3407 3408 if (locale_error_handler(errors, &surrogateescape) < 0) 3409 return NULL; 3410 3411 if (str[len] != '\0' || len != strlen(str)) { 3412 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3413 return NULL; 3414 } 3415 3416 if (surrogateescape) 3417 { 3418 wstr = _Py_char2wchar(str, &wlen); 3419 if (wstr == NULL) { 3420 if (wlen == (size_t)-1) 3421 PyErr_NoMemory(); 3422 else 3423 PyErr_SetFromErrno(PyExc_OSError); 3424 return NULL; 3425 } 3426 3427 unicode = PyUnicode_FromWideChar(wstr, wlen); 3428 PyMem_Free(wstr); 3429 } 3430 else { 3431#ifndef HAVE_BROKEN_MBSTOWCS 3432 wlen = mbstowcs(NULL, str, 0); 3433#else 3434 wlen = len; 3435#endif 3436 if (wlen == (size_t)-1) 3437 goto decode_error; 3438 if (wlen+1 <= smallbuf_len) { 3439 wstr = smallbuf; 3440 } 3441 else { 3442 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) 3443 return PyErr_NoMemory(); 3444 3445 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); 3446 if (!wstr) 3447 return PyErr_NoMemory(); 3448 } 3449 3450 /* This shouldn't fail now */ 3451 wlen2 = mbstowcs(wstr, str, wlen+1); 3452 if (wlen2 == (size_t)-1) { 3453 if (wstr != smallbuf) 3454 PyMem_Free(wstr); 3455 goto decode_error; 3456 } 3457#ifdef HAVE_BROKEN_MBSTOWCS 3458 assert(wlen2 == wlen); 3459#endif 3460 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3461 if (wstr != smallbuf) 3462 PyMem_Free(wstr); 3463 } 3464 return unicode; 3465 3466decode_error: 3467 errmsg = strerror(errno); 3468 assert(errmsg != NULL); 3469 3470 error_pos = mbstowcs_errorpos(str, len); 3471 if (errmsg != NULL) { 3472 size_t errlen; 3473 wstr = _Py_char2wchar(errmsg, &errlen); 3474 if (wstr != NULL) { 3475 reason = PyUnicode_FromWideChar(wstr, errlen); 3476 PyMem_Free(wstr); 3477 } else 3478 errmsg = NULL; 3479 } 3480 if (errmsg == NULL) 3481 reason = PyUnicode_FromString( 3482 "mbstowcs() encountered an invalid multibyte sequence"); 3483 if (reason == NULL) 3484 return NULL; 3485 3486 exc = 
PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3487 "locale", str, len, 3488 (Py_ssize_t)error_pos, 3489 (Py_ssize_t)(error_pos+1), 3490 reason); 3491 Py_DECREF(reason); 3492 if (exc != NULL) { 3493 PyCodec_StrictErrors(exc); 3494 Py_XDECREF(exc); 3495 } 3496 return NULL; 3497} 3498 3499PyObject* 3500PyUnicode_DecodeLocale(const char *str, const char *errors) 3501{ 3502 Py_ssize_t size = (Py_ssize_t)strlen(str); 3503 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3504} 3505 3506 3507PyObject* 3508PyUnicode_DecodeFSDefault(const char *s) { 3509 Py_ssize_t size = (Py_ssize_t)strlen(s); 3510 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3511} 3512 3513PyObject* 3514PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3515{ 3516#ifdef HAVE_MBCS 3517 return PyUnicode_DecodeMBCS(s, size, NULL); 3518#elif defined(__APPLE__) 3519 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3520#else 3521 PyInterpreterState *interp = PyThreadState_GET()->interp; 3522 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3523 cannot use it to encode and decode filenames before it is loaded. Load 3524 the Python codec requires to encode at least its own filename. Use the C 3525 version of the locale codec until the codec registry is initialized and 3526 the Python codec is loaded. 3527 3528 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3529 cannot only rely on it: check also interp->fscodec_initialized for 3530 subinterpreters. 
*/ 3531 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3532 return PyUnicode_Decode(s, size, 3533 Py_FileSystemDefaultEncoding, 3534 "surrogateescape"); 3535 } 3536 else { 3537 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3538 } 3539#endif 3540} 3541 3542 3543int 3544_PyUnicode_HasNULChars(PyObject* str) 3545{ 3546 Py_ssize_t pos; 3547 3548 if (PyUnicode_READY(str) == -1) 3549 return -1; 3550 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str), 3551 PyUnicode_GET_LENGTH(str), '\0', 1); 3552 if (pos == -1) 3553 return 0; 3554 else 3555 return 1; 3556} 3557 3558int 3559PyUnicode_FSConverter(PyObject* arg, void* addr) 3560{ 3561 PyObject *output = NULL; 3562 Py_ssize_t size; 3563 void *data; 3564 if (arg == NULL) { 3565 Py_DECREF(*(PyObject**)addr); 3566 return 1; 3567 } 3568 if (PyBytes_Check(arg)) { 3569 output = arg; 3570 Py_INCREF(output); 3571 } 3572 else { 3573 arg = PyUnicode_FromObject(arg); 3574 if (!arg) 3575 return 0; 3576 output = PyUnicode_EncodeFSDefault(arg); 3577 Py_DECREF(arg); 3578 if (!output) 3579 return 0; 3580 if (!PyBytes_Check(output)) { 3581 Py_DECREF(output); 3582 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3583 return 0; 3584 } 3585 } 3586 size = PyBytes_GET_SIZE(output); 3587 data = PyBytes_AS_STRING(output); 3588 if (size != strlen(data)) { 3589 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3590 Py_DECREF(output); 3591 return 0; 3592 } 3593 *(PyObject**)addr = output; 3594 return Py_CLEANUP_SUPPORTED; 3595} 3596 3597 3598int 3599PyUnicode_FSDecoder(PyObject* arg, void* addr) 3600{ 3601 PyObject *output = NULL; 3602 if (arg == NULL) { 3603 Py_DECREF(*(PyObject**)addr); 3604 return 1; 3605 } 3606 if (PyUnicode_Check(arg)) { 3607 if (PyUnicode_READY(arg) == -1) 3608 return 0; 3609 output = arg; 3610 Py_INCREF(output); 3611 } 3612 else { 3613 arg = PyBytes_FromObject(arg); 3614 if (!arg) 3615 return 0; 3616 output = 
PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3617 PyBytes_GET_SIZE(arg)); 3618 Py_DECREF(arg); 3619 if (!output) 3620 return 0; 3621 if (!PyUnicode_Check(output)) { 3622 Py_DECREF(output); 3623 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3624 return 0; 3625 } 3626 } 3627 if (PyUnicode_READY(output) == -1) { 3628 Py_DECREF(output); 3629 return 0; 3630 } 3631 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3632 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3633 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3634 Py_DECREF(output); 3635 return 0; 3636 } 3637 *(PyObject**)addr = output; 3638 return Py_CLEANUP_SUPPORTED; 3639} 3640 3641 3642char* 3643PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3644{ 3645 PyObject *bytes; 3646 3647 if (!PyUnicode_Check(unicode)) { 3648 PyErr_BadArgument(); 3649 return NULL; 3650 } 3651 if (PyUnicode_READY(unicode) == -1) 3652 return NULL; 3653 3654 if (PyUnicode_UTF8(unicode) == NULL) { 3655 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3656 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3657 if (bytes == NULL) 3658 return NULL; 3659 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3660 if (_PyUnicode_UTF8(unicode) == NULL) { 3661 Py_DECREF(bytes); 3662 return NULL; 3663 } 3664 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3665 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3666 PyBytes_AS_STRING(bytes), 3667 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3668 Py_DECREF(bytes); 3669 } 3670 3671 if (psize) 3672 *psize = PyUnicode_UTF8_LENGTH(unicode); 3673 return PyUnicode_UTF8(unicode); 3674} 3675 3676char* 3677PyUnicode_AsUTF8(PyObject *unicode) 3678{ 3679 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3680} 3681 3682Py_UNICODE * 3683PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3684{ 3685 const unsigned char *one_byte; 3686#if SIZEOF_WCHAR_T == 4 3687 const Py_UCS2 *two_bytes; 3688#else 3689 const Py_UCS4 *four_bytes; 
3690 const Py_UCS4 *ucs4_end; 3691 Py_ssize_t num_surrogates; 3692#endif 3693 wchar_t *w; 3694 wchar_t *wchar_end; 3695 3696 if (!PyUnicode_Check(unicode)) { 3697 PyErr_BadArgument(); 3698 return NULL; 3699 } 3700 if (_PyUnicode_WSTR(unicode) == NULL) { 3701 /* Non-ASCII compact unicode object */ 3702 assert(_PyUnicode_KIND(unicode) != 0); 3703 assert(PyUnicode_IS_READY(unicode)); 3704 3705 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3706#if SIZEOF_WCHAR_T == 2 3707 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3708 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3709 num_surrogates = 0; 3710 3711 for (; four_bytes < ucs4_end; ++four_bytes) { 3712 if (*four_bytes > 0xFFFF) 3713 ++num_surrogates; 3714 } 3715 3716 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3717 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3718 if (!_PyUnicode_WSTR(unicode)) { 3719 PyErr_NoMemory(); 3720 return NULL; 3721 } 3722 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3723 3724 w = _PyUnicode_WSTR(unicode); 3725 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3726 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3727 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3728 if (*four_bytes > 0xFFFF) { 3729 assert(*four_bytes <= MAX_UNICODE); 3730 /* encode surrogate pair in this case */ 3731 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3732 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3733 } 3734 else 3735 *w = *four_bytes; 3736 3737 if (w > wchar_end) { 3738 assert(0 && "Miscalculated string end"); 3739 } 3740 } 3741 *w = 0; 3742#else 3743 /* sizeof(wchar_t) == 4 */ 3744 Py_FatalError("Impossible unicode object state, wstr and str " 3745 "should share memory already."); 3746 return NULL; 3747#endif 3748 } 3749 else { 3750 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3751 (_PyUnicode_LENGTH(unicode) + 1)); 3752 if (!_PyUnicode_WSTR(unicode)) { 3753 PyErr_NoMemory(); 3754 return NULL; 3755 } 
3756 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3757 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3758 w = _PyUnicode_WSTR(unicode); 3759 wchar_end = w + _PyUnicode_LENGTH(unicode); 3760 3761 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3762 one_byte = PyUnicode_1BYTE_DATA(unicode); 3763 for (; w < wchar_end; ++one_byte, ++w) 3764 *w = *one_byte; 3765 /* null-terminate the wstr */ 3766 *w = 0; 3767 } 3768 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3769#if SIZEOF_WCHAR_T == 4 3770 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3771 for (; w < wchar_end; ++two_bytes, ++w) 3772 *w = *two_bytes; 3773 /* null-terminate the wstr */ 3774 *w = 0; 3775#else 3776 /* sizeof(wchar_t) == 2 */ 3777 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3778 _PyUnicode_WSTR(unicode) = NULL; 3779 Py_FatalError("Impossible unicode object state, wstr " 3780 "and str should share memory already."); 3781 return NULL; 3782#endif 3783 } 3784 else { 3785 assert(0 && "This should never happen."); 3786 } 3787 } 3788 } 3789 if (size != NULL) 3790 *size = PyUnicode_WSTR_LENGTH(unicode); 3791 return _PyUnicode_WSTR(unicode); 3792} 3793 3794Py_UNICODE * 3795PyUnicode_AsUnicode(PyObject *unicode) 3796{ 3797 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3798} 3799 3800 3801Py_ssize_t 3802PyUnicode_GetSize(PyObject *unicode) 3803{ 3804 if (!PyUnicode_Check(unicode)) { 3805 PyErr_BadArgument(); 3806 goto onError; 3807 } 3808 return PyUnicode_GET_SIZE(unicode); 3809 3810 onError: 3811 return -1; 3812} 3813 3814Py_ssize_t 3815PyUnicode_GetLength(PyObject *unicode) 3816{ 3817 if (!PyUnicode_Check(unicode)) { 3818 PyErr_BadArgument(); 3819 return -1; 3820 } 3821 if (PyUnicode_READY(unicode) == -1) 3822 return -1; 3823 return PyUnicode_GET_LENGTH(unicode); 3824} 3825 3826Py_UCS4 3827PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3828{ 3829 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3830 PyErr_BadArgument(); 3831 return (Py_UCS4)-1; 3832 } 
3833 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3834 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3835 return (Py_UCS4)-1; 3836 } 3837 return PyUnicode_READ_CHAR(unicode, index); 3838} 3839 3840int 3841PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3842{ 3843 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3844 PyErr_BadArgument(); 3845 return -1; 3846 } 3847 assert(PyUnicode_IS_READY(unicode)); 3848 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3849 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3850 return -1; 3851 } 3852 if (unicode_check_modifiable(unicode)) 3853 return -1; 3854 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 3855 PyErr_SetString(PyExc_ValueError, "character out of range"); 3856 return -1; 3857 } 3858 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3859 index, ch); 3860 return 0; 3861} 3862 3863const char * 3864PyUnicode_GetDefaultEncoding(void) 3865{ 3866 return "utf-8"; 3867} 3868 3869/* create or adjust a UnicodeDecodeError */ 3870static void 3871make_decode_exception(PyObject **exceptionObject, 3872 const char *encoding, 3873 const char *input, Py_ssize_t length, 3874 Py_ssize_t startpos, Py_ssize_t endpos, 3875 const char *reason) 3876{ 3877 if (*exceptionObject == NULL) { 3878 *exceptionObject = PyUnicodeDecodeError_Create( 3879 encoding, input, length, startpos, endpos, reason); 3880 } 3881 else { 3882 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3883 goto onError; 3884 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3885 goto onError; 3886 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3887 goto onError; 3888 } 3889 return; 3890 3891onError: 3892 Py_DECREF(*exceptionObject); 3893 *exceptionObject = NULL; 3894} 3895 3896#ifdef HAVE_MBCS 3897/* error handling callback helper: 3898 build arguments, call the callback and check the arguments, 3899 if no exception occurred, copy 
the replacement to the output 3900 and adjust various state variables. 3901 return 0 on success, -1 on error 3902*/ 3903 3904static int 3905unicode_decode_call_errorhandler_wchar( 3906 const char *errors, PyObject **errorHandler, 3907 const char *encoding, const char *reason, 3908 const char **input, const char **inend, Py_ssize_t *startinpos, 3909 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3910 PyObject **output, Py_ssize_t *outpos) 3911{ 3912 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3913 3914 PyObject *restuple = NULL; 3915 PyObject *repunicode = NULL; 3916 Py_ssize_t outsize; 3917 Py_ssize_t insize; 3918 Py_ssize_t requiredsize; 3919 Py_ssize_t newpos; 3920 PyObject *inputobj = NULL; 3921 wchar_t *repwstr; 3922 Py_ssize_t repwlen; 3923 3924 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND); 3925 outsize = _PyUnicode_WSTR_LENGTH(*output); 3926 3927 if (*errorHandler == NULL) { 3928 *errorHandler = PyCodec_LookupError(errors); 3929 if (*errorHandler == NULL) 3930 goto onError; 3931 } 3932 3933 make_decode_exception(exceptionObject, 3934 encoding, 3935 *input, *inend - *input, 3936 *startinpos, *endinpos, 3937 reason); 3938 if (*exceptionObject == NULL) 3939 goto onError; 3940 3941 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3942 if (restuple == NULL) 3943 goto onError; 3944 if (!PyTuple_Check(restuple)) { 3945 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3946 goto onError; 3947 } 3948 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3949 goto onError; 3950 3951 /* Copy back the bytes variables, which might have been modified by the 3952 callback */ 3953 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3954 if (!inputobj) 3955 goto onError; 3956 if (!PyBytes_Check(inputobj)) { 3957 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3958 } 3959 *input = PyBytes_AS_STRING(inputobj); 
3960 insize = PyBytes_GET_SIZE(inputobj); 3961 *inend = *input + insize; 3962 /* we can DECREF safely, as the exception has another reference, 3963 so the object won't go away. */ 3964 Py_DECREF(inputobj); 3965 3966 if (newpos<0) 3967 newpos = insize+newpos; 3968 if (newpos<0 || newpos>insize) { 3969 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3970 goto onError; 3971 } 3972 3973 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 3974 if (repwstr == NULL) 3975 goto onError; 3976 /* need more space? (at least enough for what we 3977 have+the replacement+the rest of the string (starting 3978 at the new input position), so we won't have to check space 3979 when there are no errors in the rest of the string) */ 3980 requiredsize = *outpos + repwlen + insize-newpos; 3981 if (requiredsize > outsize) { 3982 if (requiredsize < 2*outsize) 3983 requiredsize = 2*outsize; 3984 if (unicode_resize(output, requiredsize) < 0) 3985 goto onError; 3986 } 3987 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 3988 *outpos += repwlen; 3989 3990 *endinpos = newpos; 3991 *inptr = *input + newpos; 3992 3993 /* we made it! 
*/ 3994 Py_XDECREF(restuple); 3995 return 0; 3996 3997 onError: 3998 Py_XDECREF(restuple); 3999 return -1; 4000} 4001#endif /* HAVE_MBCS */ 4002 4003static int 4004unicode_decode_call_errorhandler_writer( 4005 const char *errors, PyObject **errorHandler, 4006 const char *encoding, const char *reason, 4007 const char **input, const char **inend, Py_ssize_t *startinpos, 4008 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4009 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4010{ 4011 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4012 4013 PyObject *restuple = NULL; 4014 PyObject *repunicode = NULL; 4015 Py_ssize_t insize; 4016 Py_ssize_t newpos; 4017 PyObject *inputobj = NULL; 4018 4019 if (*errorHandler == NULL) { 4020 *errorHandler = PyCodec_LookupError(errors); 4021 if (*errorHandler == NULL) 4022 goto onError; 4023 } 4024 4025 make_decode_exception(exceptionObject, 4026 encoding, 4027 *input, *inend - *input, 4028 *startinpos, *endinpos, 4029 reason); 4030 if (*exceptionObject == NULL) 4031 goto onError; 4032 4033 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4034 if (restuple == NULL) 4035 goto onError; 4036 if (!PyTuple_Check(restuple)) { 4037 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4038 goto onError; 4039 } 4040 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4041 goto onError; 4042 4043 /* Copy back the bytes variables, which might have been modified by the 4044 callback */ 4045 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4046 if (!inputobj) 4047 goto onError; 4048 if (!PyBytes_Check(inputobj)) { 4049 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4050 } 4051 *input = PyBytes_AS_STRING(inputobj); 4052 insize = PyBytes_GET_SIZE(inputobj); 4053 *inend = *input + insize; 4054 /* we can DECREF safely, as the exception has another reference, 4055 so the object 
won't go away. */ 4056 Py_DECREF(inputobj); 4057 4058 if (newpos<0) 4059 newpos = insize+newpos; 4060 if (newpos<0 || newpos>insize) { 4061 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4062 goto onError; 4063 } 4064 4065 writer->overallocate = 1; 4066 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4067 return 4068 4069 *endinpos = newpos; 4070 *inptr = *input + newpos; 4071 4072 /* we made it! */ 4073 Py_XDECREF(restuple); 4074 return 0; 4075 4076 onError: 4077 Py_XDECREF(restuple); 4078 return -1; 4079} 4080 4081/* --- UTF-7 Codec -------------------------------------------------------- */ 4082 4083/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4084 4085/* Three simple macros defining base-64. */ 4086 4087/* Is c a base-64 character? */ 4088 4089#define IS_BASE64(c) \ 4090 (((c) >= 'A' && (c) <= 'Z') || \ 4091 ((c) >= 'a' && (c) <= 'z') || \ 4092 ((c) >= '0' && (c) <= '9') || \ 4093 (c) == '+' || (c) == '/') 4094 4095/* given that c is a base-64 character, what is its base-64 value? */ 4096 4097#define FROM_BASE64(c) \ 4098 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4099 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4100 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4101 (c) == '+' ? 62 : 63) 4102 4103/* What is the base-64 character of the bottom 6 bits of n? */ 4104 4105#define TO_BASE64(n) \ 4106 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4107 4108/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4109 * decoded as itself. We are permissive on decoding; the only ASCII 4110 * byte not decoding to itself is the + which begins a base64 4111 * string. */ 4112 4113#define DECODE_DIRECT(c) \ 4114 ((c) <= 127 && (c) != '+') 4115 4116/* The UTF-7 encoder treats ASCII characters differently according to 4117 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4118 * the above). See RFC2152. 
This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127); each row below covers
 * sixteen consecutive codes, labelled by the comment above it.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * Only codes 1..127 can be direct: the range test comes first, so the
 * table is never indexed out of bounds, and NUL (0) is never direct.
 * Note that c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* Stateless UTF-7 decoding: forwards to PyUnicode_DecodeUTF7Stateful()
   with consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* The decoder.
The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate). */

/* Decode `size` bytes of UTF-7 at `s`.

   errors:   name of the error handler, passed on to
             unicode_decode_call_errorhandler_writer().
   consumed: if not NULL, receives the number of input bytes consumed; an
             unfinished shift sequence at the end of the input is then not
             an error, and both output and read position are backed off to
             its start.

   Returns a new str object, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero inside a base-64 section */
    Py_ssize_t shiftOutStart;       /* writer.pos where the section began */
    unsigned int base64bits = 0;    /* number of pending bits in the buffer */
    unsigned long base64buffer = 0; /* pending base-64 bits */
    Py_UCS4 surrogate = 0;          /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer, 0);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
        goto onError;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* re-entry point used after the end-of-input error handler asked to
           resume inside the input (see "end of string" below) */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_Prepare(&writer, 1, ch2) == -1)
                                goto onError;
                            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch2);
                            writer.pos++;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the pending
                               high surrogate on its own */
                            if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
                                goto onError;
                            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
                            writer.pos++;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_Prepare(&writer, 1, outCh) == -1)
                            goto onError;
                        PyUnicode_WRITE(writer.kind, writer.data, writer.pos, outCh);
                        writer.pos++;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
                        goto onError;
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
                    writer.pos++;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
                        goto onError;
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
                    writer.pos++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_Prepare(&writer, 1, '+') == -1)
                    goto onError;
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '+');
                writer.pos++;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = writer.pos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
                goto onError;
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
            writer.pos++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* startinpos was set when the offending sequence began; the error
           region ends at the current read position */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have moved the read position back into the
               input: resume decoding from there */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            writer.pos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}


/* Encode the str object `str` as UTF-7 and return a new bytes object, or
   NULL with an exception set.

   base64SetO:       if non-zero, "Set O" characters are base-64 encoded
                     instead of being written directly.
   base64WhiteSpace: if non-zero, whitespace is base-64 encoded instead of
                     being written directly.
   errors:           not referenced in this function body. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;
    Py_ssize_t i;
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    char * out;
    char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    /* the output buffer is sized at 8 bytes per input character and shrunk
       to the real size at the end */
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* append ch to the bit buffer as one 16-bit unit, or as a surrogate
           pair of two units when it is >= 0x10000, flushing complete 6-bit
           groups as base-64 characters */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* flush the remaining bits and close a still-open shift sequence */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
/* Py_UNICODE buffer variant: builds a temporary str object from the buffer
   and encodes it with _PyUnicode_EncodeUTF7(). */
PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                     Py_ssize_t size,
                     int base64SetO,
                     int base64WhiteSpace,
                     const char *errors)
{
    PyObject *result;
    PyObject *tmp = PyUnicode_FromUnicode(s, size);
    if (tmp == NULL)
        return NULL;
    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
                                   base64WhiteSpace, errors);
    Py_DECREF(tmp);
    return result;
}

#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT

/* --- UTF-8 Codec -------------------------------------------------------- */

/* Stateless UTF-8 decoding: forwards to PyUnicode_DecodeUTF8Stateful()
   with consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}

/* Instantiate the codec helpers of stringlib/codecs.h once per character
   width; the asciilib_/ucs1lib_/ucs2lib_/ucs4lib_ functions used below
   come from these inclusions. */
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

/* Copy the leading run of ASCII bytes (high bit clear) of [start, end) to
   `dest` and return its length.  Scanning stops at the first byte with the
   high bit set, or at `end`.  Where alignment allows, the input is examined
   one C 'long' at a time using ASCII_CHAR_MASK. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help register allocation */
        register const char *_p = p;
        register Py_UCS1 * q = dest;
        /* source and destination are both aligned here: check and copy a
           whole 'long' per iteration */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
    /* general path: only scan for the end of the ASCII run here, the copy
       is done in one memcpy() below */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help register allocation */
            register const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}

/* Decode `size` bytes of UTF-8 at `s`.

   errors:   name of the error handler, passed on to
             unicode_decode_call_errorhandler_writer().
   consumed: if not NULL, receives the number of input bytes consumed; data
             that ends early is then left unconsumed instead of being
             reported as an error.

   Returns a new str object, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer, 0);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
        goto onError;

    /* copy the leading ASCII run in one go */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;
        /* pick the decoder matching the current width of the output buffer */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* Values 0..4 are status codes handled case by case below; any
           other value is written to the output as a character.
           NOTE(review): presumably that is a character the helper could not
           store in a buffer of the current kind -- confirm against
           stringlib/codecs.h. */
        switch (ch) {
        case 0:
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
                goto onError;
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
            writer.pos++;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}

#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            assert(Py_UNICODE_IS_SURROGATE(ch));
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch == 0 with all input read means the data was fully decoded */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */

/* Primary internal function which creates utf8 encoded bytes objects.

   Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space needed at the end.
Else allocate the 4751 maximum possible needed (4 result bytes per Unicode character), and return 4752 the excess memory at the end. 4753*/ 4754PyObject * 4755_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4756{ 4757 enum PyUnicode_Kind kind; 4758 void *data; 4759 Py_ssize_t size; 4760 4761 if (!PyUnicode_Check(unicode)) { 4762 PyErr_BadArgument(); 4763 return NULL; 4764 } 4765 4766 if (PyUnicode_READY(unicode) == -1) 4767 return NULL; 4768 4769 if (PyUnicode_UTF8(unicode)) 4770 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4771 PyUnicode_UTF8_LENGTH(unicode)); 4772 4773 kind = PyUnicode_KIND(unicode); 4774 data = PyUnicode_DATA(unicode); 4775 size = PyUnicode_GET_LENGTH(unicode); 4776 4777 switch (kind) { 4778 default: 4779 assert(0); 4780 case PyUnicode_1BYTE_KIND: 4781 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4782 assert(!PyUnicode_IS_ASCII(unicode)); 4783 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4784 case PyUnicode_2BYTE_KIND: 4785 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4786 case PyUnicode_4BYTE_KIND: 4787 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4788 } 4789} 4790 4791PyObject * 4792PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4793 Py_ssize_t size, 4794 const char *errors) 4795{ 4796 PyObject *v, *unicode; 4797 4798 unicode = PyUnicode_FromUnicode(s, size); 4799 if (unicode == NULL) 4800 return NULL; 4801 v = _PyUnicode_AsUTF8String(unicode, errors); 4802 Py_DECREF(unicode); 4803 return v; 4804} 4805 4806PyObject * 4807PyUnicode_AsUTF8String(PyObject *unicode) 4808{ 4809 return _PyUnicode_AsUTF8String(unicode, NULL); 4810} 4811 4812/* --- UTF-32 Codec ------------------------------------------------------- */ 4813 4814PyObject * 4815PyUnicode_DecodeUTF32(const char *s, 4816 Py_ssize_t size, 4817 const char *errors, 4818 int *byteorder) 4819{ 4820 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4821} 4822 4823PyObject * 
4824PyUnicode_DecodeUTF32Stateful(const char *s, 4825 Py_ssize_t size, 4826 const char *errors, 4827 int *byteorder, 4828 Py_ssize_t *consumed) 4829{ 4830 const char *starts = s; 4831 Py_ssize_t startinpos; 4832 Py_ssize_t endinpos; 4833 _PyUnicodeWriter writer; 4834 const unsigned char *q, *e; 4835 int le, bo = 0; /* assume native ordering by default */ 4836 const char *errmsg = ""; 4837 PyObject *errorHandler = NULL; 4838 PyObject *exc = NULL; 4839 4840 q = (unsigned char *)s; 4841 e = q + size; 4842 4843 if (byteorder) 4844 bo = *byteorder; 4845 4846 /* Check for BOM marks (U+FEFF) in the input and adjust current 4847 byte order setting accordingly. In native mode, the leading BOM 4848 mark is skipped, in all other modes, it is copied to the output 4849 stream as-is (giving a ZWNBSP character). */ 4850 if (bo == 0 && size >= 4) { 4851 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 4852 if (bom == 0x0000FEFF) { 4853 bo = -1; 4854 q += 4; 4855 } 4856 else if (bom == 0xFFFE0000) { 4857 bo = 1; 4858 q += 4; 4859 } 4860 if (byteorder) 4861 *byteorder = bo; 4862 } 4863 4864 if (q == e) { 4865 if (consumed) 4866 *consumed = size; 4867 Py_INCREF(unicode_empty); 4868 return unicode_empty; 4869 } 4870 4871#ifdef WORDS_BIGENDIAN 4872 le = bo < 0; 4873#else 4874 le = bo <= 0; 4875#endif 4876 4877 _PyUnicodeWriter_Init(&writer, 0); 4878 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1) 4879 goto onError; 4880 4881 while (1) { 4882 Py_UCS4 ch = 0; 4883 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 4884 4885 if (e - q >= 4) { 4886 enum PyUnicode_Kind kind = writer.kind; 4887 void *data = writer.data; 4888 const unsigned char *last = e - 4; 4889 Py_ssize_t pos = writer.pos; 4890 if (le) { 4891 do { 4892 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 4893 if (ch > maxch) 4894 break; 4895 PyUnicode_WRITE(kind, data, pos++, ch); 4896 q += 4; 4897 } while (q <= last); 4898 } 4899 else { 4900 do { 4901 ch = (q[0] << 24) | (q[1] << 
16) | (q[2] << 8) | q[3]; 4902 if (ch > maxch) 4903 break; 4904 PyUnicode_WRITE(kind, data, pos++, ch); 4905 q += 4; 4906 } while (q <= last); 4907 } 4908 writer.pos = pos; 4909 } 4910 4911 if (ch <= maxch) { 4912 if (q == e || consumed) 4913 break; 4914 /* remaining bytes at the end? (size should be divisible by 4) */ 4915 errmsg = "truncated data"; 4916 startinpos = ((const char *)q) - starts; 4917 endinpos = ((const char *)e) - starts; 4918 } 4919 else { 4920 if (ch < 0x110000) { 4921 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) 4922 goto onError; 4923 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); 4924 writer.pos++; 4925 q += 4; 4926 continue; 4927 } 4928 errmsg = "codepoint not in range(0x110000)"; 4929 startinpos = ((const char *)q) - starts; 4930 endinpos = startinpos + 4; 4931 } 4932 4933 /* The remaining input chars are ignored if the callback 4934 chooses to skip the input */ 4935 if (unicode_decode_call_errorhandler_writer( 4936 errors, &errorHandler, 4937 "utf32", errmsg, 4938 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 4939 &writer)) 4940 goto onError; 4941 } 4942 4943 if (consumed) 4944 *consumed = (const char *)q-starts; 4945 4946 Py_XDECREF(errorHandler); 4947 Py_XDECREF(exc); 4948 return _PyUnicodeWriter_Finish(&writer); 4949 4950 onError: 4951 _PyUnicodeWriter_Dealloc(&writer); 4952 Py_XDECREF(errorHandler); 4953 Py_XDECREF(exc); 4954 return NULL; 4955} 4956 4957PyObject * 4958_PyUnicode_EncodeUTF32(PyObject *str, 4959 const char *errors, 4960 int byteorder) 4961{ 4962 int kind; 4963 void *data; 4964 Py_ssize_t len; 4965 PyObject *v; 4966 unsigned char *p; 4967 Py_ssize_t nsize, i; 4968 /* Offsets from p for storing byte pairs in the right order. 
*/ 4969#if PY_LITTLE_ENDIAN 4970 int iorder[] = {0, 1, 2, 3}; 4971#else 4972 int iorder[] = {3, 2, 1, 0}; 4973#endif 4974 4975#define STORECHAR(CH) \ 4976 do { \ 4977 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 4978 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 4979 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 4980 p[iorder[0]] = (CH) & 0xff; \ 4981 p += 4; \ 4982 } while(0) 4983 4984 if (!PyUnicode_Check(str)) { 4985 PyErr_BadArgument(); 4986 return NULL; 4987 } 4988 if (PyUnicode_READY(str) == -1) 4989 return NULL; 4990 kind = PyUnicode_KIND(str); 4991 data = PyUnicode_DATA(str); 4992 len = PyUnicode_GET_LENGTH(str); 4993 4994 nsize = len + (byteorder == 0); 4995 if (nsize > PY_SSIZE_T_MAX / 4) 4996 return PyErr_NoMemory(); 4997 v = PyBytes_FromStringAndSize(NULL, nsize * 4); 4998 if (v == NULL) 4999 return NULL; 5000 5001 p = (unsigned char *)PyBytes_AS_STRING(v); 5002 if (byteorder == 0) 5003 STORECHAR(0xFEFF); 5004 if (len == 0) 5005 goto done; 5006 5007 if (byteorder == -1) { 5008 /* force LE */ 5009 iorder[0] = 0; 5010 iorder[1] = 1; 5011 iorder[2] = 2; 5012 iorder[3] = 3; 5013 } 5014 else if (byteorder == 1) { 5015 /* force BE */ 5016 iorder[0] = 3; 5017 iorder[1] = 2; 5018 iorder[2] = 1; 5019 iorder[3] = 0; 5020 } 5021 5022 for (i = 0; i < len; i++) 5023 STORECHAR(PyUnicode_READ(kind, data, i)); 5024 5025 done: 5026 return v; 5027#undef STORECHAR 5028} 5029 5030PyObject * 5031PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5032 Py_ssize_t size, 5033 const char *errors, 5034 int byteorder) 5035{ 5036 PyObject *result; 5037 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5038 if (tmp == NULL) 5039 return NULL; 5040 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5041 Py_DECREF(tmp); 5042 return result; 5043} 5044 5045PyObject * 5046PyUnicode_AsUTF32String(PyObject *unicode) 5047{ 5048 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5049} 5050 5051/* --- UTF-16 Codec ------------------------------------------------------- */ 5052 5053PyObject * 5054PyUnicode_DecodeUTF16(const 
char *s, 5055 Py_ssize_t size, 5056 const char *errors, 5057 int *byteorder) 5058{ 5059 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5060} 5061 5062PyObject * 5063PyUnicode_DecodeUTF16Stateful(const char *s, 5064 Py_ssize_t size, 5065 const char *errors, 5066 int *byteorder, 5067 Py_ssize_t *consumed) 5068{ 5069 const char *starts = s; 5070 Py_ssize_t startinpos; 5071 Py_ssize_t endinpos; 5072 _PyUnicodeWriter writer; 5073 const unsigned char *q, *e; 5074 int bo = 0; /* assume native ordering by default */ 5075 int native_ordering; 5076 const char *errmsg = ""; 5077 PyObject *errorHandler = NULL; 5078 PyObject *exc = NULL; 5079 5080 q = (unsigned char *)s; 5081 e = q + size; 5082 5083 if (byteorder) 5084 bo = *byteorder; 5085 5086 /* Check for BOM marks (U+FEFF) in the input and adjust current 5087 byte order setting accordingly. In native mode, the leading BOM 5088 mark is skipped, in all other modes, it is copied to the output 5089 stream as-is (giving a ZWNBSP character). 
*/ 5090 if (bo == 0 && size >= 2) { 5091 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5092 if (bom == 0xFEFF) { 5093 q += 2; 5094 bo = -1; 5095 } 5096 else if (bom == 0xFFFE) { 5097 q += 2; 5098 bo = 1; 5099 } 5100 if (byteorder) 5101 *byteorder = bo; 5102 } 5103 5104 if (q == e) { 5105 if (consumed) 5106 *consumed = size; 5107 Py_INCREF(unicode_empty); 5108 return unicode_empty; 5109 } 5110 5111#if PY_LITTLE_ENDIAN 5112 native_ordering = bo <= 0; 5113#else 5114 native_ordering = bo >= 0; 5115#endif 5116 5117 /* Note: size will always be longer than the resulting Unicode 5118 character count */ 5119 _PyUnicodeWriter_Init(&writer, 0); 5120 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1) 5121 goto onError; 5122 5123 while (1) { 5124 Py_UCS4 ch = 0; 5125 if (e - q >= 2) { 5126 int kind = writer.kind; 5127 if (kind == PyUnicode_1BYTE_KIND) { 5128 if (PyUnicode_IS_ASCII(writer.buffer)) 5129 ch = asciilib_utf16_decode(&q, e, 5130 (Py_UCS1*)writer.data, &writer.pos, 5131 native_ordering); 5132 else 5133 ch = ucs1lib_utf16_decode(&q, e, 5134 (Py_UCS1*)writer.data, &writer.pos, 5135 native_ordering); 5136 } else if (kind == PyUnicode_2BYTE_KIND) { 5137 ch = ucs2lib_utf16_decode(&q, e, 5138 (Py_UCS2*)writer.data, &writer.pos, 5139 native_ordering); 5140 } else { 5141 assert(kind == PyUnicode_4BYTE_KIND); 5142 ch = ucs4lib_utf16_decode(&q, e, 5143 (Py_UCS4*)writer.data, &writer.pos, 5144 native_ordering); 5145 } 5146 } 5147 5148 switch (ch) 5149 { 5150 case 0: 5151 /* remaining byte at the end? 
(size should be even) */ 5152 if (q == e || consumed) 5153 goto End; 5154 errmsg = "truncated data"; 5155 startinpos = ((const char *)q) - starts; 5156 endinpos = ((const char *)e) - starts; 5157 break; 5158 /* The remaining input chars are ignored if the callback 5159 chooses to skip the input */ 5160 case 1: 5161 errmsg = "unexpected end of data"; 5162 startinpos = ((const char *)q) - 2 - starts; 5163 endinpos = ((const char *)e) - starts; 5164 break; 5165 case 2: 5166 errmsg = "illegal encoding"; 5167 startinpos = ((const char *)q) - 2 - starts; 5168 endinpos = startinpos + 2; 5169 break; 5170 case 3: 5171 errmsg = "illegal UTF-16 surrogate"; 5172 startinpos = ((const char *)q) - 4 - starts; 5173 endinpos = startinpos + 2; 5174 break; 5175 default: 5176 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) 5177 goto onError; 5178 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); 5179 writer.pos++; 5180 continue; 5181 } 5182 5183 if (unicode_decode_call_errorhandler_writer( 5184 errors, 5185 &errorHandler, 5186 "utf16", errmsg, 5187 &starts, 5188 (const char **)&e, 5189 &startinpos, 5190 &endinpos, 5191 &exc, 5192 (const char **)&q, 5193 &writer)) 5194 goto onError; 5195 } 5196 5197End: 5198 if (consumed) 5199 *consumed = (const char *)q-starts; 5200 5201 Py_XDECREF(errorHandler); 5202 Py_XDECREF(exc); 5203 return _PyUnicodeWriter_Finish(&writer); 5204 5205 onError: 5206 _PyUnicodeWriter_Dealloc(&writer); 5207 Py_XDECREF(errorHandler); 5208 Py_XDECREF(exc); 5209 return NULL; 5210} 5211 5212PyObject * 5213_PyUnicode_EncodeUTF16(PyObject *str, 5214 const char *errors, 5215 int byteorder) 5216{ 5217 enum PyUnicode_Kind kind; 5218 const void *data; 5219 Py_ssize_t len; 5220 PyObject *v; 5221 unsigned short *out; 5222 Py_ssize_t bytesize; 5223 Py_ssize_t pairs; 5224#if PY_BIG_ENDIAN 5225 int native_ordering = byteorder >= 0; 5226#else 5227 int native_ordering = byteorder <= 0; 5228#endif 5229 5230 if (!PyUnicode_Check(str)) { 5231 PyErr_BadArgument(); 5232 
return NULL; 5233 } 5234 if (PyUnicode_READY(str) == -1) 5235 return NULL; 5236 kind = PyUnicode_KIND(str); 5237 data = PyUnicode_DATA(str); 5238 len = PyUnicode_GET_LENGTH(str); 5239 5240 pairs = 0; 5241 if (kind == PyUnicode_4BYTE_KIND) { 5242 const Py_UCS4 *in = (const Py_UCS4 *)data; 5243 const Py_UCS4 *end = in + len; 5244 while (in < end) 5245 if (*in++ >= 0x10000) 5246 pairs++; 5247 } 5248 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5249 return PyErr_NoMemory(); 5250 bytesize = (len + pairs + (byteorder == 0)) * 2; 5251 v = PyBytes_FromStringAndSize(NULL, bytesize); 5252 if (v == NULL) 5253 return NULL; 5254 5255 /* output buffer is 2-bytes aligned */ 5256 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5257 out = (unsigned short *)PyBytes_AS_STRING(v); 5258 if (byteorder == 0) 5259 *out++ = 0xFEFF; 5260 if (len == 0) 5261 goto done; 5262 5263 switch (kind) { 5264 case PyUnicode_1BYTE_KIND: { 5265 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); 5266 break; 5267 } 5268 case PyUnicode_2BYTE_KIND: { 5269 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); 5270 break; 5271 } 5272 case PyUnicode_4BYTE_KIND: { 5273 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); 5274 break; 5275 } 5276 default: 5277 assert(0); 5278 } 5279 5280 done: 5281 return v; 5282} 5283 5284PyObject * 5285PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5286 Py_ssize_t size, 5287 const char *errors, 5288 int byteorder) 5289{ 5290 PyObject *result; 5291 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5292 if (tmp == NULL) 5293 return NULL; 5294 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5295 Py_DECREF(tmp); 5296 return result; 5297} 5298 5299PyObject * 5300PyUnicode_AsUTF16String(PyObject *unicode) 5301{ 5302 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5303} 5304 5305/* --- Unicode Escape Codec ----------------------------------------------- */ 5306 5307/* Helper function for 
PyUnicode_DecodeUnicodeEscape, determines 5308 if all the escapes in the string make it still a valid ASCII string. 5309 Returns -1 if any escapes were found which cause the string to 5310 pop out of ASCII range. Otherwise returns the length of the 5311 required buffer to hold the string. 5312 */ 5313static Py_ssize_t 5314length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5315{ 5316 const unsigned char *p = (const unsigned char *)s; 5317 const unsigned char *end = p + size; 5318 Py_ssize_t length = 0; 5319 5320 if (size < 0) 5321 return -1; 5322 5323 for (; p < end; ++p) { 5324 if (*p > 127) { 5325 /* Non-ASCII */ 5326 return -1; 5327 } 5328 else if (*p != '\\') { 5329 /* Normal character */ 5330 ++length; 5331 } 5332 else { 5333 /* Backslash-escape, check next char */ 5334 ++p; 5335 /* Escape sequence reaches till end of string or 5336 non-ASCII follow-up. */ 5337 if (p >= end || *p > 127) 5338 return -1; 5339 switch (*p) { 5340 case '\n': 5341 /* backslash + \n result in zero characters */ 5342 break; 5343 case '\\': case '\'': case '\"': 5344 case 'b': case 'f': case 't': 5345 case 'n': case 'r': case 'v': case 'a': 5346 ++length; 5347 break; 5348 case '0': case '1': case '2': case '3': 5349 case '4': case '5': case '6': case '7': 5350 case 'x': case 'u': case 'U': case 'N': 5351 /* these do not guarantee ASCII characters */ 5352 return -1; 5353 default: 5354 /* count the backslash + the other character */ 5355 length += 2; 5356 } 5357 } 5358 } 5359 return length; 5360} 5361 5362static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5363 5364PyObject * 5365PyUnicode_DecodeUnicodeEscape(const char *s, 5366 Py_ssize_t size, 5367 const char *errors) 5368{ 5369 const char *starts = s; 5370 Py_ssize_t startinpos; 5371 Py_ssize_t endinpos; 5372 int j; 5373 _PyUnicodeWriter writer; 5374 const char *end; 5375 char* message; 5376 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5377 PyObject *errorHandler = NULL; 5378 PyObject *exc = NULL; 5379 
Py_ssize_t len; 5380 5381 len = length_of_escaped_ascii_string(s, size); 5382 if (len == 0) { 5383 Py_INCREF(unicode_empty); 5384 return unicode_empty; 5385 } 5386 5387 /* After length_of_escaped_ascii_string() there are two alternatives, 5388 either the string is pure ASCII with named escapes like \n, etc. 5389 and we determined it's exact size (common case) 5390 or it contains \x, \u, ... escape sequences. then we create a 5391 legacy wchar string and resize it at the end of this function. */ 5392 _PyUnicodeWriter_Init(&writer, 0); 5393 if (len > 0) { 5394 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) 5395 goto onError; 5396 assert(writer.kind == PyUnicode_1BYTE_KIND); 5397 } 5398 else { 5399 /* Escaped strings will always be longer than the resulting 5400 Unicode string, so we start with size here and then reduce the 5401 length after conversion to the true value. 5402 (but if the error callback returns a long replacement string 5403 we'll have to allocate more space) */ 5404 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 5405 goto onError; 5406 } 5407 5408 if (size == 0) 5409 return _PyUnicodeWriter_Finish(&writer); 5410 end = s + size; 5411 5412 while (s < end) { 5413 unsigned char c; 5414 Py_UCS4 x; 5415 int digits; 5416 5417 /* Non-escape characters are interpreted as Unicode ordinals */ 5418 if (*s != '\\') { 5419 x = (unsigned char)*s; 5420 s++; 5421 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1) 5422 goto onError; 5423 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x); 5424 writer.pos++; 5425 continue; 5426 } 5427 5428 startinpos = s-starts; 5429 /* \ - Escapes */ 5430 s++; 5431 c = *s++; 5432 if (s > end) 5433 c = '\0'; /* Invalid after \ */ 5434 5435 /* The only case in which i == ascii_length is a backslash 5436 followed by a newline. 
*/ 5437 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n')); 5438 5439 switch (c) { 5440 5441 /* \x escapes */ 5442#define WRITECHAR(ch) \ 5443 do { \ 5444 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) \ 5445 goto onError; \ 5446 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); \ 5447 writer.pos++; \ 5448 } while(0) 5449 5450 case '\n': break; 5451 case '\\': WRITECHAR('\\'); break; 5452 case '\'': WRITECHAR('\''); break; 5453 case '\"': WRITECHAR('\"'); break; 5454 case 'b': WRITECHAR('\b'); break; 5455 /* FF */ 5456 case 'f': WRITECHAR('\014'); break; 5457 case 't': WRITECHAR('\t'); break; 5458 case 'n': WRITECHAR('\n'); break; 5459 case 'r': WRITECHAR('\r'); break; 5460 /* VT */ 5461 case 'v': WRITECHAR('\013'); break; 5462 /* BEL, not classic C */ 5463 case 'a': WRITECHAR('\007'); break; 5464 5465 /* \OOO (octal) escapes */ 5466 case '0': case '1': case '2': case '3': 5467 case '4': case '5': case '6': case '7': 5468 x = s[-1] - '0'; 5469 if (s < end && '0' <= *s && *s <= '7') { 5470 x = (x<<3) + *s++ - '0'; 5471 if (s < end && '0' <= *s && *s <= '7') 5472 x = (x<<3) + *s++ - '0'; 5473 } 5474 WRITECHAR(x); 5475 break; 5476 5477 /* hex escapes */ 5478 /* \xXX */ 5479 case 'x': 5480 digits = 2; 5481 message = "truncated \\xXX escape"; 5482 goto hexescape; 5483 5484 /* \uXXXX */ 5485 case 'u': 5486 digits = 4; 5487 message = "truncated \\uXXXX escape"; 5488 goto hexescape; 5489 5490 /* \UXXXXXXXX */ 5491 case 'U': 5492 digits = 8; 5493 message = "truncated \\UXXXXXXXX escape"; 5494 hexescape: 5495 chr = 0; 5496 if (s+digits>end) { 5497 endinpos = size; 5498 if (unicode_decode_call_errorhandler_writer( 5499 errors, &errorHandler, 5500 "unicodeescape", "end of string in escape sequence", 5501 &starts, &end, &startinpos, &endinpos, &exc, &s, 5502 &writer)) 5503 goto onError; 5504 goto nextByte; 5505 } 5506 for (j = 0; j < digits; ++j) { 5507 c = (unsigned char) s[j]; 5508 if (!Py_ISXDIGIT(c)) { 5509 endinpos = (s+j+1)-starts; 
5510 if (unicode_decode_call_errorhandler_writer( 5511 errors, &errorHandler, 5512 "unicodeescape", message, 5513 &starts, &end, &startinpos, &endinpos, &exc, &s, 5514 &writer)) 5515 goto onError; 5516 goto nextByte; 5517 } 5518 chr = (chr<<4) & ~0xF; 5519 if (c >= '0' && c <= '9') 5520 chr += c - '0'; 5521 else if (c >= 'a' && c <= 'f') 5522 chr += 10 + c - 'a'; 5523 else 5524 chr += 10 + c - 'A'; 5525 } 5526 s += j; 5527 if (chr == 0xffffffff && PyErr_Occurred()) 5528 /* _decoding_error will have already written into the 5529 target buffer. */ 5530 break; 5531 store: 5532 /* when we get here, chr is a 32-bit unicode character */ 5533 if (chr <= MAX_UNICODE) { 5534 WRITECHAR(chr); 5535 } else { 5536 endinpos = s-starts; 5537 if (unicode_decode_call_errorhandler_writer( 5538 errors, &errorHandler, 5539 "unicodeescape", "illegal Unicode character", 5540 &starts, &end, &startinpos, &endinpos, &exc, &s, 5541 &writer)) 5542 goto onError; 5543 } 5544 break; 5545 5546 /* \N{name} */ 5547 case 'N': 5548 message = "malformed \\N character escape"; 5549 if (ucnhash_CAPI == NULL) { 5550 /* load the unicode data module */ 5551 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5552 PyUnicodeData_CAPSULE_NAME, 1); 5553 if (ucnhash_CAPI == NULL) 5554 goto ucnhashError; 5555 } 5556 if (*s == '{') { 5557 const char *start = s+1; 5558 /* look for the closing brace */ 5559 while (*s != '}' && s < end) 5560 s++; 5561 if (s > start && s < end && *s == '}') { 5562 /* found a name. 
look it up in the unicode database */ 5563 message = "unknown Unicode character name"; 5564 s++; 5565 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5566 &chr, 0)) 5567 goto store; 5568 } 5569 } 5570 endinpos = s-starts; 5571 if (unicode_decode_call_errorhandler_writer( 5572 errors, &errorHandler, 5573 "unicodeescape", message, 5574 &starts, &end, &startinpos, &endinpos, &exc, &s, 5575 &writer)) 5576 goto onError; 5577 break; 5578 5579 default: 5580 if (s > end) { 5581 message = "\\ at end of string"; 5582 s--; 5583 endinpos = s-starts; 5584 if (unicode_decode_call_errorhandler_writer( 5585 errors, &errorHandler, 5586 "unicodeescape", message, 5587 &starts, &end, &startinpos, &endinpos, &exc, &s, 5588 &writer)) 5589 goto onError; 5590 } 5591 else { 5592 WRITECHAR('\\'); 5593 WRITECHAR(s[-1]); 5594 } 5595 break; 5596 } 5597 nextByte: 5598 ; 5599 } 5600#undef WRITECHAR 5601 5602 Py_XDECREF(errorHandler); 5603 Py_XDECREF(exc); 5604 return _PyUnicodeWriter_Finish(&writer); 5605 5606 ucnhashError: 5607 PyErr_SetString( 5608 PyExc_UnicodeError, 5609 "\\N escapes not supported (can't load unicodedata module)" 5610 ); 5611 _PyUnicodeWriter_Dealloc(&writer); 5612 Py_XDECREF(errorHandler); 5613 Py_XDECREF(exc); 5614 return NULL; 5615 5616 onError: 5617 _PyUnicodeWriter_Dealloc(&writer); 5618 Py_XDECREF(errorHandler); 5619 Py_XDECREF(exc); 5620 return NULL; 5621} 5622 5623/* Return a Unicode-Escape string version of the Unicode object. 5624 5625 If quotes is true, the string is enclosed in u"" or u'' quotes as 5626 appropriate. 5627 5628*/ 5629 5630PyObject * 5631PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5632{ 5633 Py_ssize_t i, len; 5634 PyObject *repr; 5635 char *p; 5636 int kind; 5637 void *data; 5638 Py_ssize_t expandsize = 0; 5639 5640 /* Initial allocation is based on the longest-possible character 5641 escape. 5642 5643 For UCS1 strings it's '\xxx', 4 bytes per source character. 5644 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 
5645 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 5646 */ 5647 5648 if (!PyUnicode_Check(unicode)) { 5649 PyErr_BadArgument(); 5650 return NULL; 5651 } 5652 if (PyUnicode_READY(unicode) == -1) 5653 return NULL; 5654 len = PyUnicode_GET_LENGTH(unicode); 5655 kind = PyUnicode_KIND(unicode); 5656 data = PyUnicode_DATA(unicode); 5657 switch (kind) { 5658 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5659 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5660 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5661 } 5662 5663 if (len == 0) 5664 return PyBytes_FromStringAndSize(NULL, 0); 5665 5666 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5667 return PyErr_NoMemory(); 5668 5669 repr = PyBytes_FromStringAndSize(NULL, 5670 2 5671 + expandsize*len 5672 + 1); 5673 if (repr == NULL) 5674 return NULL; 5675 5676 p = PyBytes_AS_STRING(repr); 5677 5678 for (i = 0; i < len; i++) { 5679 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5680 5681 /* Escape backslashes */ 5682 if (ch == '\\') { 5683 *p++ = '\\'; 5684 *p++ = (char) ch; 5685 continue; 5686 } 5687 5688 /* Map 21-bit characters to '\U00xxxxxx' */ 5689 else if (ch >= 0x10000) { 5690 assert(ch <= MAX_UNICODE); 5691 *p++ = '\\'; 5692 *p++ = 'U'; 5693 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5694 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5695 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5696 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5697 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5698 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5699 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5700 *p++ = Py_hexdigits[ch & 0x0000000F]; 5701 continue; 5702 } 5703 5704 /* Map 16-bit characters to '\uxxxx' */ 5705 if (ch >= 256) { 5706 *p++ = '\\'; 5707 *p++ = 'u'; 5708 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5709 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5710 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5711 *p++ = Py_hexdigits[ch & 0x000F]; 5712 } 5713 5714 /* Map special whitespace to '\t', \n', '\r' 
*/ 5715 else if (ch == '\t') { 5716 *p++ = '\\'; 5717 *p++ = 't'; 5718 } 5719 else if (ch == '\n') { 5720 *p++ = '\\'; 5721 *p++ = 'n'; 5722 } 5723 else if (ch == '\r') { 5724 *p++ = '\\'; 5725 *p++ = 'r'; 5726 } 5727 5728 /* Map non-printable US ASCII to '\xhh' */ 5729 else if (ch < ' ' || ch >= 0x7F) { 5730 *p++ = '\\'; 5731 *p++ = 'x'; 5732 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5733 *p++ = Py_hexdigits[ch & 0x000F]; 5734 } 5735 5736 /* Copy everything else as-is */ 5737 else 5738 *p++ = (char) ch; 5739 } 5740 5741 assert(p - PyBytes_AS_STRING(repr) > 0); 5742 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5743 return NULL; 5744 return repr; 5745} 5746 5747PyObject * 5748PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5749 Py_ssize_t size) 5750{ 5751 PyObject *result; 5752 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5753 if (tmp == NULL) 5754 return NULL; 5755 result = PyUnicode_AsUnicodeEscapeString(tmp); 5756 Py_DECREF(tmp); 5757 return result; 5758} 5759 5760/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5761 5762PyObject * 5763PyUnicode_DecodeRawUnicodeEscape(const char *s, 5764 Py_ssize_t size, 5765 const char *errors) 5766{ 5767 const char *starts = s; 5768 Py_ssize_t startinpos; 5769 Py_ssize_t endinpos; 5770 _PyUnicodeWriter writer; 5771 const char *end; 5772 const char *bs; 5773 PyObject *errorHandler = NULL; 5774 PyObject *exc = NULL; 5775 5776 if (size == 0) { 5777 Py_INCREF(unicode_empty); 5778 return unicode_empty; 5779 } 5780 5781 /* Escaped strings will always be longer than the resulting 5782 Unicode string, so we start with size here and then reduce the 5783 length after conversion to the true value. 
(But decoding error 5784 handler might have to resize the string) */ 5785 _PyUnicodeWriter_Init(&writer, 1); 5786 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 5787 goto onError; 5788 5789 end = s + size; 5790 while (s < end) { 5791 unsigned char c; 5792 Py_UCS4 x; 5793 int i; 5794 int count; 5795 5796 /* Non-escape characters are interpreted as Unicode ordinals */ 5797 if (*s != '\\') { 5798 x = (unsigned char)*s++; 5799 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1) 5800 goto onError; 5801 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x); 5802 writer.pos++; 5803 continue; 5804 } 5805 startinpos = s-starts; 5806 5807 /* \u-escapes are only interpreted iff the number of leading 5808 backslashes if odd */ 5809 bs = s; 5810 for (;s < end;) { 5811 if (*s != '\\') 5812 break; 5813 x = (unsigned char)*s++; 5814 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1) 5815 goto onError; 5816 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x); 5817 writer.pos++; 5818 } 5819 if (((s - bs) & 1) == 0 || 5820 s >= end || 5821 (*s != 'u' && *s != 'U')) { 5822 continue; 5823 } 5824 writer.pos--; 5825 count = *s=='u' ? 
4 : 8; 5826 s++; 5827 5828 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5829 for (x = 0, i = 0; i < count; ++i, ++s) { 5830 c = (unsigned char)*s; 5831 if (!Py_ISXDIGIT(c)) { 5832 endinpos = s-starts; 5833 if (unicode_decode_call_errorhandler_writer( 5834 errors, &errorHandler, 5835 "rawunicodeescape", "truncated \\uXXXX", 5836 &starts, &end, &startinpos, &endinpos, &exc, &s, 5837 &writer)) 5838 goto onError; 5839 goto nextByte; 5840 } 5841 x = (x<<4) & ~0xF; 5842 if (c >= '0' && c <= '9') 5843 x += c - '0'; 5844 else if (c >= 'a' && c <= 'f') 5845 x += 10 + c - 'a'; 5846 else 5847 x += 10 + c - 'A'; 5848 } 5849 if (x <= MAX_UNICODE) { 5850 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1) 5851 goto onError; 5852 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x); 5853 writer.pos++; 5854 } 5855 else { 5856 endinpos = s-starts; 5857 if (unicode_decode_call_errorhandler_writer( 5858 errors, &errorHandler, 5859 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5860 &starts, &end, &startinpos, &endinpos, &exc, &s, 5861 &writer)) 5862 goto onError; 5863 } 5864 nextByte: 5865 ; 5866 } 5867 Py_XDECREF(errorHandler); 5868 Py_XDECREF(exc); 5869 return _PyUnicodeWriter_Finish(&writer); 5870 5871 onError: 5872 _PyUnicodeWriter_Dealloc(&writer); 5873 Py_XDECREF(errorHandler); 5874 Py_XDECREF(exc); 5875 return NULL; 5876} 5877 5878 5879PyObject * 5880PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 5881{ 5882 PyObject *repr; 5883 char *p; 5884 char *q; 5885 Py_ssize_t expandsize, pos; 5886 int kind; 5887 void *data; 5888 Py_ssize_t len; 5889 5890 if (!PyUnicode_Check(unicode)) { 5891 PyErr_BadArgument(); 5892 return NULL; 5893 } 5894 if (PyUnicode_READY(unicode) == -1) 5895 return NULL; 5896 kind = PyUnicode_KIND(unicode); 5897 data = PyUnicode_DATA(unicode); 5898 len = PyUnicode_GET_LENGTH(unicode); 5899 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 5900 bytes, and 1 byte characters 4. 
*/ 5901 expandsize = kind * 2 + 2; 5902 5903 if (len > PY_SSIZE_T_MAX / expandsize) 5904 return PyErr_NoMemory(); 5905 5906 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 5907 if (repr == NULL) 5908 return NULL; 5909 if (len == 0) 5910 return repr; 5911 5912 p = q = PyBytes_AS_STRING(repr); 5913 for (pos = 0; pos < len; pos++) { 5914 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 5915 /* Map 32-bit characters to '\Uxxxxxxxx' */ 5916 if (ch >= 0x10000) { 5917 assert(ch <= MAX_UNICODE); 5918 *p++ = '\\'; 5919 *p++ = 'U'; 5920 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 5921 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 5922 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 5923 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 5924 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 5925 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 5926 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 5927 *p++ = Py_hexdigits[ch & 15]; 5928 } 5929 /* Map 16-bit characters to '\uxxxx' */ 5930 else if (ch >= 256) { 5931 *p++ = '\\'; 5932 *p++ = 'u'; 5933 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 5934 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 5935 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 5936 *p++ = Py_hexdigits[ch & 15]; 5937 } 5938 /* Copy everything else as-is */ 5939 else 5940 *p++ = (char) ch; 5941 } 5942 5943 assert(p > q); 5944 if (_PyBytes_Resize(&repr, p - q) < 0) 5945 return NULL; 5946 return repr; 5947} 5948 5949PyObject * 5950PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 5951 Py_ssize_t size) 5952{ 5953 PyObject *result; 5954 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5955 if (tmp == NULL) 5956 return NULL; 5957 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 5958 Py_DECREF(tmp); 5959 return result; 5960} 5961 5962/* --- Unicode Internal Codec ------------------------------------------- */ 5963 5964PyObject * 5965_PyUnicode_DecodeUnicodeInternal(const char *s, 5966 Py_ssize_t size, 5967 const char *errors) 5968{ 5969 const char *starts = s; 5970 Py_ssize_t startinpos; 5971 Py_ssize_t endinpos; 5972 _PyUnicodeWriter writer; 
5973 const char *end; 5974 const char *reason; 5975 PyObject *errorHandler = NULL; 5976 PyObject *exc = NULL; 5977 5978 if (PyErr_WarnEx(PyExc_DeprecationWarning, 5979 "unicode_internal codec has been deprecated", 5980 1)) 5981 return NULL; 5982 5983 if (size == 0) { 5984 Py_INCREF(unicode_empty); 5985 return unicode_empty; 5986 } 5987 5988 /* XXX overflow detection missing */ 5989 _PyUnicodeWriter_Init(&writer, 0); 5990 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1) 5991 goto onError; 5992 end = s + size; 5993 5994 while (s < end) { 5995 Py_UNICODE uch; 5996 Py_UCS4 ch; 5997 /* We copy the raw representation one byte at a time because the 5998 pointer may be unaligned (see test_codeccallbacks). */ 5999 ((char *) &uch)[0] = s[0]; 6000 ((char *) &uch)[1] = s[1]; 6001#ifdef Py_UNICODE_WIDE 6002 ((char *) &uch)[2] = s[2]; 6003 ((char *) &uch)[3] = s[3]; 6004#endif 6005 ch = uch; 6006 6007 /* We have to sanity check the raw data, otherwise doom looms for 6008 some malformed UCS-4 data. 
*/ 6009 if ( 6010#ifdef Py_UNICODE_WIDE 6011 ch > 0x10ffff || 6012#endif 6013 end-s < Py_UNICODE_SIZE 6014 ) 6015 { 6016 startinpos = s - starts; 6017 if (end-s < Py_UNICODE_SIZE) { 6018 endinpos = end-starts; 6019 reason = "truncated input"; 6020 } 6021 else { 6022 endinpos = s - starts + Py_UNICODE_SIZE; 6023 reason = "illegal code point (> 0x10FFFF)"; 6024 } 6025 if (unicode_decode_call_errorhandler_writer( 6026 errors, &errorHandler, 6027 "unicode_internal", reason, 6028 &starts, &end, &startinpos, &endinpos, &exc, &s, 6029 &writer)) 6030 goto onError; 6031 continue; 6032 } 6033 6034 s += Py_UNICODE_SIZE; 6035#ifndef Py_UNICODE_WIDE 6036 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end) 6037 { 6038 Py_UNICODE uch2; 6039 ((char *) &uch2)[0] = s[0]; 6040 ((char *) &uch2)[1] = s[1]; 6041 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6042 { 6043 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6044 s += Py_UNICODE_SIZE; 6045 } 6046 } 6047#endif 6048 6049 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) 6050 goto onError; 6051 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); 6052 writer.pos++; 6053 } 6054 6055 Py_XDECREF(errorHandler); 6056 Py_XDECREF(exc); 6057 return _PyUnicodeWriter_Finish(&writer); 6058 6059 onError: 6060 _PyUnicodeWriter_Dealloc(&writer); 6061 Py_XDECREF(errorHandler); 6062 Py_XDECREF(exc); 6063 return NULL; 6064} 6065 6066/* --- Latin-1 Codec ------------------------------------------------------ */ 6067 6068PyObject * 6069PyUnicode_DecodeLatin1(const char *s, 6070 Py_ssize_t size, 6071 const char *errors) 6072{ 6073 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6074 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6075} 6076 6077/* create or adjust a UnicodeEncodeError */ 6078static void 6079make_encode_exception(PyObject **exceptionObject, 6080 const char *encoding, 6081 PyObject *unicode, 6082 Py_ssize_t startpos, Py_ssize_t endpos, 6083 const char *reason) 6084{ 6085 if (*exceptionObject == NULL) { 6086 *exceptionObject = PyObject_CallFunction( 6087 PyExc_UnicodeEncodeError, "sOnns", 6088 encoding, unicode, startpos, endpos, reason); 6089 } 6090 else { 6091 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6092 goto onError; 6093 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6094 goto onError; 6095 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6096 goto onError; 6097 return; 6098 onError: 6099 Py_DECREF(*exceptionObject); 6100 *exceptionObject = NULL; 6101 } 6102} 6103 6104/* raises a UnicodeEncodeError */ 6105static void 6106raise_encode_exception(PyObject **exceptionObject, 6107 const char *encoding, 6108 PyObject *unicode, 6109 Py_ssize_t startpos, Py_ssize_t endpos, 6110 const char *reason) 6111{ 6112 make_encode_exception(exceptionObject, 6113 encoding, unicode, startpos, endpos, reason); 6114 if (*exceptionObject != NULL) 6115 PyCodec_StrictErrors(*exceptionObject); 6116} 6117 6118/* error handling callback helper: 6119 build arguments, call the callback and check the arguments, 6120 put the result into newpos and return the replacement string, which 6121 has to be freed by the caller */ 6122static PyObject * 6123unicode_encode_call_errorhandler(const char *errors, 6124 PyObject **errorHandler, 6125 const char *encoding, const char *reason, 6126 PyObject *unicode, PyObject **exceptionObject, 6127 Py_ssize_t startpos, Py_ssize_t endpos, 6128 Py_ssize_t *newpos) 6129{ 6130 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6131 Py_ssize_t len; 6132 PyObject *restuple; 6133 PyObject *resunicode; 6134 6135 if (*errorHandler == NULL) 
{ 6136 *errorHandler = PyCodec_LookupError(errors); 6137 if (*errorHandler == NULL) 6138 return NULL; 6139 } 6140 6141 if (PyUnicode_READY(unicode) == -1) 6142 return NULL; 6143 len = PyUnicode_GET_LENGTH(unicode); 6144 6145 make_encode_exception(exceptionObject, 6146 encoding, unicode, startpos, endpos, reason); 6147 if (*exceptionObject == NULL) 6148 return NULL; 6149 6150 restuple = PyObject_CallFunctionObjArgs( 6151 *errorHandler, *exceptionObject, NULL); 6152 if (restuple == NULL) 6153 return NULL; 6154 if (!PyTuple_Check(restuple)) { 6155 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6156 Py_DECREF(restuple); 6157 return NULL; 6158 } 6159 if (!PyArg_ParseTuple(restuple, argparse, 6160 &resunicode, newpos)) { 6161 Py_DECREF(restuple); 6162 return NULL; 6163 } 6164 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6165 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6166 Py_DECREF(restuple); 6167 return NULL; 6168 } 6169 if (*newpos<0) 6170 *newpos = len + *newpos; 6171 if (*newpos<0 || *newpos>len) { 6172 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6173 Py_DECREF(restuple); 6174 return NULL; 6175 } 6176 Py_INCREF(resunicode); 6177 Py_DECREF(restuple); 6178 return resunicode; 6179} 6180 6181static PyObject * 6182unicode_encode_ucs1(PyObject *unicode, 6183 const char *errors, 6184 unsigned int limit) 6185{ 6186 /* input state */ 6187 Py_ssize_t pos=0, size; 6188 int kind; 6189 void *data; 6190 /* output object */ 6191 PyObject *res; 6192 /* pointer into the output */ 6193 char *str; 6194 /* current output position */ 6195 Py_ssize_t ressize; 6196 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6197 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6198 PyObject *errorHandler = NULL; 6199 PyObject *exc = NULL; 6200 /* the following variable is used for caching string comparisons 6201 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6202 int known_errorHandler = -1; 6203 6204 if (PyUnicode_READY(unicode) == -1) 6205 return NULL; 6206 size = PyUnicode_GET_LENGTH(unicode); 6207 kind = PyUnicode_KIND(unicode); 6208 data = PyUnicode_DATA(unicode); 6209 /* allocate enough for a simple encoding without 6210 replacements, if we need more, we'll resize */ 6211 if (size == 0) 6212 return PyBytes_FromStringAndSize(NULL, 0); 6213 res = PyBytes_FromStringAndSize(NULL, size); 6214 if (res == NULL) 6215 return NULL; 6216 str = PyBytes_AS_STRING(res); 6217 ressize = size; 6218 6219 while (pos < size) { 6220 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6221 6222 /* can we encode this? */ 6223 if (c<limit) { 6224 /* no overflow check, because we know that the space is enough */ 6225 *str++ = (char)c; 6226 ++pos; 6227 } 6228 else { 6229 Py_ssize_t requiredsize; 6230 PyObject *repunicode; 6231 Py_ssize_t repsize, newpos, respos, i; 6232 /* startpos for collecting unencodable chars */ 6233 Py_ssize_t collstart = pos; 6234 Py_ssize_t collend = pos; 6235 /* find all unecodable characters */ 6236 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) 6237 ++collend; 6238 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6239 if (known_errorHandler==-1) { 6240 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6241 known_errorHandler = 1; 6242 else if (!strcmp(errors, "replace")) 6243 known_errorHandler = 2; 6244 else if (!strcmp(errors, "ignore")) 6245 known_errorHandler = 3; 6246 else if (!strcmp(errors, "xmlcharrefreplace")) 6247 known_errorHandler = 4; 6248 else 6249 known_errorHandler = 0; 6250 } 6251 switch (known_errorHandler) { 6252 case 1: /* strict */ 6253 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6254 goto onError; 6255 case 2: /* replace */ 6256 while (collstart++<collend) 6257 *str++ = '?'; /* fall through */ 6258 case 3: /* ignore */ 6259 pos = collend; 6260 break; 6261 case 4: /* xmlcharrefreplace */ 6262 respos = str - PyBytes_AS_STRING(res); 6263 /* determine replacement size */ 6264 for (i = collstart, repsize = 0; i < collend; ++i) { 6265 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6266 if (ch < 10) 6267 repsize += 2+1+1; 6268 else if (ch < 100) 6269 repsize += 2+2+1; 6270 else if (ch < 1000) 6271 repsize += 2+3+1; 6272 else if (ch < 10000) 6273 repsize += 2+4+1; 6274 else if (ch < 100000) 6275 repsize += 2+5+1; 6276 else if (ch < 1000000) 6277 repsize += 2+6+1; 6278 else { 6279 assert(ch <= MAX_UNICODE); 6280 repsize += 2+7+1; 6281 } 6282 } 6283 requiredsize = respos+repsize+(size-collend); 6284 if (requiredsize > ressize) { 6285 if (requiredsize<2*ressize) 6286 requiredsize = 2*ressize; 6287 if (_PyBytes_Resize(&res, requiredsize)) 6288 goto onError; 6289 str = PyBytes_AS_STRING(res) + respos; 6290 ressize = requiredsize; 6291 } 6292 /* generate replacement */ 6293 for (i = collstart; i < collend; ++i) { 6294 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6295 } 6296 pos = collend; 6297 break; 6298 default: 6299 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6300 encoding, reason, unicode, &exc, 6301 collstart, collend, &newpos); 6302 if (repunicode == NULL || 
(PyUnicode_Check(repunicode) && 6303 PyUnicode_READY(repunicode) == -1)) 6304 goto onError; 6305 if (PyBytes_Check(repunicode)) { 6306 /* Directly copy bytes result to output. */ 6307 repsize = PyBytes_Size(repunicode); 6308 if (repsize > 1) { 6309 /* Make room for all additional bytes. */ 6310 respos = str - PyBytes_AS_STRING(res); 6311 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6312 Py_DECREF(repunicode); 6313 goto onError; 6314 } 6315 str = PyBytes_AS_STRING(res) + respos; 6316 ressize += repsize-1; 6317 } 6318 memcpy(str, PyBytes_AsString(repunicode), repsize); 6319 str += repsize; 6320 pos = newpos; 6321 Py_DECREF(repunicode); 6322 break; 6323 } 6324 /* need more space? (at least enough for what we 6325 have+the replacement+the rest of the string, so 6326 we won't have to check space for encodable characters) */ 6327 respos = str - PyBytes_AS_STRING(res); 6328 repsize = PyUnicode_GET_LENGTH(repunicode); 6329 requiredsize = respos+repsize+(size-collend); 6330 if (requiredsize > ressize) { 6331 if (requiredsize<2*ressize) 6332 requiredsize = 2*ressize; 6333 if (_PyBytes_Resize(&res, requiredsize)) { 6334 Py_DECREF(repunicode); 6335 goto onError; 6336 } 6337 str = PyBytes_AS_STRING(res) + respos; 6338 ressize = requiredsize; 6339 } 6340 /* check if there is anything unencodable in the replacement 6341 and copy it to the output */ 6342 for (i = 0; repsize-->0; ++i, ++str) { 6343 c = PyUnicode_READ_CHAR(repunicode, i); 6344 if (c >= limit) { 6345 raise_encode_exception(&exc, encoding, unicode, 6346 pos, pos+1, reason); 6347 Py_DECREF(repunicode); 6348 goto onError; 6349 } 6350 *str = (char)c; 6351 } 6352 pos = newpos; 6353 Py_DECREF(repunicode); 6354 } 6355 } 6356 } 6357 /* Resize if we allocated to much */ 6358 size = str - PyBytes_AS_STRING(res); 6359 if (size < ressize) { /* If this falls res will be NULL */ 6360 assert(size >= 0); 6361 if (_PyBytes_Resize(&res, size) < 0) 6362 goto onError; 6363 } 6364 6365 Py_XDECREF(errorHandler); 6366 Py_XDECREF(exc); 
6367 return res; 6368 6369 onError: 6370 Py_XDECREF(res); 6371 Py_XDECREF(errorHandler); 6372 Py_XDECREF(exc); 6373 return NULL; 6374} 6375 6376/* Deprecated */ 6377PyObject * 6378PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6379 Py_ssize_t size, 6380 const char *errors) 6381{ 6382 PyObject *result; 6383 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6384 if (unicode == NULL) 6385 return NULL; 6386 result = unicode_encode_ucs1(unicode, errors, 256); 6387 Py_DECREF(unicode); 6388 return result; 6389} 6390 6391PyObject * 6392_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6393{ 6394 if (!PyUnicode_Check(unicode)) { 6395 PyErr_BadArgument(); 6396 return NULL; 6397 } 6398 if (PyUnicode_READY(unicode) == -1) 6399 return NULL; 6400 /* Fast path: if it is a one-byte string, construct 6401 bytes object directly. */ 6402 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6403 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6404 PyUnicode_GET_LENGTH(unicode)); 6405 /* Non-Latin-1 characters present. Defer to above function to 6406 raise the exception. */ 6407 return unicode_encode_ucs1(unicode, errors, 256); 6408} 6409 6410PyObject* 6411PyUnicode_AsLatin1String(PyObject *unicode) 6412{ 6413 return _PyUnicode_AsLatin1String(unicode, NULL); 6414} 6415 6416/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6417 6418PyObject * 6419PyUnicode_DecodeASCII(const char *s, 6420 Py_ssize_t size, 6421 const char *errors) 6422{ 6423 const char *starts = s; 6424 _PyUnicodeWriter writer; 6425 int kind; 6426 void *data; 6427 Py_ssize_t startinpos; 6428 Py_ssize_t endinpos; 6429 Py_ssize_t outpos; 6430 const char *e; 6431 PyObject *errorHandler = NULL; 6432 PyObject *exc = NULL; 6433 6434 if (size == 0) { 6435 Py_INCREF(unicode_empty); 6436 return unicode_empty; 6437 } 6438 6439 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 6440 if (size == 1 && (unsigned char)s[0] < 128) 6441 return get_latin1_char((unsigned char)s[0]); 6442 6443 _PyUnicodeWriter_Init(&writer, 0); 6444 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 6445 goto onError; 6446 6447 e = s + size; 6448 data = writer.data; 6449 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6450 writer.pos = outpos; 6451 if (writer.pos == size) 6452 return _PyUnicodeWriter_Finish(&writer); 6453 6454 s += writer.pos; 6455 kind = writer.kind; 6456 while (s < e) { 6457 register unsigned char c = (unsigned char)*s; 6458 if (c < 128) { 6459 PyUnicode_WRITE(kind, data, writer.pos, c); 6460 writer.pos++; 6461 ++s; 6462 } 6463 else { 6464 startinpos = s-starts; 6465 endinpos = startinpos + 1; 6466 if (unicode_decode_call_errorhandler_writer( 6467 errors, &errorHandler, 6468 "ascii", "ordinal not in range(128)", 6469 &starts, &e, &startinpos, &endinpos, &exc, &s, 6470 &writer)) 6471 goto onError; 6472 kind = writer.kind; 6473 data = writer.data; 6474 } 6475 } 6476 Py_XDECREF(errorHandler); 6477 Py_XDECREF(exc); 6478 return _PyUnicodeWriter_Finish(&writer); 6479 6480 onError: 6481 _PyUnicodeWriter_Dealloc(&writer); 6482 Py_XDECREF(errorHandler); 6483 Py_XDECREF(exc); 6484 return NULL; 6485} 6486 6487/* Deprecated */ 6488PyObject * 6489PyUnicode_EncodeASCII(const Py_UNICODE *p, 6490 Py_ssize_t size, 6491 const char *errors) 6492{ 6493 PyObject *result; 6494 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6495 if (unicode == NULL) 6496 return NULL; 6497 result = unicode_encode_ucs1(unicode, errors, 128); 6498 Py_DECREF(unicode); 6499 return result; 6500} 6501 6502PyObject * 6503_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6504{ 6505 if (!PyUnicode_Check(unicode)) { 6506 PyErr_BadArgument(); 6507 return NULL; 6508 } 6509 if (PyUnicode_READY(unicode) == -1) 6510 return NULL; 6511 /* Fast path: if it is an ASCII-only string, construct bytes object 6512 directly. Else defer to above function to raise the exception. 
*/ 6513 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6514 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6515 PyUnicode_GET_LENGTH(unicode)); 6516 return unicode_encode_ucs1(unicode, errors, 128); 6517} 6518 6519PyObject * 6520PyUnicode_AsASCIIString(PyObject *unicode) 6521{ 6522 return _PyUnicode_AsASCIIString(unicode, NULL); 6523} 6524 6525#ifdef HAVE_MBCS 6526 6527/* --- MBCS codecs for Windows -------------------------------------------- */ 6528 6529#if SIZEOF_INT < SIZEOF_SIZE_T 6530#define NEED_RETRY 6531#endif 6532 6533#ifndef WC_ERR_INVALID_CHARS 6534# define WC_ERR_INVALID_CHARS 0x0080 6535#endif 6536 6537static char* 6538code_page_name(UINT code_page, PyObject **obj) 6539{ 6540 *obj = NULL; 6541 if (code_page == CP_ACP) 6542 return "mbcs"; 6543 if (code_page == CP_UTF7) 6544 return "CP_UTF7"; 6545 if (code_page == CP_UTF8) 6546 return "CP_UTF8"; 6547 6548 *obj = PyBytes_FromFormat("cp%u", code_page); 6549 if (*obj == NULL) 6550 return NULL; 6551 return PyBytes_AS_STRING(*obj); 6552} 6553 6554static int 6555is_dbcs_lead_byte(UINT code_page, const char *s, int offset) 6556{ 6557 const char *curr = s + offset; 6558 const char *prev; 6559 6560 if (!IsDBCSLeadByteEx(code_page, *curr)) 6561 return 0; 6562 6563 prev = CharPrevExA(code_page, s, curr, 0); 6564 if (prev == curr) 6565 return 1; 6566 /* FIXME: This code is limited to "true" double-byte encodings, 6567 as it assumes an incomplete character consists of a single 6568 byte. */ 6569 if (curr - prev == 2) 6570 return 1; 6571 if (!IsDBCSLeadByteEx(code_page, *prev)) 6572 return 1; 6573 return 0; 6574} 6575 6576static DWORD 6577decode_code_page_flags(UINT code_page) 6578{ 6579 if (code_page == CP_UTF7) { 6580 /* The CP_UTF7 decoder only supports flags=0 */ 6581 return 0; 6582 } 6583 else 6584 return MB_ERR_INVALID_CHARS; 6585} 6586 6587/* 6588 * Decode a byte string from a Windows code page into unicode object in strict 6589 * mode. 
6590 * 6591 * Returns consumed size if succeed, returns -2 on decode error, or raise a 6592 * WindowsError and returns -1 on other error. 6593 */ 6594static int 6595decode_code_page_strict(UINT code_page, 6596 PyObject **v, 6597 const char *in, 6598 int insize) 6599{ 6600 const DWORD flags = decode_code_page_flags(code_page); 6601 wchar_t *out; 6602 DWORD outsize; 6603 6604 /* First get the size of the result */ 6605 assert(insize > 0); 6606 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6607 if (outsize <= 0) 6608 goto error; 6609 6610 if (*v == NULL) { 6611 /* Create unicode object */ 6612 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6613 *v = (PyObject*)_PyUnicode_New(outsize); 6614 if (*v == NULL) 6615 return -1; 6616 out = PyUnicode_AS_UNICODE(*v); 6617 } 6618 else { 6619 /* Extend unicode object */ 6620 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6621 if (unicode_resize(v, n + outsize) < 0) 6622 return -1; 6623 out = PyUnicode_AS_UNICODE(*v) + n; 6624 } 6625 6626 /* Do the conversion */ 6627 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6628 if (outsize <= 0) 6629 goto error; 6630 return insize; 6631 6632error: 6633 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6634 return -2; 6635 PyErr_SetFromWindowsErr(0); 6636 return -1; 6637} 6638 6639/* 6640 * Decode a byte string from a code page into unicode object with an error 6641 * handler. 6642 * 6643 * Returns consumed size if succeed, or raise a WindowsError or 6644 * UnicodeDecodeError exception and returns -1 on error. 6645 */ 6646static int 6647decode_code_page_errors(UINT code_page, 6648 PyObject **v, 6649 const char *in, const int size, 6650 const char *errors) 6651{ 6652 const char *startin = in; 6653 const char *endin = in + size; 6654 const DWORD flags = decode_code_page_flags(code_page); 6655 /* Ideally, we should get reason from FormatMessage. This is the Windows 6656 2000 English version of the message. 
*/ 6657 const char *reason = "No mapping for the Unicode character exists " 6658 "in the target code page."; 6659 /* each step cannot decode more than 1 character, but a character can be 6660 represented as a surrogate pair */ 6661 wchar_t buffer[2], *startout, *out; 6662 int insize, outsize; 6663 PyObject *errorHandler = NULL; 6664 PyObject *exc = NULL; 6665 PyObject *encoding_obj = NULL; 6666 char *encoding; 6667 DWORD err; 6668 int ret = -1; 6669 6670 assert(size > 0); 6671 6672 encoding = code_page_name(code_page, &encoding_obj); 6673 if (encoding == NULL) 6674 return -1; 6675 6676 if (errors == NULL || strcmp(errors, "strict") == 0) { 6677 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6678 UnicodeDecodeError. */ 6679 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6680 if (exc != NULL) { 6681 PyCodec_StrictErrors(exc); 6682 Py_CLEAR(exc); 6683 } 6684 goto error; 6685 } 6686 6687 if (*v == NULL) { 6688 /* Create unicode object */ 6689 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6690 PyErr_NoMemory(); 6691 goto error; 6692 } 6693 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6694 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6695 if (*v == NULL) 6696 goto error; 6697 startout = PyUnicode_AS_UNICODE(*v); 6698 } 6699 else { 6700 /* Extend unicode object */ 6701 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6702 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6703 PyErr_NoMemory(); 6704 goto error; 6705 } 6706 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6707 goto error; 6708 startout = PyUnicode_AS_UNICODE(*v) + n; 6709 } 6710 6711 /* Decode the byte string character per character */ 6712 out = startout; 6713 while (in < endin) 6714 { 6715 /* Decode a character */ 6716 insize = 1; 6717 do 6718 { 6719 outsize = MultiByteToWideChar(code_page, flags, 6720 in, insize, 6721 buffer, Py_ARRAY_LENGTH(buffer)); 6722 if (outsize > 0) 
6723 break; 6724 err = GetLastError(); 6725 if (err != ERROR_NO_UNICODE_TRANSLATION 6726 && err != ERROR_INSUFFICIENT_BUFFER) 6727 { 6728 PyErr_SetFromWindowsErr(0); 6729 goto error; 6730 } 6731 insize++; 6732 } 6733 /* 4=maximum length of a UTF-8 sequence */ 6734 while (insize <= 4 && (in + insize) <= endin); 6735 6736 if (outsize <= 0) { 6737 Py_ssize_t startinpos, endinpos, outpos; 6738 6739 startinpos = in - startin; 6740 endinpos = startinpos + 1; 6741 outpos = out - PyUnicode_AS_UNICODE(*v); 6742 if (unicode_decode_call_errorhandler_wchar( 6743 errors, &errorHandler, 6744 encoding, reason, 6745 &startin, &endin, &startinpos, &endinpos, &exc, &in, 6746 v, &outpos)) 6747 { 6748 goto error; 6749 } 6750 out = PyUnicode_AS_UNICODE(*v) + outpos; 6751 } 6752 else { 6753 in += insize; 6754 memcpy(out, buffer, outsize * sizeof(wchar_t)); 6755 out += outsize; 6756 } 6757 } 6758 6759 /* write a NUL character at the end */ 6760 *out = 0; 6761 6762 /* Extend unicode object */ 6763 outsize = out - startout; 6764 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 6765 if (unicode_resize(v, outsize) < 0) 6766 goto error; 6767 ret = size; 6768 6769error: 6770 Py_XDECREF(encoding_obj); 6771 Py_XDECREF(errorHandler); 6772 Py_XDECREF(exc); 6773 return ret; 6774} 6775 6776static PyObject * 6777decode_code_page_stateful(int code_page, 6778 const char *s, Py_ssize_t size, 6779 const char *errors, Py_ssize_t *consumed) 6780{ 6781 PyObject *v = NULL; 6782 int chunk_size, final, converted, done; 6783 6784 if (code_page < 0) { 6785 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 6786 return NULL; 6787 } 6788 6789 if (consumed) 6790 *consumed = 0; 6791 6792 do 6793 { 6794#ifdef NEED_RETRY 6795 if (size > INT_MAX) { 6796 chunk_size = INT_MAX; 6797 final = 0; 6798 done = 0; 6799 } 6800 else 6801#endif 6802 { 6803 chunk_size = (int)size; 6804 final = (consumed == NULL); 6805 done = 1; 6806 } 6807 6808 /* Skip trailing lead-byte unless 'final' is set */ 6809 if (!final && 
is_dbcs_lead_byte(code_page, s, chunk_size - 1)) 6810 --chunk_size; 6811 6812 if (chunk_size == 0 && done) { 6813 if (v != NULL) 6814 break; 6815 Py_INCREF(unicode_empty); 6816 return unicode_empty; 6817 } 6818 6819 6820 converted = decode_code_page_strict(code_page, &v, 6821 s, chunk_size); 6822 if (converted == -2) 6823 converted = decode_code_page_errors(code_page, &v, 6824 s, chunk_size, 6825 errors); 6826 assert(converted != 0); 6827 6828 if (converted < 0) { 6829 Py_XDECREF(v); 6830 return NULL; 6831 } 6832 6833 if (consumed) 6834 *consumed += converted; 6835 6836 s += converted; 6837 size -= converted; 6838 } while (!done); 6839 6840 return unicode_result(v); 6841} 6842 6843PyObject * 6844PyUnicode_DecodeCodePageStateful(int code_page, 6845 const char *s, 6846 Py_ssize_t size, 6847 const char *errors, 6848 Py_ssize_t *consumed) 6849{ 6850 return decode_code_page_stateful(code_page, s, size, errors, consumed); 6851} 6852 6853PyObject * 6854PyUnicode_DecodeMBCSStateful(const char *s, 6855 Py_ssize_t size, 6856 const char *errors, 6857 Py_ssize_t *consumed) 6858{ 6859 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 6860} 6861 6862PyObject * 6863PyUnicode_DecodeMBCS(const char *s, 6864 Py_ssize_t size, 6865 const char *errors) 6866{ 6867 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 6868} 6869 6870static DWORD 6871encode_code_page_flags(UINT code_page, const char *errors) 6872{ 6873 if (code_page == CP_UTF8) { 6874 if (winver.dwMajorVersion >= 6) 6875 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 6876 and later */ 6877 return WC_ERR_INVALID_CHARS; 6878 else 6879 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 6880 return 0; 6881 } 6882 else if (code_page == CP_UTF7) { 6883 /* CP_UTF7 only supports flags=0 */ 6884 return 0; 6885 } 6886 else { 6887 if (errors != NULL && strcmp(errors, "replace") == 0) 6888 return 0; 6889 else 6890 return WC_NO_BEST_FIT_CHARS; 6891 } 6892} 6893 6894/* 6895 * Encode a 
Unicode string to a Windows code page into a byte string in strict 6896 * mode. 6897 * 6898 * Returns consumed characters if succeed, returns -2 on encode error, or raise 6899 * a WindowsError and returns -1 on other error. 6900 */ 6901static int 6902encode_code_page_strict(UINT code_page, PyObject **outbytes, 6903 PyObject *unicode, Py_ssize_t offset, int len, 6904 const char* errors) 6905{ 6906 BOOL usedDefaultChar = FALSE; 6907 BOOL *pusedDefaultChar = &usedDefaultChar; 6908 int outsize; 6909 PyObject *exc = NULL; 6910 wchar_t *p; 6911 Py_ssize_t size; 6912 const DWORD flags = encode_code_page_flags(code_page, NULL); 6913 char *out; 6914 /* Create a substring so that we can get the UTF-16 representation 6915 of just the slice under consideration. */ 6916 PyObject *substring; 6917 6918 assert(len > 0); 6919 6920 if (code_page != CP_UTF8 && code_page != CP_UTF7) 6921 pusedDefaultChar = &usedDefaultChar; 6922 else 6923 pusedDefaultChar = NULL; 6924 6925 substring = PyUnicode_Substring(unicode, offset, offset+len); 6926 if (substring == NULL) 6927 return -1; 6928 p = PyUnicode_AsUnicodeAndSize(substring, &size); 6929 if (p == NULL) { 6930 Py_DECREF(substring); 6931 return -1; 6932 } 6933 6934 /* First get the size of the result */ 6935 outsize = WideCharToMultiByte(code_page, flags, 6936 p, size, 6937 NULL, 0, 6938 NULL, pusedDefaultChar); 6939 if (outsize <= 0) 6940 goto error; 6941 /* If we used a default char, then we failed! 
*/ 6942 if (pusedDefaultChar && *pusedDefaultChar) { 6943 Py_DECREF(substring); 6944 return -2; 6945 } 6946 6947 if (*outbytes == NULL) { 6948 /* Create string object */ 6949 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 6950 if (*outbytes == NULL) { 6951 Py_DECREF(substring); 6952 return -1; 6953 } 6954 out = PyBytes_AS_STRING(*outbytes); 6955 } 6956 else { 6957 /* Extend string object */ 6958 const Py_ssize_t n = PyBytes_Size(*outbytes); 6959 if (outsize > PY_SSIZE_T_MAX - n) { 6960 PyErr_NoMemory(); 6961 Py_DECREF(substring); 6962 return -1; 6963 } 6964 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 6965 Py_DECREF(substring); 6966 return -1; 6967 } 6968 out = PyBytes_AS_STRING(*outbytes) + n; 6969 } 6970 6971 /* Do the conversion */ 6972 outsize = WideCharToMultiByte(code_page, flags, 6973 p, size, 6974 out, outsize, 6975 NULL, pusedDefaultChar); 6976 Py_CLEAR(substring); 6977 if (outsize <= 0) 6978 goto error; 6979 if (pusedDefaultChar && *pusedDefaultChar) 6980 return -2; 6981 return 0; 6982 6983error: 6984 Py_XDECREF(substring); 6985 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6986 return -2; 6987 PyErr_SetFromWindowsErr(0); 6988 return -1; 6989} 6990 6991/* 6992 * Encode a Unicode string to a Windows code page into a byte string using a 6993 * error handler. 6994 * 6995 * Returns consumed characters if succeed, or raise a WindowsError and returns 6996 * -1 on other error. 6997 */ 6998static int 6999encode_code_page_errors(UINT code_page, PyObject **outbytes, 7000 PyObject *unicode, Py_ssize_t unicode_offset, 7001 Py_ssize_t insize, const char* errors) 7002{ 7003 const DWORD flags = encode_code_page_flags(code_page, errors); 7004 Py_ssize_t pos = unicode_offset; 7005 Py_ssize_t endin = unicode_offset + insize; 7006 /* Ideally, we should get reason from FormatMessage. This is the Windows 7007 2000 English version of the message. 
*/ 7008 const char *reason = "invalid character"; 7009 /* 4=maximum length of a UTF-8 sequence */ 7010 char buffer[4]; 7011 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7012 Py_ssize_t outsize; 7013 char *out; 7014 PyObject *errorHandler = NULL; 7015 PyObject *exc = NULL; 7016 PyObject *encoding_obj = NULL; 7017 char *encoding; 7018 Py_ssize_t newpos, newoutsize; 7019 PyObject *rep; 7020 int ret = -1; 7021 7022 assert(insize > 0); 7023 7024 encoding = code_page_name(code_page, &encoding_obj); 7025 if (encoding == NULL) 7026 return -1; 7027 7028 if (errors == NULL || strcmp(errors, "strict") == 0) { 7029 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7030 then we raise a UnicodeEncodeError. */ 7031 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7032 if (exc != NULL) { 7033 PyCodec_StrictErrors(exc); 7034 Py_DECREF(exc); 7035 } 7036 Py_XDECREF(encoding_obj); 7037 return -1; 7038 } 7039 7040 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7041 pusedDefaultChar = &usedDefaultChar; 7042 else 7043 pusedDefaultChar = NULL; 7044 7045 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7046 PyErr_NoMemory(); 7047 goto error; 7048 } 7049 outsize = insize * Py_ARRAY_LENGTH(buffer); 7050 7051 if (*outbytes == NULL) { 7052 /* Create string object */ 7053 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7054 if (*outbytes == NULL) 7055 goto error; 7056 out = PyBytes_AS_STRING(*outbytes); 7057 } 7058 else { 7059 /* Extend string object */ 7060 Py_ssize_t n = PyBytes_Size(*outbytes); 7061 if (n > PY_SSIZE_T_MAX - outsize) { 7062 PyErr_NoMemory(); 7063 goto error; 7064 } 7065 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7066 goto error; 7067 out = PyBytes_AS_STRING(*outbytes) + n; 7068 } 7069 7070 /* Encode the string character per character */ 7071 while (pos < endin) 7072 { 7073 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7074 wchar_t chars[2]; 7075 int charsize; 7076 if (ch < 0x10000) { 7077 chars[0] = (wchar_t)ch; 7078 charsize = 1; 
7079 } 7080 else { 7081 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7082 chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7083 charsize = 2; 7084 } 7085 7086 outsize = WideCharToMultiByte(code_page, flags, 7087 chars, charsize, 7088 buffer, Py_ARRAY_LENGTH(buffer), 7089 NULL, pusedDefaultChar); 7090 if (outsize > 0) { 7091 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7092 { 7093 pos++; 7094 memcpy(out, buffer, outsize); 7095 out += outsize; 7096 continue; 7097 } 7098 } 7099 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7100 PyErr_SetFromWindowsErr(0); 7101 goto error; 7102 } 7103 7104 rep = unicode_encode_call_errorhandler( 7105 errors, &errorHandler, encoding, reason, 7106 unicode, &exc, 7107 pos, pos + 1, &newpos); 7108 if (rep == NULL) 7109 goto error; 7110 pos = newpos; 7111 7112 if (PyBytes_Check(rep)) { 7113 outsize = PyBytes_GET_SIZE(rep); 7114 if (outsize != 1) { 7115 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7116 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7117 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7118 Py_DECREF(rep); 7119 goto error; 7120 } 7121 out = PyBytes_AS_STRING(*outbytes) + offset; 7122 } 7123 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7124 out += outsize; 7125 } 7126 else { 7127 Py_ssize_t i; 7128 enum PyUnicode_Kind kind; 7129 void *data; 7130 7131 if (PyUnicode_READY(rep) == -1) { 7132 Py_DECREF(rep); 7133 goto error; 7134 } 7135 7136 outsize = PyUnicode_GET_LENGTH(rep); 7137 if (outsize != 1) { 7138 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7139 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7140 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7141 Py_DECREF(rep); 7142 goto error; 7143 } 7144 out = PyBytes_AS_STRING(*outbytes) + offset; 7145 } 7146 kind = PyUnicode_KIND(rep); 7147 data = PyUnicode_DATA(rep); 7148 for (i=0; i < outsize; i++) { 7149 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7150 if (ch > 127) { 7151 raise_encode_exception(&exc, 7152 encoding, 
unicode, 7153 pos, pos + 1, 7154 "unable to encode error handler result to ASCII"); 7155 Py_DECREF(rep); 7156 goto error; 7157 } 7158 *out = (unsigned char)ch; 7159 out++; 7160 } 7161 } 7162 Py_DECREF(rep); 7163 } 7164 /* write a NUL byte */ 7165 *out = 0; 7166 outsize = out - PyBytes_AS_STRING(*outbytes); 7167 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7168 if (_PyBytes_Resize(outbytes, outsize) < 0) 7169 goto error; 7170 ret = 0; 7171 7172error: 7173 Py_XDECREF(encoding_obj); 7174 Py_XDECREF(errorHandler); 7175 Py_XDECREF(exc); 7176 return ret; 7177} 7178 7179static PyObject * 7180encode_code_page(int code_page, 7181 PyObject *unicode, 7182 const char *errors) 7183{ 7184 Py_ssize_t len; 7185 PyObject *outbytes = NULL; 7186 Py_ssize_t offset; 7187 int chunk_len, ret, done; 7188 7189 if (PyUnicode_READY(unicode) == -1) 7190 return NULL; 7191 len = PyUnicode_GET_LENGTH(unicode); 7192 7193 if (code_page < 0) { 7194 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7195 return NULL; 7196 } 7197 7198 if (len == 0) 7199 return PyBytes_FromStringAndSize(NULL, 0); 7200 7201 offset = 0; 7202 do 7203 { 7204#ifdef NEED_RETRY 7205 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7206 chunks. 
*/ 7207 if (len > INT_MAX/2) { 7208 chunk_len = INT_MAX/2; 7209 done = 0; 7210 } 7211 else 7212#endif 7213 { 7214 chunk_len = (int)len; 7215 done = 1; 7216 } 7217 7218 ret = encode_code_page_strict(code_page, &outbytes, 7219 unicode, offset, chunk_len, 7220 errors); 7221 if (ret == -2) 7222 ret = encode_code_page_errors(code_page, &outbytes, 7223 unicode, offset, 7224 chunk_len, errors); 7225 if (ret < 0) { 7226 Py_XDECREF(outbytes); 7227 return NULL; 7228 } 7229 7230 offset += chunk_len; 7231 len -= chunk_len; 7232 } while (!done); 7233 7234 return outbytes; 7235} 7236 7237PyObject * 7238PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7239 Py_ssize_t size, 7240 const char *errors) 7241{ 7242 PyObject *unicode, *res; 7243 unicode = PyUnicode_FromUnicode(p, size); 7244 if (unicode == NULL) 7245 return NULL; 7246 res = encode_code_page(CP_ACP, unicode, errors); 7247 Py_DECREF(unicode); 7248 return res; 7249} 7250 7251PyObject * 7252PyUnicode_EncodeCodePage(int code_page, 7253 PyObject *unicode, 7254 const char *errors) 7255{ 7256 return encode_code_page(code_page, unicode, errors); 7257} 7258 7259PyObject * 7260PyUnicode_AsMBCSString(PyObject *unicode) 7261{ 7262 if (!PyUnicode_Check(unicode)) { 7263 PyErr_BadArgument(); 7264 return NULL; 7265 } 7266 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7267} 7268 7269#undef NEED_RETRY 7270 7271#endif /* HAVE_MBCS */ 7272 7273/* --- Character Mapping Codec -------------------------------------------- */ 7274 7275PyObject * 7276PyUnicode_DecodeCharmap(const char *s, 7277 Py_ssize_t size, 7278 PyObject *mapping, 7279 const char *errors) 7280{ 7281 const char *starts = s; 7282 Py_ssize_t startinpos; 7283 Py_ssize_t endinpos; 7284 const char *e; 7285 _PyUnicodeWriter writer; 7286 PyObject *errorHandler = NULL; 7287 PyObject *exc = NULL; 7288 7289 /* Default to Latin-1 */ 7290 if (mapping == NULL) 7291 return PyUnicode_DecodeLatin1(s, size, errors); 7292 7293 if (size == 0) { 7294 Py_INCREF(unicode_empty); 7295 return 
unicode_empty; 7296 } 7297 _PyUnicodeWriter_Init(&writer, 0); 7298 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 7299 goto onError; 7300 7301 e = s + size; 7302 if (PyUnicode_CheckExact(mapping)) { 7303 Py_ssize_t maplen; 7304 enum PyUnicode_Kind mapkind; 7305 void *mapdata; 7306 Py_UCS4 x; 7307 7308 if (PyUnicode_READY(mapping) == -1) 7309 return NULL; 7310 7311 maplen = PyUnicode_GET_LENGTH(mapping); 7312 mapdata = PyUnicode_DATA(mapping); 7313 mapkind = PyUnicode_KIND(mapping); 7314 while (s < e) { 7315 unsigned char ch; 7316 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7317 enum PyUnicode_Kind outkind = writer.kind; 7318 void *outdata = writer.data; 7319 if (outkind == PyUnicode_1BYTE_KIND) { 7320 Py_UCS4 maxchar = writer.maxchar; 7321 while (s < e) { 7322 unsigned char ch = *s; 7323 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); 7324 if (x > maxchar) 7325 goto Error; 7326 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, writer.pos, x); 7327 writer.pos++; 7328 ++s; 7329 } 7330 break; 7331 } 7332 else if (outkind == PyUnicode_2BYTE_KIND) { 7333 while (s < e) { 7334 unsigned char ch = *s; 7335 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); 7336 if (x == 0xFFFE) 7337 goto Error; 7338 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, writer.pos, x); 7339 writer.pos++; 7340 ++s; 7341 } 7342 break; 7343 } 7344 } 7345 ch = *s; 7346 7347 if (ch < maplen) 7348 x = PyUnicode_READ(mapkind, mapdata, ch); 7349 else 7350 x = 0xfffe; /* invalid value */ 7351Error: 7352 if (x == 0xfffe) 7353 { 7354 /* undefined mapping */ 7355 startinpos = s-starts; 7356 endinpos = startinpos+1; 7357 if (unicode_decode_call_errorhandler_writer( 7358 errors, &errorHandler, 7359 "charmap", "character maps to <undefined>", 7360 &starts, &e, &startinpos, &endinpos, &exc, &s, 7361 &writer)) { 7362 goto onError; 7363 } 7364 continue; 7365 } 7366 7367 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1) 7368 goto onError; 7369 PyUnicode_WRITE(writer.kind, writer.data, 
writer.pos, x); 7370 writer.pos++; 7371 ++s; 7372 } 7373 } 7374 else { 7375 while (s < e) { 7376 unsigned char ch = *s; 7377 PyObject *w, *x; 7378 7379 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7380 w = PyLong_FromLong((long)ch); 7381 if (w == NULL) 7382 goto onError; 7383 x = PyObject_GetItem(mapping, w); 7384 Py_DECREF(w); 7385 if (x == NULL) { 7386 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7387 /* No mapping found means: mapping is undefined. */ 7388 PyErr_Clear(); 7389 x = Py_None; 7390 Py_INCREF(x); 7391 } else 7392 goto onError; 7393 } 7394 7395 /* Apply mapping */ 7396 if (PyLong_Check(x)) { 7397 long value = PyLong_AS_LONG(x); 7398 if (value < 0 || value > MAX_UNICODE) { 7399 PyErr_Format(PyExc_TypeError, 7400 "character mapping must be in range(0x%lx)", 7401 (unsigned long)MAX_UNICODE + 1); 7402 Py_DECREF(x); 7403 goto onError; 7404 } 7405 7406 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1) 7407 goto onError; 7408 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value); 7409 writer.pos++; 7410 } 7411 else if (x == Py_None) { 7412 /* undefined mapping */ 7413 startinpos = s-starts; 7414 endinpos = startinpos+1; 7415 if (unicode_decode_call_errorhandler_writer( 7416 errors, &errorHandler, 7417 "charmap", "character maps to <undefined>", 7418 &starts, &e, &startinpos, &endinpos, &exc, &s, 7419 &writer)) { 7420 Py_DECREF(x); 7421 goto onError; 7422 } 7423 Py_DECREF(x); 7424 continue; 7425 } 7426 else if (PyUnicode_Check(x)) { 7427 writer.overallocate = 1; 7428 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1) 7429 goto onError; 7430 } 7431 else { 7432 /* wrong return value */ 7433 PyErr_SetString(PyExc_TypeError, 7434 "character mapping must return integer, None or str"); 7435 Py_DECREF(x); 7436 goto onError; 7437 } 7438 Py_DECREF(x); 7439 ++s; 7440 } 7441 } 7442 Py_XDECREF(errorHandler); 7443 Py_XDECREF(exc); 7444 return _PyUnicodeWriter_Finish(&writer); 7445 7446 onError: 7447 Py_XDECREF(errorHandler); 7448 
Py_XDECREF(exc); 7449 _PyUnicodeWriter_Dealloc(&writer); 7450 return NULL; 7451} 7452 7453/* Charmap encoding: the lookup table */ 7454 7455struct encoding_map { 7456 PyObject_HEAD 7457 unsigned char level1[32]; 7458 int count2, count3; 7459 unsigned char level23[1]; 7460}; 7461 7462static PyObject* 7463encoding_map_size(PyObject *obj, PyObject* args) 7464{ 7465 struct encoding_map *map = (struct encoding_map*)obj; 7466 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7467 128*map->count3); 7468} 7469 7470static PyMethodDef encoding_map_methods[] = { 7471 {"size", encoding_map_size, METH_NOARGS, 7472 PyDoc_STR("Return the size (in bytes) of this object") }, 7473 { 0 } 7474}; 7475 7476static void 7477encoding_map_dealloc(PyObject* o) 7478{ 7479 PyObject_FREE(o); 7480} 7481 7482static PyTypeObject EncodingMapType = { 7483 PyVarObject_HEAD_INIT(NULL, 0) 7484 "EncodingMap", /*tp_name*/ 7485 sizeof(struct encoding_map), /*tp_basicsize*/ 7486 0, /*tp_itemsize*/ 7487 /* methods */ 7488 encoding_map_dealloc, /*tp_dealloc*/ 7489 0, /*tp_print*/ 7490 0, /*tp_getattr*/ 7491 0, /*tp_setattr*/ 7492 0, /*tp_reserved*/ 7493 0, /*tp_repr*/ 7494 0, /*tp_as_number*/ 7495 0, /*tp_as_sequence*/ 7496 0, /*tp_as_mapping*/ 7497 0, /*tp_hash*/ 7498 0, /*tp_call*/ 7499 0, /*tp_str*/ 7500 0, /*tp_getattro*/ 7501 0, /*tp_setattro*/ 7502 0, /*tp_as_buffer*/ 7503 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7504 0, /*tp_doc*/ 7505 0, /*tp_traverse*/ 7506 0, /*tp_clear*/ 7507 0, /*tp_richcompare*/ 7508 0, /*tp_weaklistoffset*/ 7509 0, /*tp_iter*/ 7510 0, /*tp_iternext*/ 7511 encoding_map_methods, /*tp_methods*/ 7512 0, /*tp_members*/ 7513 0, /*tp_getset*/ 7514 0, /*tp_base*/ 7515 0, /*tp_dict*/ 7516 0, /*tp_descr_get*/ 7517 0, /*tp_descr_set*/ 7518 0, /*tp_dictoffset*/ 7519 0, /*tp_init*/ 7520 0, /*tp_alloc*/ 7521 0, /*tp_new*/ 7522 0, /*tp_free*/ 7523 0, /*tp_is_gc*/ 7524}; 7525 7526PyObject* 7527PyUnicode_BuildEncodingMap(PyObject* string) 7528{ 7529 PyObject *result; 7530 struct encoding_map 
*mresult; 7531 int i; 7532 int need_dict = 0; 7533 unsigned char level1[32]; 7534 unsigned char level2[512]; 7535 unsigned char *mlevel1, *mlevel2, *mlevel3; 7536 int count2 = 0, count3 = 0; 7537 int kind; 7538 void *data; 7539 Py_ssize_t length; 7540 Py_UCS4 ch; 7541 7542 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 7543 PyErr_BadArgument(); 7544 return NULL; 7545 } 7546 kind = PyUnicode_KIND(string); 7547 data = PyUnicode_DATA(string); 7548 length = PyUnicode_GET_LENGTH(string); 7549 length = Py_MIN(length, 256); 7550 memset(level1, 0xFF, sizeof level1); 7551 memset(level2, 0xFF, sizeof level2); 7552 7553 /* If there isn't a one-to-one mapping of NULL to \0, 7554 or if there are non-BMP characters, we need to use 7555 a mapping dictionary. */ 7556 if (PyUnicode_READ(kind, data, 0) != 0) 7557 need_dict = 1; 7558 for (i = 1; i < length; i++) { 7559 int l1, l2; 7560 ch = PyUnicode_READ(kind, data, i); 7561 if (ch == 0 || ch > 0xFFFF) { 7562 need_dict = 1; 7563 break; 7564 } 7565 if (ch == 0xFFFE) 7566 /* unmapped character */ 7567 continue; 7568 l1 = ch >> 11; 7569 l2 = ch >> 7; 7570 if (level1[l1] == 0xFF) 7571 level1[l1] = count2++; 7572 if (level2[l2] == 0xFF) 7573 level2[l2] = count3++; 7574 } 7575 7576 if (count2 >= 0xFF || count3 >= 0xFF) 7577 need_dict = 1; 7578 7579 if (need_dict) { 7580 PyObject *result = PyDict_New(); 7581 PyObject *key, *value; 7582 if (!result) 7583 return NULL; 7584 for (i = 0; i < length; i++) { 7585 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7586 value = PyLong_FromLong(i); 7587 if (!key || !value) 7588 goto failed1; 7589 if (PyDict_SetItem(result, key, value) == -1) 7590 goto failed1; 7591 Py_DECREF(key); 7592 Py_DECREF(value); 7593 } 7594 return result; 7595 failed1: 7596 Py_XDECREF(key); 7597 Py_XDECREF(value); 7598 Py_DECREF(result); 7599 return NULL; 7600 } 7601 7602 /* Create a three-level trie */ 7603 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7604 16*count2 + 128*count3 - 1); 7605 if 
(!result) 7606 return PyErr_NoMemory(); 7607 PyObject_Init(result, &EncodingMapType); 7608 mresult = (struct encoding_map*)result; 7609 mresult->count2 = count2; 7610 mresult->count3 = count3; 7611 mlevel1 = mresult->level1; 7612 mlevel2 = mresult->level23; 7613 mlevel3 = mresult->level23 + 16*count2; 7614 memcpy(mlevel1, level1, 32); 7615 memset(mlevel2, 0xFF, 16*count2); 7616 memset(mlevel3, 0, 128*count3); 7617 count3 = 0; 7618 for (i = 1; i < length; i++) { 7619 int o1, o2, o3, i2, i3; 7620 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7621 if (ch == 0xFFFE) 7622 /* unmapped character */ 7623 continue; 7624 o1 = ch>>11; 7625 o2 = (ch>>7) & 0xF; 7626 i2 = 16*mlevel1[o1] + o2; 7627 if (mlevel2[i2] == 0xFF) 7628 mlevel2[i2] = count3++; 7629 o3 = ch & 0x7F; 7630 i3 = 128*mlevel2[i2] + o3; 7631 mlevel3[i3] = i; 7632 } 7633 return result; 7634} 7635 7636static int 7637encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7638{ 7639 struct encoding_map *map = (struct encoding_map*)mapping; 7640 int l1 = c>>11; 7641 int l2 = (c>>7) & 0xF; 7642 int l3 = c & 0x7F; 7643 int i; 7644 7645 if (c > 0xFFFF) 7646 return -1; 7647 if (c == 0) 7648 return 0; 7649 /* level 1*/ 7650 i = map->level1[l1]; 7651 if (i == 0xFF) { 7652 return -1; 7653 } 7654 /* level 2*/ 7655 i = map->level23[16*i+l2]; 7656 if (i == 0xFF) { 7657 return -1; 7658 } 7659 /* level 3 */ 7660 i = map->level23[16*map->count2 + 128*i + l3]; 7661 if (i == 0) { 7662 return -1; 7663 } 7664 return i; 7665} 7666 7667/* Lookup the character ch in the mapping. If the character 7668 can't be found, Py_None is returned (or NULL, if another 7669 error occurred). 
*/ 7670static PyObject * 7671charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 7672{ 7673 PyObject *w = PyLong_FromLong((long)c); 7674 PyObject *x; 7675 7676 if (w == NULL) 7677 return NULL; 7678 x = PyObject_GetItem(mapping, w); 7679 Py_DECREF(w); 7680 if (x == NULL) { 7681 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7682 /* No mapping found means: mapping is undefined. */ 7683 PyErr_Clear(); 7684 x = Py_None; 7685 Py_INCREF(x); 7686 return x; 7687 } else 7688 return NULL; 7689 } 7690 else if (x == Py_None) 7691 return x; 7692 else if (PyLong_Check(x)) { 7693 long value = PyLong_AS_LONG(x); 7694 if (value < 0 || value > 255) { 7695 PyErr_SetString(PyExc_TypeError, 7696 "character mapping must be in range(256)"); 7697 Py_DECREF(x); 7698 return NULL; 7699 } 7700 return x; 7701 } 7702 else if (PyBytes_Check(x)) 7703 return x; 7704 else { 7705 /* wrong return value */ 7706 PyErr_Format(PyExc_TypeError, 7707 "character mapping must return integer, bytes or None, not %.400s", 7708 x->ob_type->tp_name); 7709 Py_DECREF(x); 7710 return NULL; 7711 } 7712} 7713 7714static int 7715charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7716{ 7717 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7718 /* exponentially overallocate to minimize reallocations */ 7719 if (requiredsize < 2*outsize) 7720 requiredsize = 2*outsize; 7721 if (_PyBytes_Resize(outobj, requiredsize)) 7722 return -1; 7723 return 0; 7724} 7725 7726typedef enum charmapencode_result { 7727 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7728} charmapencode_result; 7729/* lookup the character, put the result in the output string and adjust 7730 various state variables. Resize the output bytes object if not enough 7731 space is available. Return a new reference to the object that 7732 was put in the output buffer, or Py_None, if the mapping was undefined 7733 (in which case no character was written) or NULL, if a 7734 reallocation error occurred. 
The caller must decref the result */ 7735static charmapencode_result 7736charmapencode_output(Py_UCS4 c, PyObject *mapping, 7737 PyObject **outobj, Py_ssize_t *outpos) 7738{ 7739 PyObject *rep; 7740 char *outstart; 7741 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7742 7743 if (Py_TYPE(mapping) == &EncodingMapType) { 7744 int res = encoding_map_lookup(c, mapping); 7745 Py_ssize_t requiredsize = *outpos+1; 7746 if (res == -1) 7747 return enc_FAILED; 7748 if (outsize<requiredsize) 7749 if (charmapencode_resize(outobj, outpos, requiredsize)) 7750 return enc_EXCEPTION; 7751 outstart = PyBytes_AS_STRING(*outobj); 7752 outstart[(*outpos)++] = (char)res; 7753 return enc_SUCCESS; 7754 } 7755 7756 rep = charmapencode_lookup(c, mapping); 7757 if (rep==NULL) 7758 return enc_EXCEPTION; 7759 else if (rep==Py_None) { 7760 Py_DECREF(rep); 7761 return enc_FAILED; 7762 } else { 7763 if (PyLong_Check(rep)) { 7764 Py_ssize_t requiredsize = *outpos+1; 7765 if (outsize<requiredsize) 7766 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7767 Py_DECREF(rep); 7768 return enc_EXCEPTION; 7769 } 7770 outstart = PyBytes_AS_STRING(*outobj); 7771 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7772 } 7773 else { 7774 const char *repchars = PyBytes_AS_STRING(rep); 7775 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7776 Py_ssize_t requiredsize = *outpos+repsize; 7777 if (outsize<requiredsize) 7778 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7779 Py_DECREF(rep); 7780 return enc_EXCEPTION; 7781 } 7782 outstart = PyBytes_AS_STRING(*outobj); 7783 memcpy(outstart + *outpos, repchars, repsize); 7784 *outpos += repsize; 7785 } 7786 } 7787 Py_DECREF(rep); 7788 return enc_SUCCESS; 7789} 7790 7791/* handle an error in PyUnicode_EncodeCharmap 7792 Return 0 on success, -1 on error */ 7793static int 7794charmap_encoding_error( 7795 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 7796 PyObject **exceptionObject, 7797 int *known_errorHandler, PyObject **errorHandler, const 
char *errors, 7798 PyObject **res, Py_ssize_t *respos) 7799{ 7800 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7801 Py_ssize_t size, repsize; 7802 Py_ssize_t newpos; 7803 enum PyUnicode_Kind kind; 7804 void *data; 7805 Py_ssize_t index; 7806 /* startpos for collecting unencodable chars */ 7807 Py_ssize_t collstartpos = *inpos; 7808 Py_ssize_t collendpos = *inpos+1; 7809 Py_ssize_t collpos; 7810 char *encoding = "charmap"; 7811 char *reason = "character maps to <undefined>"; 7812 charmapencode_result x; 7813 Py_UCS4 ch; 7814 int val; 7815 7816 if (PyUnicode_READY(unicode) == -1) 7817 return -1; 7818 size = PyUnicode_GET_LENGTH(unicode); 7819 /* find all unencodable characters */ 7820 while (collendpos < size) { 7821 PyObject *rep; 7822 if (Py_TYPE(mapping) == &EncodingMapType) { 7823 ch = PyUnicode_READ_CHAR(unicode, collendpos); 7824 val = encoding_map_lookup(ch, mapping); 7825 if (val != -1) 7826 break; 7827 ++collendpos; 7828 continue; 7829 } 7830 7831 ch = PyUnicode_READ_CHAR(unicode, collendpos); 7832 rep = charmapencode_lookup(ch, mapping); 7833 if (rep==NULL) 7834 return -1; 7835 else if (rep!=Py_None) { 7836 Py_DECREF(rep); 7837 break; 7838 } 7839 Py_DECREF(rep); 7840 ++collendpos; 7841 } 7842 /* cache callback name lookup 7843 * (if not done yet, i.e. 
it's the first error) */ 7844 if (*known_errorHandler==-1) { 7845 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7846 *known_errorHandler = 1; 7847 else if (!strcmp(errors, "replace")) 7848 *known_errorHandler = 2; 7849 else if (!strcmp(errors, "ignore")) 7850 *known_errorHandler = 3; 7851 else if (!strcmp(errors, "xmlcharrefreplace")) 7852 *known_errorHandler = 4; 7853 else 7854 *known_errorHandler = 0; 7855 } 7856 switch (*known_errorHandler) { 7857 case 1: /* strict */ 7858 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 7859 return -1; 7860 case 2: /* replace */ 7861 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 7862 x = charmapencode_output('?', mapping, res, respos); 7863 if (x==enc_EXCEPTION) { 7864 return -1; 7865 } 7866 else if (x==enc_FAILED) { 7867 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 7868 return -1; 7869 } 7870 } 7871 /* fall through */ 7872 case 3: /* ignore */ 7873 *inpos = collendpos; 7874 break; 7875 case 4: /* xmlcharrefreplace */ 7876 /* generate replacement (temporarily (mis)uses p) */ 7877 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 7878 char buffer[2+29+1+1]; 7879 char *cp; 7880 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 7881 for (cp = buffer; *cp; ++cp) { 7882 x = charmapencode_output(*cp, mapping, res, respos); 7883 if (x==enc_EXCEPTION) 7884 return -1; 7885 else if (x==enc_FAILED) { 7886 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 7887 return -1; 7888 } 7889 } 7890 } 7891 *inpos = collendpos; 7892 break; 7893 default: 7894 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 7895 encoding, reason, unicode, exceptionObject, 7896 collstartpos, collendpos, &newpos); 7897 if (repunicode == NULL) 7898 return -1; 7899 if (PyBytes_Check(repunicode)) { 7900 /* Directly copy bytes result to output. 
*/ 7901 Py_ssize_t outsize = PyBytes_Size(*res); 7902 Py_ssize_t requiredsize; 7903 repsize = PyBytes_Size(repunicode); 7904 requiredsize = *respos + repsize; 7905 if (requiredsize > outsize) 7906 /* Make room for all additional bytes. */ 7907 if (charmapencode_resize(res, respos, requiredsize)) { 7908 Py_DECREF(repunicode); 7909 return -1; 7910 } 7911 memcpy(PyBytes_AsString(*res) + *respos, 7912 PyBytes_AsString(repunicode), repsize); 7913 *respos += repsize; 7914 *inpos = newpos; 7915 Py_DECREF(repunicode); 7916 break; 7917 } 7918 /* generate replacement */ 7919 if (PyUnicode_READY(repunicode) == -1) { 7920 Py_DECREF(repunicode); 7921 return -1; 7922 } 7923 repsize = PyUnicode_GET_LENGTH(repunicode); 7924 data = PyUnicode_DATA(repunicode); 7925 kind = PyUnicode_KIND(repunicode); 7926 for (index = 0; index < repsize; index++) { 7927 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 7928 x = charmapencode_output(repch, mapping, res, respos); 7929 if (x==enc_EXCEPTION) { 7930 Py_DECREF(repunicode); 7931 return -1; 7932 } 7933 else if (x==enc_FAILED) { 7934 Py_DECREF(repunicode); 7935 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 7936 return -1; 7937 } 7938 } 7939 *inpos = newpos; 7940 Py_DECREF(repunicode); 7941 } 7942 return 0; 7943} 7944 7945PyObject * 7946_PyUnicode_EncodeCharmap(PyObject *unicode, 7947 PyObject *mapping, 7948 const char *errors) 7949{ 7950 /* output object */ 7951 PyObject *res = NULL; 7952 /* current input position */ 7953 Py_ssize_t inpos = 0; 7954 Py_ssize_t size; 7955 /* current output position */ 7956 Py_ssize_t respos = 0; 7957 PyObject *errorHandler = NULL; 7958 PyObject *exc = NULL; 7959 /* the following variable is used for caching string comparisons 7960 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7961 * 3=ignore, 4=xmlcharrefreplace */ 7962 int known_errorHandler = -1; 7963 7964 if (PyUnicode_READY(unicode) == -1) 7965 return NULL; 7966 size = PyUnicode_GET_LENGTH(unicode); 
7967 7968 /* Default to Latin-1 */ 7969 if (mapping == NULL) 7970 return unicode_encode_ucs1(unicode, errors, 256); 7971 7972 /* allocate enough for a simple encoding without 7973 replacements, if we need more, we'll resize */ 7974 res = PyBytes_FromStringAndSize(NULL, size); 7975 if (res == NULL) 7976 goto onError; 7977 if (size == 0) 7978 return res; 7979 7980 while (inpos<size) { 7981 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos); 7982 /* try to encode it */ 7983 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 7984 if (x==enc_EXCEPTION) /* error */ 7985 goto onError; 7986 if (x==enc_FAILED) { /* unencodable character */ 7987 if (charmap_encoding_error(unicode, &inpos, mapping, 7988 &exc, 7989 &known_errorHandler, &errorHandler, errors, 7990 &res, &respos)) { 7991 goto onError; 7992 } 7993 } 7994 else 7995 /* done with this character => adjust input position */ 7996 ++inpos; 7997 } 7998 7999 /* Resize if we allocated to much */ 8000 if (respos<PyBytes_GET_SIZE(res)) 8001 if (_PyBytes_Resize(&res, respos) < 0) 8002 goto onError; 8003 8004 Py_XDECREF(exc); 8005 Py_XDECREF(errorHandler); 8006 return res; 8007 8008 onError: 8009 Py_XDECREF(res); 8010 Py_XDECREF(exc); 8011 Py_XDECREF(errorHandler); 8012 return NULL; 8013} 8014 8015/* Deprecated */ 8016PyObject * 8017PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8018 Py_ssize_t size, 8019 PyObject *mapping, 8020 const char *errors) 8021{ 8022 PyObject *result; 8023 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8024 if (unicode == NULL) 8025 return NULL; 8026 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8027 Py_DECREF(unicode); 8028 return result; 8029} 8030 8031PyObject * 8032PyUnicode_AsCharmapString(PyObject *unicode, 8033 PyObject *mapping) 8034{ 8035 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8036 PyErr_BadArgument(); 8037 return NULL; 8038 } 8039 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8040} 8041 8042/* create or adjust a 
UnicodeTranslateError */ 8043static void 8044make_translate_exception(PyObject **exceptionObject, 8045 PyObject *unicode, 8046 Py_ssize_t startpos, Py_ssize_t endpos, 8047 const char *reason) 8048{ 8049 if (*exceptionObject == NULL) { 8050 *exceptionObject = _PyUnicodeTranslateError_Create( 8051 unicode, startpos, endpos, reason); 8052 } 8053 else { 8054 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8055 goto onError; 8056 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8057 goto onError; 8058 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8059 goto onError; 8060 return; 8061 onError: 8062 Py_DECREF(*exceptionObject); 8063 *exceptionObject = NULL; 8064 } 8065} 8066 8067/* error handling callback helper: 8068 build arguments, call the callback and check the arguments, 8069 put the result into newpos and return the replacement string, which 8070 has to be freed by the caller */ 8071static PyObject * 8072unicode_translate_call_errorhandler(const char *errors, 8073 PyObject **errorHandler, 8074 const char *reason, 8075 PyObject *unicode, PyObject **exceptionObject, 8076 Py_ssize_t startpos, Py_ssize_t endpos, 8077 Py_ssize_t *newpos) 8078{ 8079 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8080 8081 Py_ssize_t i_newpos; 8082 PyObject *restuple; 8083 PyObject *resunicode; 8084 8085 if (*errorHandler == NULL) { 8086 *errorHandler = PyCodec_LookupError(errors); 8087 if (*errorHandler == NULL) 8088 return NULL; 8089 } 8090 8091 make_translate_exception(exceptionObject, 8092 unicode, startpos, endpos, reason); 8093 if (*exceptionObject == NULL) 8094 return NULL; 8095 8096 restuple = PyObject_CallFunctionObjArgs( 8097 *errorHandler, *exceptionObject, NULL); 8098 if (restuple == NULL) 8099 return NULL; 8100 if (!PyTuple_Check(restuple)) { 8101 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8102 Py_DECREF(restuple); 8103 return NULL; 8104 } 8105 if (!PyArg_ParseTuple(restuple, 
argparse, &PyUnicode_Type, 8106 &resunicode, &i_newpos)) { 8107 Py_DECREF(restuple); 8108 return NULL; 8109 } 8110 if (i_newpos<0) 8111 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8112 else 8113 *newpos = i_newpos; 8114 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8115 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8116 Py_DECREF(restuple); 8117 return NULL; 8118 } 8119 Py_INCREF(resunicode); 8120 Py_DECREF(restuple); 8121 return resunicode; 8122} 8123 8124/* Lookup the character ch in the mapping and put the result in result, 8125 which must be decrefed by the caller. 8126 Return 0 on success, -1 on error */ 8127static int 8128charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8129{ 8130 PyObject *w = PyLong_FromLong((long)c); 8131 PyObject *x; 8132 8133 if (w == NULL) 8134 return -1; 8135 x = PyObject_GetItem(mapping, w); 8136 Py_DECREF(w); 8137 if (x == NULL) { 8138 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8139 /* No mapping found means: use 1:1 mapping. */ 8140 PyErr_Clear(); 8141 *result = NULL; 8142 return 0; 8143 } else 8144 return -1; 8145 } 8146 else if (x == Py_None) { 8147 *result = x; 8148 return 0; 8149 } 8150 else if (PyLong_Check(x)) { 8151 long value = PyLong_AS_LONG(x); 8152 long max = PyUnicode_GetMax(); 8153 if (value < 0 || value > max) { 8154 PyErr_Format(PyExc_TypeError, 8155 "character mapping must be in range(0x%x)", max+1); 8156 Py_DECREF(x); 8157 return -1; 8158 } 8159 *result = x; 8160 return 0; 8161 } 8162 else if (PyUnicode_Check(x)) { 8163 *result = x; 8164 return 0; 8165 } 8166 else { 8167 /* wrong return value */ 8168 PyErr_SetString(PyExc_TypeError, 8169 "character mapping must return integer, None or str"); 8170 Py_DECREF(x); 8171 return -1; 8172 } 8173} 8174/* ensure that *outobj is at least requiredsize characters long, 8175 if not reallocate and adjust various state variables. 
8176 Return 0 on success, -1 on error */ 8177static int 8178charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8179 Py_ssize_t requiredsize) 8180{ 8181 Py_ssize_t oldsize = *psize; 8182 Py_UCS4 *new_outobj; 8183 if (requiredsize > oldsize) { 8184 /* exponentially overallocate to minimize reallocations */ 8185 if (requiredsize < 2 * oldsize) 8186 requiredsize = 2 * oldsize; 8187 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8188 if (new_outobj == 0) 8189 return -1; 8190 *outobj = new_outobj; 8191 *psize = requiredsize; 8192 } 8193 return 0; 8194} 8195/* lookup the character, put the result in the output string and adjust 8196 various state variables. Return a new reference to the object that 8197 was put in the output buffer in *result, or Py_None, if the mapping was 8198 undefined (in which case no character was written). 8199 The called must decref result. 8200 Return 0 on success, -1 on error. */ 8201static int 8202charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8203 PyObject *mapping, Py_UCS4 **output, 8204 Py_ssize_t *osize, Py_ssize_t *opos, 8205 PyObject **res) 8206{ 8207 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8208 if (charmaptranslate_lookup(curinp, mapping, res)) 8209 return -1; 8210 if (*res==NULL) { 8211 /* not found => default to 1:1 mapping */ 8212 (*output)[(*opos)++] = curinp; 8213 } 8214 else if (*res==Py_None) 8215 ; 8216 else if (PyLong_Check(*res)) { 8217 /* no overflow check, because we know that the space is enough */ 8218 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8219 } 8220 else if (PyUnicode_Check(*res)) { 8221 Py_ssize_t repsize; 8222 if (PyUnicode_READY(*res) == -1) 8223 return -1; 8224 repsize = PyUnicode_GET_LENGTH(*res); 8225 if (repsize==1) { 8226 /* no overflow check, because we know that the space is enough */ 8227 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8228 } 8229 else if (repsize!=0) { 8230 /* more than one character */ 8231 Py_ssize_t requiredsize = 
*opos + 8232 (PyUnicode_GET_LENGTH(input) - ipos) + 8233 repsize - 1; 8234 Py_ssize_t i; 8235 if (charmaptranslate_makespace(output, osize, requiredsize)) 8236 return -1; 8237 for(i = 0; i < repsize; i++) 8238 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8239 } 8240 } 8241 else 8242 return -1; 8243 return 0; 8244} 8245 8246PyObject * 8247_PyUnicode_TranslateCharmap(PyObject *input, 8248 PyObject *mapping, 8249 const char *errors) 8250{ 8251 /* input object */ 8252 char *idata; 8253 Py_ssize_t size, i; 8254 int kind; 8255 /* output buffer */ 8256 Py_UCS4 *output = NULL; 8257 Py_ssize_t osize; 8258 PyObject *res; 8259 /* current output position */ 8260 Py_ssize_t opos; 8261 char *reason = "character maps to <undefined>"; 8262 PyObject *errorHandler = NULL; 8263 PyObject *exc = NULL; 8264 /* the following variable is used for caching string comparisons 8265 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8266 * 3=ignore, 4=xmlcharrefreplace */ 8267 int known_errorHandler = -1; 8268 8269 if (mapping == NULL) { 8270 PyErr_BadArgument(); 8271 return NULL; 8272 } 8273 8274 if (PyUnicode_READY(input) == -1) 8275 return NULL; 8276 idata = (char*)PyUnicode_DATA(input); 8277 kind = PyUnicode_KIND(input); 8278 size = PyUnicode_GET_LENGTH(input); 8279 i = 0; 8280 8281 if (size == 0) { 8282 Py_INCREF(input); 8283 return input; 8284 } 8285 8286 /* allocate enough for a simple 1:1 translation without 8287 replacements, if we need more, we'll resize */ 8288 osize = size; 8289 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8290 opos = 0; 8291 if (output == NULL) { 8292 PyErr_NoMemory(); 8293 goto onError; 8294 } 8295 8296 while (i<size) { 8297 /* try to encode it */ 8298 PyObject *x = NULL; 8299 if (charmaptranslate_output(input, i, mapping, 8300 &output, &osize, &opos, &x)) { 8301 Py_XDECREF(x); 8302 goto onError; 8303 } 8304 Py_XDECREF(x); 8305 if (x!=Py_None) /* it worked => adjust input pointer */ 8306 ++i; 8307 else { /* untranslatable character */ 8308 PyObject 
*repunicode = NULL; /* initialize to prevent gcc warning */ 8309 Py_ssize_t repsize; 8310 Py_ssize_t newpos; 8311 Py_ssize_t uni2; 8312 /* startpos for collecting untranslatable chars */ 8313 Py_ssize_t collstart = i; 8314 Py_ssize_t collend = i+1; 8315 Py_ssize_t coll; 8316 8317 /* find all untranslatable characters */ 8318 while (collend < size) { 8319 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8320 goto onError; 8321 Py_XDECREF(x); 8322 if (x!=Py_None) 8323 break; 8324 ++collend; 8325 } 8326 /* cache callback name lookup 8327 * (if not done yet, i.e. it's the first error) */ 8328 if (known_errorHandler==-1) { 8329 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8330 known_errorHandler = 1; 8331 else if (!strcmp(errors, "replace")) 8332 known_errorHandler = 2; 8333 else if (!strcmp(errors, "ignore")) 8334 known_errorHandler = 3; 8335 else if (!strcmp(errors, "xmlcharrefreplace")) 8336 known_errorHandler = 4; 8337 else 8338 known_errorHandler = 0; 8339 } 8340 switch (known_errorHandler) { 8341 case 1: /* strict */ 8342 make_translate_exception(&exc, 8343 input, collstart, collend, reason); 8344 if (exc != NULL) 8345 PyCodec_StrictErrors(exc); 8346 goto onError; 8347 case 2: /* replace */ 8348 /* No need to check for space, this is a 1:1 replacement */ 8349 for (coll = collstart; coll<collend; coll++) 8350 output[opos++] = '?'; 8351 /* fall through */ 8352 case 3: /* ignore */ 8353 i = collend; 8354 break; 8355 case 4: /* xmlcharrefreplace */ 8356 /* generate replacement (temporarily (mis)uses i) */ 8357 for (i = collstart; i < collend; ++i) { 8358 char buffer[2+29+1+1]; 8359 char *cp; 8360 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8361 if (charmaptranslate_makespace(&output, &osize, 8362 opos+strlen(buffer)+(size-collend))) 8363 goto onError; 8364 for (cp = buffer; *cp; ++cp) 8365 output[opos++] = *cp; 8366 } 8367 i = collend; 8368 break; 8369 default: 8370 repunicode = 
unicode_translate_call_errorhandler(errors, &errorHandler, 8371 reason, input, &exc, 8372 collstart, collend, &newpos); 8373 if (repunicode == NULL) 8374 goto onError; 8375 if (PyUnicode_READY(repunicode) == -1) { 8376 Py_DECREF(repunicode); 8377 goto onError; 8378 } 8379 /* generate replacement */ 8380 repsize = PyUnicode_GET_LENGTH(repunicode); 8381 if (charmaptranslate_makespace(&output, &osize, 8382 opos+repsize+(size-collend))) { 8383 Py_DECREF(repunicode); 8384 goto onError; 8385 } 8386 for (uni2 = 0; repsize-->0; ++uni2) 8387 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8388 i = newpos; 8389 Py_DECREF(repunicode); 8390 } 8391 } 8392 } 8393 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8394 if (!res) 8395 goto onError; 8396 PyMem_Free(output); 8397 Py_XDECREF(exc); 8398 Py_XDECREF(errorHandler); 8399 return res; 8400 8401 onError: 8402 PyMem_Free(output); 8403 Py_XDECREF(exc); 8404 Py_XDECREF(errorHandler); 8405 return NULL; 8406} 8407 8408/* Deprecated. Use PyUnicode_Translate instead. */ 8409PyObject * 8410PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8411 Py_ssize_t size, 8412 PyObject *mapping, 8413 const char *errors) 8414{ 8415 PyObject *result; 8416 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8417 if (!unicode) 8418 return NULL; 8419 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8420 Py_DECREF(unicode); 8421 return result; 8422} 8423 8424PyObject * 8425PyUnicode_Translate(PyObject *str, 8426 PyObject *mapping, 8427 const char *errors) 8428{ 8429 PyObject *result; 8430 8431 str = PyUnicode_FromObject(str); 8432 if (str == NULL) 8433 return NULL; 8434 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8435 Py_DECREF(str); 8436 return result; 8437} 8438 8439static Py_UCS4 8440fix_decimal_and_space_to_ascii(PyObject *self) 8441{ 8442 /* No need to call PyUnicode_READY(self) because this function is only 8443 called as a callback from fixup() which does it already. 
*/ 8444 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8445 const int kind = PyUnicode_KIND(self); 8446 void *data = PyUnicode_DATA(self); 8447 Py_UCS4 maxchar = 127, ch, fixed; 8448 int modified = 0; 8449 Py_ssize_t i; 8450 8451 for (i = 0; i < len; ++i) { 8452 ch = PyUnicode_READ(kind, data, i); 8453 fixed = 0; 8454 if (ch > 127) { 8455 if (Py_UNICODE_ISSPACE(ch)) 8456 fixed = ' '; 8457 else { 8458 const int decimal = Py_UNICODE_TODECIMAL(ch); 8459 if (decimal >= 0) 8460 fixed = '0' + decimal; 8461 } 8462 if (fixed != 0) { 8463 modified = 1; 8464 maxchar = MAX_MAXCHAR(maxchar, fixed); 8465 PyUnicode_WRITE(kind, data, i, fixed); 8466 } 8467 else 8468 maxchar = MAX_MAXCHAR(maxchar, ch); 8469 } 8470 } 8471 8472 return (modified) ? maxchar : 0; 8473} 8474 8475PyObject * 8476_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8477{ 8478 if (!PyUnicode_Check(unicode)) { 8479 PyErr_BadInternalCall(); 8480 return NULL; 8481 } 8482 if (PyUnicode_READY(unicode) == -1) 8483 return NULL; 8484 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8485 /* If the string is already ASCII, just return the same string */ 8486 Py_INCREF(unicode); 8487 return unicode; 8488 } 8489 return fixup(unicode, fix_decimal_and_space_to_ascii); 8490} 8491 8492PyObject * 8493PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8494 Py_ssize_t length) 8495{ 8496 PyObject *decimal; 8497 Py_ssize_t i; 8498 Py_UCS4 maxchar; 8499 enum PyUnicode_Kind kind; 8500 void *data; 8501 8502 maxchar = 127; 8503 for (i = 0; i < length; i++) { 8504 Py_UNICODE ch = s[i]; 8505 if (ch > 127) { 8506 int decimal = Py_UNICODE_TODECIMAL(ch); 8507 if (decimal >= 0) 8508 ch = '0' + decimal; 8509 maxchar = MAX_MAXCHAR(maxchar, ch); 8510 } 8511 } 8512 8513 /* Copy to a new string */ 8514 decimal = PyUnicode_New(length, maxchar); 8515 if (decimal == NULL) 8516 return decimal; 8517 kind = PyUnicode_KIND(decimal); 8518 data = PyUnicode_DATA(decimal); 8519 /* Iterate over code points */ 8520 for (i = 0; i < length; i++) { 
8521 Py_UNICODE ch = s[i]; 8522 if (ch > 127) { 8523 int decimal = Py_UNICODE_TODECIMAL(ch); 8524 if (decimal >= 0) 8525 ch = '0' + decimal; 8526 } 8527 PyUnicode_WRITE(kind, data, i, ch); 8528 } 8529 return unicode_result(decimal); 8530} 8531/* --- Decimal Encoder ---------------------------------------------------- */ 8532 8533int 8534PyUnicode_EncodeDecimal(Py_UNICODE *s, 8535 Py_ssize_t length, 8536 char *output, 8537 const char *errors) 8538{ 8539 PyObject *unicode; 8540 Py_ssize_t i; 8541 enum PyUnicode_Kind kind; 8542 void *data; 8543 8544 if (output == NULL) { 8545 PyErr_BadArgument(); 8546 return -1; 8547 } 8548 8549 unicode = PyUnicode_FromUnicode(s, length); 8550 if (unicode == NULL) 8551 return -1; 8552 8553 if (PyUnicode_READY(unicode) == -1) { 8554 Py_DECREF(unicode); 8555 return -1; 8556 } 8557 kind = PyUnicode_KIND(unicode); 8558 data = PyUnicode_DATA(unicode); 8559 8560 for (i=0; i < length; ) { 8561 PyObject *exc; 8562 Py_UCS4 ch; 8563 int decimal; 8564 Py_ssize_t startpos; 8565 8566 ch = PyUnicode_READ(kind, data, i); 8567 8568 if (Py_UNICODE_ISSPACE(ch)) { 8569 *output++ = ' '; 8570 i++; 8571 continue; 8572 } 8573 decimal = Py_UNICODE_TODECIMAL(ch); 8574 if (decimal >= 0) { 8575 *output++ = '0' + decimal; 8576 i++; 8577 continue; 8578 } 8579 if (0 < ch && ch < 256) { 8580 *output++ = (char)ch; 8581 i++; 8582 continue; 8583 } 8584 8585 startpos = i; 8586 exc = NULL; 8587 raise_encode_exception(&exc, "decimal", unicode, 8588 startpos, startpos+1, 8589 "invalid decimal Unicode string"); 8590 Py_XDECREF(exc); 8591 Py_DECREF(unicode); 8592 return -1; 8593 } 8594 /* 0-terminate the output string */ 8595 *output++ = '\0'; 8596 Py_DECREF(unicode); 8597 return 0; 8598} 8599 8600/* --- Helpers ------------------------------------------------------------ */ 8601 8602static Py_ssize_t 8603any_find_slice(int direction, PyObject* s1, PyObject* s2, 8604 Py_ssize_t start, 8605 Py_ssize_t end) 8606{ 8607 int kind1, kind2, kind; 8608 void *buf1, *buf2; 8609 
Py_ssize_t len1, len2, result; 8610 8611 kind1 = PyUnicode_KIND(s1); 8612 kind2 = PyUnicode_KIND(s2); 8613 kind = kind1 > kind2 ? kind1 : kind2; 8614 buf1 = PyUnicode_DATA(s1); 8615 buf2 = PyUnicode_DATA(s2); 8616 if (kind1 != kind) 8617 buf1 = _PyUnicode_AsKind(s1, kind); 8618 if (!buf1) 8619 return -2; 8620 if (kind2 != kind) 8621 buf2 = _PyUnicode_AsKind(s2, kind); 8622 if (!buf2) { 8623 if (kind1 != kind) PyMem_Free(buf1); 8624 return -2; 8625 } 8626 len1 = PyUnicode_GET_LENGTH(s1); 8627 len2 = PyUnicode_GET_LENGTH(s2); 8628 8629 if (direction > 0) { 8630 switch (kind) { 8631 case PyUnicode_1BYTE_KIND: 8632 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8633 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 8634 else 8635 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 8636 break; 8637 case PyUnicode_2BYTE_KIND: 8638 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 8639 break; 8640 case PyUnicode_4BYTE_KIND: 8641 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 8642 break; 8643 default: 8644 assert(0); result = -2; 8645 } 8646 } 8647 else { 8648 switch (kind) { 8649 case PyUnicode_1BYTE_KIND: 8650 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8651 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 8652 else 8653 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8654 break; 8655 case PyUnicode_2BYTE_KIND: 8656 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8657 break; 8658 case PyUnicode_4BYTE_KIND: 8659 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8660 break; 8661 default: 8662 assert(0); result = -2; 8663 } 8664 } 8665 8666 if (kind1 != kind) 8667 PyMem_Free(buf1); 8668 if (kind2 != kind) 8669 PyMem_Free(buf2); 8670 8671 return result; 8672} 8673 8674Py_ssize_t 8675_PyUnicode_InsertThousandsGrouping( 8676 PyObject *unicode, Py_ssize_t index, 8677 Py_ssize_t n_buffer, 8678 void *digits, Py_ssize_t n_digits, 8679 
Py_ssize_t min_width, 8680 const char *grouping, PyObject *thousands_sep, 8681 Py_UCS4 *maxchar) 8682{ 8683 unsigned int kind, thousands_sep_kind; 8684 char *data, *thousands_sep_data; 8685 Py_ssize_t thousands_sep_len; 8686 Py_ssize_t len; 8687 8688 if (unicode != NULL) { 8689 kind = PyUnicode_KIND(unicode); 8690 data = (char *) PyUnicode_DATA(unicode) + index * kind; 8691 } 8692 else { 8693 kind = PyUnicode_1BYTE_KIND; 8694 data = NULL; 8695 } 8696 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 8697 thousands_sep_data = PyUnicode_DATA(thousands_sep); 8698 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 8699 if (unicode != NULL && thousands_sep_kind != kind) { 8700 if (thousands_sep_kind < kind) { 8701 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 8702 if (!thousands_sep_data) 8703 return -1; 8704 } 8705 else { 8706 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 8707 if (!data) 8708 return -1; 8709 } 8710 } 8711 8712 switch (kind) { 8713 case PyUnicode_1BYTE_KIND: 8714 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 8715 len = asciilib_InsertThousandsGrouping( 8716 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 8717 min_width, grouping, 8718 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8719 else 8720 len = ucs1lib_InsertThousandsGrouping( 8721 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8722 min_width, grouping, 8723 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8724 break; 8725 case PyUnicode_2BYTE_KIND: 8726 len = ucs2lib_InsertThousandsGrouping( 8727 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 8728 min_width, grouping, 8729 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 8730 break; 8731 case PyUnicode_4BYTE_KIND: 8732 len = ucs4lib_InsertThousandsGrouping( 8733 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 8734 min_width, grouping, 8735 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 8736 break; 8737 default: 8738 assert(0); 8739 return -1; 8740 } 8741 
if (unicode != NULL && thousands_sep_kind != kind) { 8742 if (thousands_sep_kind < kind) 8743 PyMem_Free(thousands_sep_data); 8744 else 8745 PyMem_Free(data); 8746 } 8747 if (unicode == NULL) { 8748 *maxchar = 127; 8749 if (len != n_digits) { 8750 *maxchar = MAX_MAXCHAR(*maxchar, 8751 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 8752 } 8753 } 8754 return len; 8755} 8756 8757 8758/* helper macro to fixup start/end slice values */ 8759#define ADJUST_INDICES(start, end, len) \ 8760 if (end > len) \ 8761 end = len; \ 8762 else if (end < 0) { \ 8763 end += len; \ 8764 if (end < 0) \ 8765 end = 0; \ 8766 } \ 8767 if (start < 0) { \ 8768 start += len; \ 8769 if (start < 0) \ 8770 start = 0; \ 8771 } 8772 8773Py_ssize_t 8774PyUnicode_Count(PyObject *str, 8775 PyObject *substr, 8776 Py_ssize_t start, 8777 Py_ssize_t end) 8778{ 8779 Py_ssize_t result; 8780 PyObject* str_obj; 8781 PyObject* sub_obj; 8782 int kind1, kind2, kind; 8783 void *buf1 = NULL, *buf2 = NULL; 8784 Py_ssize_t len1, len2; 8785 8786 str_obj = PyUnicode_FromObject(str); 8787 if (!str_obj) 8788 return -1; 8789 sub_obj = PyUnicode_FromObject(substr); 8790 if (!sub_obj) { 8791 Py_DECREF(str_obj); 8792 return -1; 8793 } 8794 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 8795 Py_DECREF(sub_obj); 8796 Py_DECREF(str_obj); 8797 return -1; 8798 } 8799 8800 kind1 = PyUnicode_KIND(str_obj); 8801 kind2 = PyUnicode_KIND(sub_obj); 8802 kind = kind1; 8803 buf1 = PyUnicode_DATA(str_obj); 8804 buf2 = PyUnicode_DATA(sub_obj); 8805 if (kind2 != kind) { 8806 if (kind2 > kind) { 8807 Py_DECREF(sub_obj); 8808 Py_DECREF(str_obj); 8809 return 0; 8810 } 8811 buf2 = _PyUnicode_AsKind(sub_obj, kind); 8812 } 8813 if (!buf2) 8814 goto onError; 8815 len1 = PyUnicode_GET_LENGTH(str_obj); 8816 len2 = PyUnicode_GET_LENGTH(sub_obj); 8817 8818 ADJUST_INDICES(start, end, len1); 8819 switch (kind) { 8820 case PyUnicode_1BYTE_KIND: 8821 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 8822 result = 
asciilib_count( 8823 ((Py_UCS1*)buf1) + start, end - start, 8824 buf2, len2, PY_SSIZE_T_MAX 8825 ); 8826 else 8827 result = ucs1lib_count( 8828 ((Py_UCS1*)buf1) + start, end - start, 8829 buf2, len2, PY_SSIZE_T_MAX 8830 ); 8831 break; 8832 case PyUnicode_2BYTE_KIND: 8833 result = ucs2lib_count( 8834 ((Py_UCS2*)buf1) + start, end - start, 8835 buf2, len2, PY_SSIZE_T_MAX 8836 ); 8837 break; 8838 case PyUnicode_4BYTE_KIND: 8839 result = ucs4lib_count( 8840 ((Py_UCS4*)buf1) + start, end - start, 8841 buf2, len2, PY_SSIZE_T_MAX 8842 ); 8843 break; 8844 default: 8845 assert(0); result = 0; 8846 } 8847 8848 Py_DECREF(sub_obj); 8849 Py_DECREF(str_obj); 8850 8851 if (kind2 != kind) 8852 PyMem_Free(buf2); 8853 8854 return result; 8855 onError: 8856 Py_DECREF(sub_obj); 8857 Py_DECREF(str_obj); 8858 if (kind2 != kind && buf2) 8859 PyMem_Free(buf2); 8860 return -1; 8861} 8862 8863Py_ssize_t 8864PyUnicode_Find(PyObject *str, 8865 PyObject *sub, 8866 Py_ssize_t start, 8867 Py_ssize_t end, 8868 int direction) 8869{ 8870 Py_ssize_t result; 8871 8872 str = PyUnicode_FromObject(str); 8873 if (!str) 8874 return -2; 8875 sub = PyUnicode_FromObject(sub); 8876 if (!sub) { 8877 Py_DECREF(str); 8878 return -2; 8879 } 8880 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 8881 Py_DECREF(sub); 8882 Py_DECREF(str); 8883 return -2; 8884 } 8885 8886 result = any_find_slice(direction, 8887 str, sub, start, end 8888 ); 8889 8890 Py_DECREF(str); 8891 Py_DECREF(sub); 8892 8893 return result; 8894} 8895 8896Py_ssize_t 8897PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 8898 Py_ssize_t start, Py_ssize_t end, 8899 int direction) 8900{ 8901 int kind; 8902 Py_ssize_t result; 8903 if (PyUnicode_READY(str) == -1) 8904 return -2; 8905 if (start < 0 || end < 0) { 8906 PyErr_SetString(PyExc_IndexError, "string index out of range"); 8907 return -2; 8908 } 8909 if (end > PyUnicode_GET_LENGTH(str)) 8910 end = PyUnicode_GET_LENGTH(str); 8911 kind = PyUnicode_KIND(str); 8912 result = 
findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 8913 kind, end-start, ch, direction); 8914 if (result == -1) 8915 return -1; 8916 else 8917 return start + result; 8918} 8919 8920static int 8921tailmatch(PyObject *self, 8922 PyObject *substring, 8923 Py_ssize_t start, 8924 Py_ssize_t end, 8925 int direction) 8926{ 8927 int kind_self; 8928 int kind_sub; 8929 void *data_self; 8930 void *data_sub; 8931 Py_ssize_t offset; 8932 Py_ssize_t i; 8933 Py_ssize_t end_sub; 8934 8935 if (PyUnicode_READY(self) == -1 || 8936 PyUnicode_READY(substring) == -1) 8937 return 0; 8938 8939 if (PyUnicode_GET_LENGTH(substring) == 0) 8940 return 1; 8941 8942 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 8943 end -= PyUnicode_GET_LENGTH(substring); 8944 if (end < start) 8945 return 0; 8946 8947 kind_self = PyUnicode_KIND(self); 8948 data_self = PyUnicode_DATA(self); 8949 kind_sub = PyUnicode_KIND(substring); 8950 data_sub = PyUnicode_DATA(substring); 8951 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 8952 8953 if (direction > 0) 8954 offset = end; 8955 else 8956 offset = start; 8957 8958 if (PyUnicode_READ(kind_self, data_self, offset) == 8959 PyUnicode_READ(kind_sub, data_sub, 0) && 8960 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 8961 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 8962 /* If both are of the same kind, memcmp is sufficient */ 8963 if (kind_self == kind_sub) { 8964 return ! memcmp((char *)data_self + 8965 (offset * PyUnicode_KIND(substring)), 8966 data_sub, 8967 PyUnicode_GET_LENGTH(substring) * 8968 PyUnicode_KIND(substring)); 8969 } 8970 /* otherwise we have to compare each character by first accesing it */ 8971 else { 8972 /* We do not need to compare 0 and len(substring)-1 because 8973 the if statement above ensured already that they are equal 8974 when we end up here. 
*/ 8975 /* TODO: honor direction and do a forward or backwards search */ 8976 for (i = 1; i < end_sub; ++i) { 8977 if (PyUnicode_READ(kind_self, data_self, offset + i) != 8978 PyUnicode_READ(kind_sub, data_sub, i)) 8979 return 0; 8980 } 8981 return 1; 8982 } 8983 } 8984 8985 return 0; 8986} 8987 8988Py_ssize_t 8989PyUnicode_Tailmatch(PyObject *str, 8990 PyObject *substr, 8991 Py_ssize_t start, 8992 Py_ssize_t end, 8993 int direction) 8994{ 8995 Py_ssize_t result; 8996 8997 str = PyUnicode_FromObject(str); 8998 if (str == NULL) 8999 return -1; 9000 substr = PyUnicode_FromObject(substr); 9001 if (substr == NULL) { 9002 Py_DECREF(str); 9003 return -1; 9004 } 9005 9006 result = tailmatch(str, substr, 9007 start, end, direction); 9008 Py_DECREF(str); 9009 Py_DECREF(substr); 9010 return result; 9011} 9012 9013/* Apply fixfct filter to the Unicode object self and return a 9014 reference to the modified object */ 9015 9016static PyObject * 9017fixup(PyObject *self, 9018 Py_UCS4 (*fixfct)(PyObject *s)) 9019{ 9020 PyObject *u; 9021 Py_UCS4 maxchar_old, maxchar_new = 0; 9022 PyObject *v; 9023 9024 u = _PyUnicode_Copy(self); 9025 if (u == NULL) 9026 return NULL; 9027 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9028 9029 /* fix functions return the new maximum character in a string, 9030 if the kind of the resulting unicode object does not change, 9031 everything is fine. Otherwise we need to change the string kind 9032 and re-run the fix function. */ 9033 maxchar_new = fixfct(u); 9034 9035 if (maxchar_new == 0) { 9036 /* no changes */; 9037 if (PyUnicode_CheckExact(self)) { 9038 Py_DECREF(u); 9039 Py_INCREF(self); 9040 return self; 9041 } 9042 else 9043 return u; 9044 } 9045 9046 maxchar_new = align_maxchar(maxchar_new); 9047 9048 if (maxchar_new == maxchar_old) 9049 return u; 9050 9051 /* In case the maximum character changed, we need to 9052 convert the string to the new category. 
*/ 9053 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9054 if (v == NULL) { 9055 Py_DECREF(u); 9056 return NULL; 9057 } 9058 if (maxchar_new > maxchar_old) { 9059 /* If the maxchar increased so that the kind changed, not all 9060 characters are representable anymore and we need to fix the 9061 string again. This only happens in very few cases. */ 9062 _PyUnicode_FastCopyCharacters(v, 0, 9063 self, 0, PyUnicode_GET_LENGTH(self)); 9064 maxchar_old = fixfct(v); 9065 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9066 } 9067 else { 9068 _PyUnicode_FastCopyCharacters(v, 0, 9069 u, 0, PyUnicode_GET_LENGTH(self)); 9070 } 9071 Py_DECREF(u); 9072 assert(_PyUnicode_CheckConsistency(v, 1)); 9073 return v; 9074} 9075 9076static PyObject * 9077ascii_upper_or_lower(PyObject *self, int lower) 9078{ 9079 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9080 char *resdata, *data = PyUnicode_DATA(self); 9081 PyObject *res; 9082 9083 res = PyUnicode_New(len, 127); 9084 if (res == NULL) 9085 return NULL; 9086 resdata = PyUnicode_DATA(res); 9087 if (lower) 9088 _Py_bytes_lower(resdata, data, len); 9089 else 9090 _Py_bytes_upper(resdata, data, len); 9091 return res; 9092} 9093 9094static Py_UCS4 9095handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9096{ 9097 Py_ssize_t j; 9098 int final_sigma; 9099 Py_UCS4 c; 9100 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9101 9102 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9103 9104 where ! is a negation and \p{xxx} is a character with property xxx. 
9105 */ 9106 for (j = i - 1; j >= 0; j--) { 9107 c = PyUnicode_READ(kind, data, j); 9108 if (!_PyUnicode_IsCaseIgnorable(c)) 9109 break; 9110 } 9111 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9112 if (final_sigma) { 9113 for (j = i + 1; j < length; j++) { 9114 c = PyUnicode_READ(kind, data, j); 9115 if (!_PyUnicode_IsCaseIgnorable(c)) 9116 break; 9117 } 9118 final_sigma = j == length || !_PyUnicode_IsCased(c); 9119 } 9120 return (final_sigma) ? 0x3C2 : 0x3C3; 9121} 9122 9123static int 9124lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9125 Py_UCS4 c, Py_UCS4 *mapped) 9126{ 9127 /* Obscure special case. */ 9128 if (c == 0x3A3) { 9129 mapped[0] = handle_capital_sigma(kind, data, length, i); 9130 return 1; 9131 } 9132 return _PyUnicode_ToLowerFull(c, mapped); 9133} 9134 9135static Py_ssize_t 9136do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9137{ 9138 Py_ssize_t i, k = 0; 9139 int n_res, j; 9140 Py_UCS4 c, mapped[3]; 9141 9142 c = PyUnicode_READ(kind, data, 0); 9143 n_res = _PyUnicode_ToUpperFull(c, mapped); 9144 for (j = 0; j < n_res; j++) { 9145 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9146 res[k++] = mapped[j]; 9147 } 9148 for (i = 1; i < length; i++) { 9149 c = PyUnicode_READ(kind, data, i); 9150 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9151 for (j = 0; j < n_res; j++) { 9152 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9153 res[k++] = mapped[j]; 9154 } 9155 } 9156 return k; 9157} 9158 9159static Py_ssize_t 9160do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9161 Py_ssize_t i, k = 0; 9162 9163 for (i = 0; i < length; i++) { 9164 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9165 int n_res, j; 9166 if (Py_UNICODE_ISUPPER(c)) { 9167 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9168 } 9169 else if (Py_UNICODE_ISLOWER(c)) { 9170 n_res = _PyUnicode_ToUpperFull(c, mapped); 9171 } 9172 else { 9173 n_res = 1; 9174 mapped[0] = c; 9175 } 
9176 for (j = 0; j < n_res; j++) { 9177 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9178 res[k++] = mapped[j]; 9179 } 9180 } 9181 return k; 9182} 9183 9184static Py_ssize_t 9185do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9186 Py_UCS4 *maxchar, int lower) 9187{ 9188 Py_ssize_t i, k = 0; 9189 9190 for (i = 0; i < length; i++) { 9191 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9192 int n_res, j; 9193 if (lower) 9194 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9195 else 9196 n_res = _PyUnicode_ToUpperFull(c, mapped); 9197 for (j = 0; j < n_res; j++) { 9198 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9199 res[k++] = mapped[j]; 9200 } 9201 } 9202 return k; 9203} 9204 9205static Py_ssize_t 9206do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9207{ 9208 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9209} 9210 9211static Py_ssize_t 9212do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9213{ 9214 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9215} 9216 9217static Py_ssize_t 9218do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9219{ 9220 Py_ssize_t i, k = 0; 9221 9222 for (i = 0; i < length; i++) { 9223 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9224 Py_UCS4 mapped[3]; 9225 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9226 for (j = 0; j < n_res; j++) { 9227 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9228 res[k++] = mapped[j]; 9229 } 9230 } 9231 return k; 9232} 9233 9234static Py_ssize_t 9235do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9236{ 9237 Py_ssize_t i, k = 0; 9238 int previous_is_cased; 9239 9240 previous_is_cased = 0; 9241 for (i = 0; i < length; i++) { 9242 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9243 Py_UCS4 mapped[3]; 9244 int n_res, j; 9245 9246 if (previous_is_cased) 9247 n_res = lower_ucs4(kind, data, length, i, c, mapped); 
9248 else 9249 n_res = _PyUnicode_ToTitleFull(c, mapped); 9250 9251 for (j = 0; j < n_res; j++) { 9252 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9253 res[k++] = mapped[j]; 9254 } 9255 9256 previous_is_cased = _PyUnicode_IsCased(c); 9257 } 9258 return k; 9259} 9260 9261static PyObject * 9262case_operation(PyObject *self, 9263 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9264{ 9265 PyObject *res = NULL; 9266 Py_ssize_t length, newlength = 0; 9267 int kind, outkind; 9268 void *data, *outdata; 9269 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9270 9271 assert(PyUnicode_IS_READY(self)); 9272 9273 kind = PyUnicode_KIND(self); 9274 data = PyUnicode_DATA(self); 9275 length = PyUnicode_GET_LENGTH(self); 9276 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9277 if (tmp == NULL) 9278 return PyErr_NoMemory(); 9279 newlength = perform(kind, data, length, tmp, &maxchar); 9280 res = PyUnicode_New(newlength, maxchar); 9281 if (res == NULL) 9282 goto leave; 9283 tmpend = tmp + newlength; 9284 outdata = PyUnicode_DATA(res); 9285 outkind = PyUnicode_KIND(res); 9286 switch (outkind) { 9287 case PyUnicode_1BYTE_KIND: 9288 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9289 break; 9290 case PyUnicode_2BYTE_KIND: 9291 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9292 break; 9293 case PyUnicode_4BYTE_KIND: 9294 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9295 break; 9296 default: 9297 assert(0); 9298 break; 9299 } 9300 leave: 9301 PyMem_FREE(tmp); 9302 return res; 9303} 9304 9305PyObject * 9306PyUnicode_Join(PyObject *separator, PyObject *seq) 9307{ 9308 PyObject *sep = NULL; 9309 Py_ssize_t seplen; 9310 PyObject *res = NULL; /* the result */ 9311 PyObject *fseq; /* PySequence_Fast(seq) */ 9312 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9313 PyObject **items; 9314 PyObject *item; 9315 Py_ssize_t sz, i, res_offset; 9316 Py_UCS4 maxchar; 9317 Py_UCS4 item_maxchar; 9318 int use_memcpy; 9319 unsigned 
char *res_data = NULL, *sep_data = NULL; 9320 PyObject *last_obj; 9321 unsigned int kind = 0; 9322 9323 fseq = PySequence_Fast(seq, ""); 9324 if (fseq == NULL) { 9325 return NULL; 9326 } 9327 9328 /* NOTE: the following code can't call back into Python code, 9329 * so we are sure that fseq won't be mutated. 9330 */ 9331 9332 seqlen = PySequence_Fast_GET_SIZE(fseq); 9333 /* If empty sequence, return u"". */ 9334 if (seqlen == 0) { 9335 Py_DECREF(fseq); 9336 Py_INCREF(unicode_empty); 9337 res = unicode_empty; 9338 return res; 9339 } 9340 9341 /* If singleton sequence with an exact Unicode, return that. */ 9342 last_obj = NULL; 9343 items = PySequence_Fast_ITEMS(fseq); 9344 if (seqlen == 1) { 9345 if (PyUnicode_CheckExact(items[0])) { 9346 res = items[0]; 9347 Py_INCREF(res); 9348 Py_DECREF(fseq); 9349 return res; 9350 } 9351 seplen = 0; 9352 maxchar = 0; 9353 } 9354 else { 9355 /* Set up sep and seplen */ 9356 if (separator == NULL) { 9357 /* fall back to a blank space separator */ 9358 sep = PyUnicode_FromOrdinal(' '); 9359 if (!sep) 9360 goto onError; 9361 seplen = 1; 9362 maxchar = 32; 9363 } 9364 else { 9365 if (!PyUnicode_Check(separator)) { 9366 PyErr_Format(PyExc_TypeError, 9367 "separator: expected str instance," 9368 " %.80s found", 9369 Py_TYPE(separator)->tp_name); 9370 goto onError; 9371 } 9372 if (PyUnicode_READY(separator)) 9373 goto onError; 9374 sep = separator; 9375 seplen = PyUnicode_GET_LENGTH(separator); 9376 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9377 /* inc refcount to keep this code path symmetric with the 9378 above case of a blank separator */ 9379 Py_INCREF(sep); 9380 } 9381 last_obj = sep; 9382 } 9383 9384 /* There are at least two things to join, or else we have a subclass 9385 * of str in the sequence. 9386 * Do a pre-pass to figure out the total amount of space we'll 9387 * need (sz), and see whether all argument are strings. 
9388 */ 9389 sz = 0; 9390#ifdef Py_DEBUG 9391 use_memcpy = 0; 9392#else 9393 use_memcpy = 1; 9394#endif 9395 for (i = 0; i < seqlen; i++) { 9396 const Py_ssize_t old_sz = sz; 9397 item = items[i]; 9398 if (!PyUnicode_Check(item)) { 9399 PyErr_Format(PyExc_TypeError, 9400 "sequence item %zd: expected str instance," 9401 " %.80s found", 9402 i, Py_TYPE(item)->tp_name); 9403 goto onError; 9404 } 9405 if (PyUnicode_READY(item) == -1) 9406 goto onError; 9407 sz += PyUnicode_GET_LENGTH(item); 9408 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9409 maxchar = MAX_MAXCHAR(maxchar, item_maxchar); 9410 if (i != 0) 9411 sz += seplen; 9412 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9413 PyErr_SetString(PyExc_OverflowError, 9414 "join() result is too long for a Python string"); 9415 goto onError; 9416 } 9417 if (use_memcpy && last_obj != NULL) { 9418 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9419 use_memcpy = 0; 9420 } 9421 last_obj = item; 9422 } 9423 9424 res = PyUnicode_New(sz, maxchar); 9425 if (res == NULL) 9426 goto onError; 9427 9428 /* Catenate everything. */ 9429#ifdef Py_DEBUG 9430 use_memcpy = 0; 9431#else 9432 if (use_memcpy) { 9433 res_data = PyUnicode_1BYTE_DATA(res); 9434 kind = PyUnicode_KIND(res); 9435 if (seplen != 0) 9436 sep_data = PyUnicode_1BYTE_DATA(sep); 9437 } 9438#endif 9439 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9440 Py_ssize_t itemlen; 9441 item = items[i]; 9442 /* Copy item, and maybe the separator. 
*/ 9443 if (i && seplen != 0) { 9444 if (use_memcpy) { 9445 Py_MEMCPY(res_data, 9446 sep_data, 9447 kind * seplen); 9448 res_data += kind * seplen; 9449 } 9450 else { 9451 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9452 res_offset += seplen; 9453 } 9454 } 9455 itemlen = PyUnicode_GET_LENGTH(item); 9456 if (itemlen != 0) { 9457 if (use_memcpy) { 9458 Py_MEMCPY(res_data, 9459 PyUnicode_DATA(item), 9460 kind * itemlen); 9461 res_data += kind * itemlen; 9462 } 9463 else { 9464 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9465 res_offset += itemlen; 9466 } 9467 } 9468 } 9469 if (use_memcpy) 9470 assert(res_data == PyUnicode_1BYTE_DATA(res) 9471 + kind * PyUnicode_GET_LENGTH(res)); 9472 else 9473 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9474 9475 Py_DECREF(fseq); 9476 Py_XDECREF(sep); 9477 assert(_PyUnicode_CheckConsistency(res, 1)); 9478 return res; 9479 9480 onError: 9481 Py_DECREF(fseq); 9482 Py_XDECREF(sep); 9483 Py_XDECREF(res); 9484 return NULL; 9485} 9486 9487#define FILL(kind, data, value, start, length) \ 9488 do { \ 9489 Py_ssize_t i_ = 0; \ 9490 assert(kind != PyUnicode_WCHAR_KIND); \ 9491 switch ((kind)) { \ 9492 case PyUnicode_1BYTE_KIND: { \ 9493 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9494 memset(to_, (unsigned char)value, (length)); \ 9495 break; \ 9496 } \ 9497 case PyUnicode_2BYTE_KIND: { \ 9498 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9499 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9500 break; \ 9501 } \ 9502 case PyUnicode_4BYTE_KIND: { \ 9503 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9504 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9505 break; \ 9506 default: assert(0); \ 9507 } \ 9508 } \ 9509 } while (0) 9510 9511void 9512_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9513 Py_UCS4 fill_char) 9514{ 9515 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9516 const void *data = PyUnicode_DATA(unicode); 9517 
assert(PyUnicode_IS_READY(unicode)); 9518 assert(unicode_modifiable(unicode)); 9519 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9520 assert(start >= 0); 9521 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9522 FILL(kind, data, fill_char, start, length); 9523} 9524 9525Py_ssize_t 9526PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9527 Py_UCS4 fill_char) 9528{ 9529 Py_ssize_t maxlen; 9530 9531 if (!PyUnicode_Check(unicode)) { 9532 PyErr_BadInternalCall(); 9533 return -1; 9534 } 9535 if (PyUnicode_READY(unicode) == -1) 9536 return -1; 9537 if (unicode_check_modifiable(unicode)) 9538 return -1; 9539 9540 if (start < 0) { 9541 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9542 return -1; 9543 } 9544 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9545 PyErr_SetString(PyExc_ValueError, 9546 "fill character is bigger than " 9547 "the string maximum character"); 9548 return -1; 9549 } 9550 9551 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9552 length = Py_MIN(maxlen, length); 9553 if (length <= 0) 9554 return 0; 9555 9556 _PyUnicode_FastFill(unicode, start, length, fill_char); 9557 return length; 9558} 9559 9560static PyObject * 9561pad(PyObject *self, 9562 Py_ssize_t left, 9563 Py_ssize_t right, 9564 Py_UCS4 fill) 9565{ 9566 PyObject *u; 9567 Py_UCS4 maxchar; 9568 int kind; 9569 void *data; 9570 9571 if (left < 0) 9572 left = 0; 9573 if (right < 0) 9574 right = 0; 9575 9576 if (left == 0 && right == 0) 9577 return unicode_result_unchanged(self); 9578 9579 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9580 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9581 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9582 return NULL; 9583 } 9584 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9585 maxchar = MAX_MAXCHAR(maxchar, fill); 9586 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9587 if (!u) 9588 return NULL; 9589 9590 kind = PyUnicode_KIND(u); 9591 
data = PyUnicode_DATA(u); 9592 if (left) 9593 FILL(kind, data, fill, 0, left); 9594 if (right) 9595 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9596 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9597 assert(_PyUnicode_CheckConsistency(u, 1)); 9598 return u; 9599} 9600 9601PyObject * 9602PyUnicode_Splitlines(PyObject *string, int keepends) 9603{ 9604 PyObject *list; 9605 9606 string = PyUnicode_FromObject(string); 9607 if (string == NULL) 9608 return NULL; 9609 if (PyUnicode_READY(string) == -1) { 9610 Py_DECREF(string); 9611 return NULL; 9612 } 9613 9614 switch (PyUnicode_KIND(string)) { 9615 case PyUnicode_1BYTE_KIND: 9616 if (PyUnicode_IS_ASCII(string)) 9617 list = asciilib_splitlines( 9618 string, PyUnicode_1BYTE_DATA(string), 9619 PyUnicode_GET_LENGTH(string), keepends); 9620 else 9621 list = ucs1lib_splitlines( 9622 string, PyUnicode_1BYTE_DATA(string), 9623 PyUnicode_GET_LENGTH(string), keepends); 9624 break; 9625 case PyUnicode_2BYTE_KIND: 9626 list = ucs2lib_splitlines( 9627 string, PyUnicode_2BYTE_DATA(string), 9628 PyUnicode_GET_LENGTH(string), keepends); 9629 break; 9630 case PyUnicode_4BYTE_KIND: 9631 list = ucs4lib_splitlines( 9632 string, PyUnicode_4BYTE_DATA(string), 9633 PyUnicode_GET_LENGTH(string), keepends); 9634 break; 9635 default: 9636 assert(0); 9637 list = 0; 9638 } 9639 Py_DECREF(string); 9640 return list; 9641} 9642 9643static PyObject * 9644split(PyObject *self, 9645 PyObject *substring, 9646 Py_ssize_t maxcount) 9647{ 9648 int kind1, kind2, kind; 9649 void *buf1, *buf2; 9650 Py_ssize_t len1, len2; 9651 PyObject* out; 9652 9653 if (maxcount < 0) 9654 maxcount = PY_SSIZE_T_MAX; 9655 9656 if (PyUnicode_READY(self) == -1) 9657 return NULL; 9658 9659 if (substring == NULL) 9660 switch (PyUnicode_KIND(self)) { 9661 case PyUnicode_1BYTE_KIND: 9662 if (PyUnicode_IS_ASCII(self)) 9663 return asciilib_split_whitespace( 9664 self, PyUnicode_1BYTE_DATA(self), 9665 PyUnicode_GET_LENGTH(self), maxcount 
9666 ); 9667 else 9668 return ucs1lib_split_whitespace( 9669 self, PyUnicode_1BYTE_DATA(self), 9670 PyUnicode_GET_LENGTH(self), maxcount 9671 ); 9672 case PyUnicode_2BYTE_KIND: 9673 return ucs2lib_split_whitespace( 9674 self, PyUnicode_2BYTE_DATA(self), 9675 PyUnicode_GET_LENGTH(self), maxcount 9676 ); 9677 case PyUnicode_4BYTE_KIND: 9678 return ucs4lib_split_whitespace( 9679 self, PyUnicode_4BYTE_DATA(self), 9680 PyUnicode_GET_LENGTH(self), maxcount 9681 ); 9682 default: 9683 assert(0); 9684 return NULL; 9685 } 9686 9687 if (PyUnicode_READY(substring) == -1) 9688 return NULL; 9689 9690 kind1 = PyUnicode_KIND(self); 9691 kind2 = PyUnicode_KIND(substring); 9692 kind = kind1 > kind2 ? kind1 : kind2; 9693 buf1 = PyUnicode_DATA(self); 9694 buf2 = PyUnicode_DATA(substring); 9695 if (kind1 != kind) 9696 buf1 = _PyUnicode_AsKind(self, kind); 9697 if (!buf1) 9698 return NULL; 9699 if (kind2 != kind) 9700 buf2 = _PyUnicode_AsKind(substring, kind); 9701 if (!buf2) { 9702 if (kind1 != kind) PyMem_Free(buf1); 9703 return NULL; 9704 } 9705 len1 = PyUnicode_GET_LENGTH(self); 9706 len2 = PyUnicode_GET_LENGTH(substring); 9707 9708 switch (kind) { 9709 case PyUnicode_1BYTE_KIND: 9710 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9711 out = asciilib_split( 9712 self, buf1, len1, buf2, len2, maxcount); 9713 else 9714 out = ucs1lib_split( 9715 self, buf1, len1, buf2, len2, maxcount); 9716 break; 9717 case PyUnicode_2BYTE_KIND: 9718 out = ucs2lib_split( 9719 self, buf1, len1, buf2, len2, maxcount); 9720 break; 9721 case PyUnicode_4BYTE_KIND: 9722 out = ucs4lib_split( 9723 self, buf1, len1, buf2, len2, maxcount); 9724 break; 9725 default: 9726 out = NULL; 9727 } 9728 if (kind1 != kind) 9729 PyMem_Free(buf1); 9730 if (kind2 != kind) 9731 PyMem_Free(buf2); 9732 return out; 9733} 9734 9735static PyObject * 9736rsplit(PyObject *self, 9737 PyObject *substring, 9738 Py_ssize_t maxcount) 9739{ 9740 int kind1, kind2, kind; 9741 void *buf1, *buf2; 9742 Py_ssize_t len1, len2; 
9743 PyObject* out; 9744 9745 if (maxcount < 0) 9746 maxcount = PY_SSIZE_T_MAX; 9747 9748 if (PyUnicode_READY(self) == -1) 9749 return NULL; 9750 9751 if (substring == NULL) 9752 switch (PyUnicode_KIND(self)) { 9753 case PyUnicode_1BYTE_KIND: 9754 if (PyUnicode_IS_ASCII(self)) 9755 return asciilib_rsplit_whitespace( 9756 self, PyUnicode_1BYTE_DATA(self), 9757 PyUnicode_GET_LENGTH(self), maxcount 9758 ); 9759 else 9760 return ucs1lib_rsplit_whitespace( 9761 self, PyUnicode_1BYTE_DATA(self), 9762 PyUnicode_GET_LENGTH(self), maxcount 9763 ); 9764 case PyUnicode_2BYTE_KIND: 9765 return ucs2lib_rsplit_whitespace( 9766 self, PyUnicode_2BYTE_DATA(self), 9767 PyUnicode_GET_LENGTH(self), maxcount 9768 ); 9769 case PyUnicode_4BYTE_KIND: 9770 return ucs4lib_rsplit_whitespace( 9771 self, PyUnicode_4BYTE_DATA(self), 9772 PyUnicode_GET_LENGTH(self), maxcount 9773 ); 9774 default: 9775 assert(0); 9776 return NULL; 9777 } 9778 9779 if (PyUnicode_READY(substring) == -1) 9780 return NULL; 9781 9782 kind1 = PyUnicode_KIND(self); 9783 kind2 = PyUnicode_KIND(substring); 9784 kind = kind1 > kind2 ? 
kind1 : kind2; 9785 buf1 = PyUnicode_DATA(self); 9786 buf2 = PyUnicode_DATA(substring); 9787 if (kind1 != kind) 9788 buf1 = _PyUnicode_AsKind(self, kind); 9789 if (!buf1) 9790 return NULL; 9791 if (kind2 != kind) 9792 buf2 = _PyUnicode_AsKind(substring, kind); 9793 if (!buf2) { 9794 if (kind1 != kind) PyMem_Free(buf1); 9795 return NULL; 9796 } 9797 len1 = PyUnicode_GET_LENGTH(self); 9798 len2 = PyUnicode_GET_LENGTH(substring); 9799 9800 switch (kind) { 9801 case PyUnicode_1BYTE_KIND: 9802 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9803 out = asciilib_rsplit( 9804 self, buf1, len1, buf2, len2, maxcount); 9805 else 9806 out = ucs1lib_rsplit( 9807 self, buf1, len1, buf2, len2, maxcount); 9808 break; 9809 case PyUnicode_2BYTE_KIND: 9810 out = ucs2lib_rsplit( 9811 self, buf1, len1, buf2, len2, maxcount); 9812 break; 9813 case PyUnicode_4BYTE_KIND: 9814 out = ucs4lib_rsplit( 9815 self, buf1, len1, buf2, len2, maxcount); 9816 break; 9817 default: 9818 out = NULL; 9819 } 9820 if (kind1 != kind) 9821 PyMem_Free(buf1); 9822 if (kind2 != kind) 9823 PyMem_Free(buf2); 9824 return out; 9825} 9826 9827static Py_ssize_t 9828anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 9829 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9830{ 9831 switch (kind) { 9832 case PyUnicode_1BYTE_KIND: 9833 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 9834 return asciilib_find(buf1, len1, buf2, len2, offset); 9835 else 9836 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9837 case PyUnicode_2BYTE_KIND: 9838 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9839 case PyUnicode_4BYTE_KIND: 9840 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9841 } 9842 assert(0); 9843 return -1; 9844} 9845 9846static Py_ssize_t 9847anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 9848 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9849{ 9850 switch (kind) { 9851 case PyUnicode_1BYTE_KIND: 9852 if 
(PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 9853 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 9854 else 9855 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 9856 case PyUnicode_2BYTE_KIND: 9857 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 9858 case PyUnicode_4BYTE_KIND: 9859 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 9860 } 9861 assert(0); 9862 return 0; 9863} 9864 9865static PyObject * 9866replace(PyObject *self, PyObject *str1, 9867 PyObject *str2, Py_ssize_t maxcount) 9868{ 9869 PyObject *u; 9870 char *sbuf = PyUnicode_DATA(self); 9871 char *buf1 = PyUnicode_DATA(str1); 9872 char *buf2 = PyUnicode_DATA(str2); 9873 int srelease = 0, release1 = 0, release2 = 0; 9874 int skind = PyUnicode_KIND(self); 9875 int kind1 = PyUnicode_KIND(str1); 9876 int kind2 = PyUnicode_KIND(str2); 9877 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 9878 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 9879 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 9880 int mayshrink; 9881 Py_UCS4 maxchar, maxchar_str2; 9882 9883 if (maxcount < 0) 9884 maxcount = PY_SSIZE_T_MAX; 9885 else if (maxcount == 0 || slen == 0) 9886 goto nothing; 9887 9888 if (str1 == str2) 9889 goto nothing; 9890 if (skind < kind1) 9891 /* substring too wide to be present */ 9892 goto nothing; 9893 9894 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9895 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 9896 /* Replacing str1 with str2 may cause a maxchar reduction in the 9897 result string. 
*/ 9898 mayshrink = (maxchar_str2 < maxchar); 9899 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2); 9900 9901 if (len1 == len2) { 9902 /* same length */ 9903 if (len1 == 0) 9904 goto nothing; 9905 if (len1 == 1) { 9906 /* replace characters */ 9907 Py_UCS4 u1, u2; 9908 int rkind; 9909 Py_ssize_t index, pos; 9910 char *src; 9911 9912 u1 = PyUnicode_READ_CHAR(str1, 0); 9913 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1); 9914 if (pos < 0) 9915 goto nothing; 9916 u2 = PyUnicode_READ_CHAR(str2, 0); 9917 u = PyUnicode_New(slen, maxchar); 9918 if (!u) 9919 goto error; 9920 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 9921 rkind = PyUnicode_KIND(u); 9922 9923 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2); 9924 index = 0; 9925 src = sbuf; 9926 while (--maxcount) 9927 { 9928 pos++; 9929 src += pos * PyUnicode_KIND(self); 9930 slen -= pos; 9931 index += pos; 9932 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1); 9933 if (pos < 0) 9934 break; 9935 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2); 9936 } 9937 } 9938 else { 9939 int rkind = skind; 9940 char *res; 9941 Py_ssize_t i; 9942 9943 if (kind1 < rkind) { 9944 /* widen substring */ 9945 buf1 = _PyUnicode_AsKind(str1, rkind); 9946 if (!buf1) goto error; 9947 release1 = 1; 9948 } 9949 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 9950 if (i < 0) 9951 goto nothing; 9952 if (rkind > kind2) { 9953 /* widen replacement */ 9954 buf2 = _PyUnicode_AsKind(str2, rkind); 9955 if (!buf2) goto error; 9956 release2 = 1; 9957 } 9958 else if (rkind < kind2) { 9959 /* widen self and buf1 */ 9960 rkind = kind2; 9961 if (release1) PyMem_Free(buf1); 9962 sbuf = _PyUnicode_AsKind(self, rkind); 9963 if (!sbuf) goto error; 9964 srelease = 1; 9965 buf1 = _PyUnicode_AsKind(str1, rkind); 9966 if (!buf1) goto error; 9967 release1 = 1; 9968 } 9969 u = PyUnicode_New(slen, maxchar); 9970 if (!u) 9971 goto error; 9972 assert(PyUnicode_KIND(u) == rkind); 9973 res = PyUnicode_DATA(u); 9974 9975 
memcpy(res, sbuf, rkind * slen); 9976 /* change everything in-place, starting with this one */ 9977 memcpy(res + rkind * i, 9978 buf2, 9979 rkind * len2); 9980 i += len1; 9981 9982 while ( --maxcount > 0) { 9983 i = anylib_find(rkind, self, 9984 sbuf+rkind*i, slen-i, 9985 str1, buf1, len1, i); 9986 if (i == -1) 9987 break; 9988 memcpy(res + rkind * i, 9989 buf2, 9990 rkind * len2); 9991 i += len1; 9992 } 9993 } 9994 } 9995 else { 9996 Py_ssize_t n, i, j, ires; 9997 Py_ssize_t new_size; 9998 int rkind = skind; 9999 char *res; 10000 10001 if (kind1 < rkind) { 10002 /* widen substring */ 10003 buf1 = _PyUnicode_AsKind(str1, rkind); 10004 if (!buf1) goto error; 10005 release1 = 1; 10006 } 10007 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10008 if (n == 0) 10009 goto nothing; 10010 if (kind2 < rkind) { 10011 /* widen replacement */ 10012 buf2 = _PyUnicode_AsKind(str2, rkind); 10013 if (!buf2) goto error; 10014 release2 = 1; 10015 } 10016 else if (kind2 > rkind) { 10017 /* widen self and buf1 */ 10018 rkind = kind2; 10019 sbuf = _PyUnicode_AsKind(self, rkind); 10020 if (!sbuf) goto error; 10021 srelease = 1; 10022 if (release1) PyMem_Free(buf1); 10023 buf1 = _PyUnicode_AsKind(str1, rkind); 10024 if (!buf1) goto error; 10025 release1 = 1; 10026 } 10027 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10028 PyUnicode_GET_LENGTH(str1))); */ 10029 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10030 PyErr_SetString(PyExc_OverflowError, 10031 "replace string is too long"); 10032 goto error; 10033 } 10034 new_size = slen + n * (len2 - len1); 10035 if (new_size == 0) { 10036 Py_INCREF(unicode_empty); 10037 u = unicode_empty; 10038 goto done; 10039 } 10040 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10041 PyErr_SetString(PyExc_OverflowError, 10042 "replace string is too long"); 10043 goto error; 10044 } 10045 u = PyUnicode_New(new_size, maxchar); 10046 if (!u) 10047 goto error; 10048 
assert(PyUnicode_KIND(u) == rkind); 10049 res = PyUnicode_DATA(u); 10050 ires = i = 0; 10051 if (len1 > 0) { 10052 while (n-- > 0) { 10053 /* look for next match */ 10054 j = anylib_find(rkind, self, 10055 sbuf + rkind * i, slen-i, 10056 str1, buf1, len1, i); 10057 if (j == -1) 10058 break; 10059 else if (j > i) { 10060 /* copy unchanged part [i:j] */ 10061 memcpy(res + rkind * ires, 10062 sbuf + rkind * i, 10063 rkind * (j-i)); 10064 ires += j - i; 10065 } 10066 /* copy substitution string */ 10067 if (len2 > 0) { 10068 memcpy(res + rkind * ires, 10069 buf2, 10070 rkind * len2); 10071 ires += len2; 10072 } 10073 i = j + len1; 10074 } 10075 if (i < slen) 10076 /* copy tail [i:] */ 10077 memcpy(res + rkind * ires, 10078 sbuf + rkind * i, 10079 rkind * (slen-i)); 10080 } 10081 else { 10082 /* interleave */ 10083 while (n > 0) { 10084 memcpy(res + rkind * ires, 10085 buf2, 10086 rkind * len2); 10087 ires += len2; 10088 if (--n <= 0) 10089 break; 10090 memcpy(res + rkind * ires, 10091 sbuf + rkind * i, 10092 rkind); 10093 ires++; 10094 i++; 10095 } 10096 memcpy(res + rkind * ires, 10097 sbuf + rkind * i, 10098 rkind * (slen-i)); 10099 } 10100 } 10101 10102 if (mayshrink) { 10103 unicode_adjust_maxchar(&u); 10104 if (u == NULL) 10105 goto error; 10106 } 10107 10108 done: 10109 if (srelease) 10110 PyMem_FREE(sbuf); 10111 if (release1) 10112 PyMem_FREE(buf1); 10113 if (release2) 10114 PyMem_FREE(buf2); 10115 assert(_PyUnicode_CheckConsistency(u, 1)); 10116 return u; 10117 10118 nothing: 10119 /* nothing to replace; return original string (when possible) */ 10120 if (srelease) 10121 PyMem_FREE(sbuf); 10122 if (release1) 10123 PyMem_FREE(buf1); 10124 if (release2) 10125 PyMem_FREE(buf2); 10126 return unicode_result_unchanged(self); 10127 10128 error: 10129 if (srelease && sbuf) 10130 PyMem_FREE(sbuf); 10131 if (release1 && buf1) 10132 PyMem_FREE(buf1); 10133 if (release2 && buf2) 10134 PyMem_FREE(buf2); 10135 return NULL; 10136} 10137 10138/* --- Unicode Object Methods 
--------------------------------------------- */ 10139 10140PyDoc_STRVAR(title__doc__, 10141 "S.title() -> str\n\ 10142\n\ 10143Return a titlecased version of S, i.e. words start with title case\n\ 10144characters, all remaining cased characters have lower case."); 10145 10146static PyObject* 10147unicode_title(PyObject *self) 10148{ 10149 if (PyUnicode_READY(self) == -1) 10150 return NULL; 10151 return case_operation(self, do_title); 10152} 10153 10154PyDoc_STRVAR(capitalize__doc__, 10155 "S.capitalize() -> str\n\ 10156\n\ 10157Return a capitalized version of S, i.e. make the first character\n\ 10158have upper case and the rest lower case."); 10159 10160static PyObject* 10161unicode_capitalize(PyObject *self) 10162{ 10163 if (PyUnicode_READY(self) == -1) 10164 return NULL; 10165 if (PyUnicode_GET_LENGTH(self) == 0) 10166 return unicode_result_unchanged(self); 10167 return case_operation(self, do_capitalize); 10168} 10169 10170PyDoc_STRVAR(casefold__doc__, 10171 "S.casefold() -> str\n\ 10172\n\ 10173Return a version of S suitable for caseless comparisons."); 10174 10175static PyObject * 10176unicode_casefold(PyObject *self) 10177{ 10178 if (PyUnicode_READY(self) == -1) 10179 return NULL; 10180 if (PyUnicode_IS_ASCII(self)) 10181 return ascii_upper_or_lower(self, 1); 10182 return case_operation(self, do_casefold); 10183} 10184 10185 10186/* Argument converter. 
Coerces to a single unicode character */ 10187 10188static int 10189convert_uc(PyObject *obj, void *addr) 10190{ 10191 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10192 PyObject *uniobj; 10193 10194 uniobj = PyUnicode_FromObject(obj); 10195 if (uniobj == NULL) { 10196 PyErr_SetString(PyExc_TypeError, 10197 "The fill character cannot be converted to Unicode"); 10198 return 0; 10199 } 10200 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10201 PyErr_SetString(PyExc_TypeError, 10202 "The fill character must be exactly one character long"); 10203 Py_DECREF(uniobj); 10204 return 0; 10205 } 10206 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10207 Py_DECREF(uniobj); 10208 return 1; 10209} 10210 10211PyDoc_STRVAR(center__doc__, 10212 "S.center(width[, fillchar]) -> str\n\ 10213\n\ 10214Return S centered in a string of length width. Padding is\n\ 10215done using the specified fill character (default is a space)"); 10216 10217static PyObject * 10218unicode_center(PyObject *self, PyObject *args) 10219{ 10220 Py_ssize_t marg, left; 10221 Py_ssize_t width; 10222 Py_UCS4 fillchar = ' '; 10223 10224 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10225 return NULL; 10226 10227 if (PyUnicode_READY(self) == -1) 10228 return NULL; 10229 10230 if (PyUnicode_GET_LENGTH(self) >= width) 10231 return unicode_result_unchanged(self); 10232 10233 marg = width - PyUnicode_GET_LENGTH(self); 10234 left = marg / 2 + (marg & width & 1); 10235 10236 return pad(self, left, marg - left, fillchar); 10237} 10238 10239/* This function assumes that str1 and str2 are readied by the caller. 
*/ 10240 10241static int 10242unicode_compare(PyObject *str1, PyObject *str2) 10243{ 10244 int kind1, kind2; 10245 void *data1, *data2; 10246 Py_ssize_t len1, len2; 10247 Py_ssize_t i, len; 10248 10249 /* a string is equal to itself */ 10250 if (str1 == str2) 10251 return 0; 10252 10253 kind1 = PyUnicode_KIND(str1); 10254 kind2 = PyUnicode_KIND(str2); 10255 data1 = PyUnicode_DATA(str1); 10256 data2 = PyUnicode_DATA(str2); 10257 len1 = PyUnicode_GET_LENGTH(str1); 10258 len2 = PyUnicode_GET_LENGTH(str2); 10259 len = Py_MIN(len1, len2); 10260 10261 if (kind1 == 1 && kind2 == 1) { 10262 int cmp = memcmp(data1, data2, len); 10263 /* normalize result of memcmp() into the range [-1; 1] */ 10264 if (cmp < 0) 10265 return -1; 10266 if (cmp > 0) 10267 return 1; 10268 } 10269 else { 10270 for (i = 0; i < len; ++i) { 10271 Py_UCS4 c1, c2; 10272 c1 = PyUnicode_READ(kind1, data1, i); 10273 c2 = PyUnicode_READ(kind2, data2, i); 10274 10275 if (c1 != c2) 10276 return (c1 < c2) ? -1 : 1; 10277 } 10278 } 10279 10280 if (len1 == len2) 10281 return 0; 10282 if (len1 < len2) 10283 return -1; 10284 else 10285 return 1; 10286} 10287 10288static int 10289unicode_compare_eq(PyObject *str1, PyObject *str2) 10290{ 10291 int kind; 10292 void *data1, *data2; 10293 Py_ssize_t len; 10294 int cmp; 10295 10296 /* a string is equal to itself */ 10297 if (str1 == str2) 10298 return 1; 10299 10300 len = PyUnicode_GET_LENGTH(str1); 10301 if (PyUnicode_GET_LENGTH(str2) != len) 10302 return 0; 10303 kind = PyUnicode_KIND(str1); 10304 if (PyUnicode_KIND(str2) != kind) 10305 return 0; 10306 data1 = PyUnicode_DATA(str1); 10307 data2 = PyUnicode_DATA(str2); 10308 10309 cmp = memcmp(data1, data2, len * kind); 10310 return (cmp == 0); 10311} 10312 10313 10314int 10315PyUnicode_Compare(PyObject *left, PyObject *right) 10316{ 10317 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10318 if (PyUnicode_READY(left) == -1 || 10319 PyUnicode_READY(right) == -1) 10320 return -1; 10321 return 
unicode_compare(left, right); 10322 } 10323 PyErr_Format(PyExc_TypeError, 10324 "Can't compare %.100s and %.100s", 10325 left->ob_type->tp_name, 10326 right->ob_type->tp_name); 10327 return -1; 10328} 10329 10330int 10331PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10332{ 10333 Py_ssize_t i; 10334 int kind; 10335 void *data; 10336 Py_UCS4 chr; 10337 10338 assert(_PyUnicode_CHECK(uni)); 10339 if (PyUnicode_READY(uni) == -1) 10340 return -1; 10341 kind = PyUnicode_KIND(uni); 10342 data = PyUnicode_DATA(uni); 10343 /* Compare Unicode string and source character set string */ 10344 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10345 if (chr != str[i]) 10346 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10347 /* This check keeps Python strings that end in '\0' from comparing equal 10348 to C strings identical up to that point. */ 10349 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10350 return 1; /* uni is longer */ 10351 if (str[i]) 10352 return -1; /* str is longer */ 10353 return 0; 10354} 10355 10356 10357#define TEST_COND(cond) \ 10358 ((cond) ? 
Py_True : Py_False) 10359 10360PyObject * 10361PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10362{ 10363 int result; 10364 PyObject *v; 10365 10366 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 10367 Py_RETURN_NOTIMPLEMENTED; 10368 10369 if (PyUnicode_READY(left) == -1 || 10370 PyUnicode_READY(right) == -1) 10371 return NULL; 10372 10373 if (op == Py_EQ || op == Py_NE) { 10374 result = unicode_compare_eq(left, right); 10375 if (op == Py_EQ) 10376 v = TEST_COND(result); 10377 else 10378 v = TEST_COND(!result); 10379 } 10380 else { 10381 result = unicode_compare(left, right); 10382 10383 /* Convert the return value to a Boolean */ 10384 switch (op) { 10385 case Py_LE: 10386 v = TEST_COND(result <= 0); 10387 break; 10388 case Py_GE: 10389 v = TEST_COND(result >= 0); 10390 break; 10391 case Py_LT: 10392 v = TEST_COND(result == -1); 10393 break; 10394 case Py_GT: 10395 v = TEST_COND(result == 1); 10396 break; 10397 default: 10398 PyErr_BadArgument(); 10399 return NULL; 10400 } 10401 } 10402 Py_INCREF(v); 10403 return v; 10404} 10405 10406int 10407PyUnicode_Contains(PyObject *container, PyObject *element) 10408{ 10409 PyObject *str, *sub; 10410 int kind1, kind2, kind; 10411 void *buf1, *buf2; 10412 Py_ssize_t len1, len2; 10413 int result; 10414 10415 /* Coerce the two arguments */ 10416 sub = PyUnicode_FromObject(element); 10417 if (!sub) { 10418 PyErr_Format(PyExc_TypeError, 10419 "'in <string>' requires string as left operand, not %s", 10420 element->ob_type->tp_name); 10421 return -1; 10422 } 10423 10424 str = PyUnicode_FromObject(container); 10425 if (!str) { 10426 Py_DECREF(sub); 10427 return -1; 10428 } 10429 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 10430 Py_DECREF(sub); 10431 Py_DECREF(str); 10432 } 10433 10434 kind1 = PyUnicode_KIND(str); 10435 kind2 = PyUnicode_KIND(sub); 10436 kind = kind1; 10437 buf1 = PyUnicode_DATA(str); 10438 buf2 = PyUnicode_DATA(sub); 10439 if (kind2 != kind) { 10440 if (kind2 > kind) { 
10441 Py_DECREF(sub); 10442 Py_DECREF(str); 10443 return 0; 10444 } 10445 buf2 = _PyUnicode_AsKind(sub, kind); 10446 } 10447 if (!buf2) { 10448 Py_DECREF(sub); 10449 Py_DECREF(str); 10450 return -1; 10451 } 10452 len1 = PyUnicode_GET_LENGTH(str); 10453 len2 = PyUnicode_GET_LENGTH(sub); 10454 10455 switch (kind) { 10456 case PyUnicode_1BYTE_KIND: 10457 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10458 break; 10459 case PyUnicode_2BYTE_KIND: 10460 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10461 break; 10462 case PyUnicode_4BYTE_KIND: 10463 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10464 break; 10465 default: 10466 result = -1; 10467 assert(0); 10468 } 10469 10470 Py_DECREF(str); 10471 Py_DECREF(sub); 10472 10473 if (kind2 != kind) 10474 PyMem_Free(buf2); 10475 10476 return result; 10477} 10478 10479/* Concat to string or Unicode object giving a new Unicode object. */ 10480 10481PyObject * 10482PyUnicode_Concat(PyObject *left, PyObject *right) 10483{ 10484 PyObject *u = NULL, *v = NULL, *w; 10485 Py_UCS4 maxchar, maxchar2; 10486 Py_ssize_t u_len, v_len, new_len; 10487 10488 /* Coerce the two arguments */ 10489 u = PyUnicode_FromObject(left); 10490 if (u == NULL) 10491 goto onError; 10492 v = PyUnicode_FromObject(right); 10493 if (v == NULL) 10494 goto onError; 10495 10496 /* Shortcuts */ 10497 if (v == unicode_empty) { 10498 Py_DECREF(v); 10499 return u; 10500 } 10501 if (u == unicode_empty) { 10502 Py_DECREF(u); 10503 return v; 10504 } 10505 10506 u_len = PyUnicode_GET_LENGTH(u); 10507 v_len = PyUnicode_GET_LENGTH(v); 10508 if (u_len > PY_SSIZE_T_MAX - v_len) { 10509 PyErr_SetString(PyExc_OverflowError, 10510 "strings are too large to concat"); 10511 goto onError; 10512 } 10513 new_len = u_len + v_len; 10514 10515 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10516 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 10517 maxchar = MAX_MAXCHAR(maxchar, maxchar2); 10518 10519 /* Concat the two Unicode strings */ 10520 w = PyUnicode_New(new_len, 
maxchar); 10521 if (w == NULL) 10522 goto onError; 10523 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 10524 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 10525 Py_DECREF(u); 10526 Py_DECREF(v); 10527 assert(_PyUnicode_CheckConsistency(w, 1)); 10528 return w; 10529 10530 onError: 10531 Py_XDECREF(u); 10532 Py_XDECREF(v); 10533 return NULL; 10534} 10535 10536void 10537PyUnicode_Append(PyObject **p_left, PyObject *right) 10538{ 10539 PyObject *left, *res; 10540 Py_UCS4 maxchar, maxchar2; 10541 Py_ssize_t left_len, right_len, new_len; 10542 10543 if (p_left == NULL) { 10544 if (!PyErr_Occurred()) 10545 PyErr_BadInternalCall(); 10546 return; 10547 } 10548 left = *p_left; 10549 if (right == NULL || !PyUnicode_Check(left)) { 10550 if (!PyErr_Occurred()) 10551 PyErr_BadInternalCall(); 10552 goto error; 10553 } 10554 10555 if (PyUnicode_READY(left) == -1) 10556 goto error; 10557 if (PyUnicode_READY(right) == -1) 10558 goto error; 10559 10560 /* Shortcuts */ 10561 if (left == unicode_empty) { 10562 Py_DECREF(left); 10563 Py_INCREF(right); 10564 *p_left = right; 10565 return; 10566 } 10567 if (right == unicode_empty) 10568 return; 10569 10570 left_len = PyUnicode_GET_LENGTH(left); 10571 right_len = PyUnicode_GET_LENGTH(right); 10572 if (left_len > PY_SSIZE_T_MAX - right_len) { 10573 PyErr_SetString(PyExc_OverflowError, 10574 "strings are too large to concat"); 10575 goto error; 10576 } 10577 new_len = left_len + right_len; 10578 10579 if (unicode_modifiable(left) 10580 && PyUnicode_CheckExact(right) 10581 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 10582 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10583 to change the structure size, but characters are stored just after 10584 the structure, and so it requires to move all characters which is 10585 not so different than duplicating the string. 
*/ 10586 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10587 { 10588 /* append inplace */ 10589 if (unicode_resize(p_left, new_len) != 0) { 10590 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10591 * deallocated so it cannot be put back into 10592 * 'variable'. The MemoryError is raised when there 10593 * is no value in 'variable', which might (very 10594 * remotely) be a cause of incompatibilities. 10595 */ 10596 goto error; 10597 } 10598 /* copy 'right' into the newly allocated area of 'left' */ 10599 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 10600 } 10601 else { 10602 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 10603 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 10604 maxchar = MAX_MAXCHAR(maxchar, maxchar2); 10605 10606 /* Concat the two Unicode strings */ 10607 res = PyUnicode_New(new_len, maxchar); 10608 if (res == NULL) 10609 goto error; 10610 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 10611 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 10612 Py_DECREF(left); 10613 *p_left = res; 10614 } 10615 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 10616 return; 10617 10618error: 10619 Py_CLEAR(*p_left); 10620} 10621 10622void 10623PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10624{ 10625 PyUnicode_Append(pleft, right); 10626 Py_XDECREF(right); 10627} 10628 10629PyDoc_STRVAR(count__doc__, 10630 "S.count(sub[, start[, end]]) -> int\n\ 10631\n\ 10632Return the number of non-overlapping occurrences of substring sub in\n\ 10633string S[start:end]. 
Optional arguments start and end are\n\ 10634interpreted as in slice notation."); 10635 10636static PyObject * 10637unicode_count(PyObject *self, PyObject *args) 10638{ 10639 PyObject *substring; 10640 Py_ssize_t start = 0; 10641 Py_ssize_t end = PY_SSIZE_T_MAX; 10642 PyObject *result; 10643 int kind1, kind2, kind; 10644 void *buf1, *buf2; 10645 Py_ssize_t len1, len2, iresult; 10646 10647 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10648 &start, &end)) 10649 return NULL; 10650 10651 kind1 = PyUnicode_KIND(self); 10652 kind2 = PyUnicode_KIND(substring); 10653 if (kind2 > kind1) 10654 return PyLong_FromLong(0); 10655 kind = kind1; 10656 buf1 = PyUnicode_DATA(self); 10657 buf2 = PyUnicode_DATA(substring); 10658 if (kind2 != kind) 10659 buf2 = _PyUnicode_AsKind(substring, kind); 10660 if (!buf2) { 10661 Py_DECREF(substring); 10662 return NULL; 10663 } 10664 len1 = PyUnicode_GET_LENGTH(self); 10665 len2 = PyUnicode_GET_LENGTH(substring); 10666 10667 ADJUST_INDICES(start, end, len1); 10668 switch (kind) { 10669 case PyUnicode_1BYTE_KIND: 10670 iresult = ucs1lib_count( 10671 ((Py_UCS1*)buf1) + start, end - start, 10672 buf2, len2, PY_SSIZE_T_MAX 10673 ); 10674 break; 10675 case PyUnicode_2BYTE_KIND: 10676 iresult = ucs2lib_count( 10677 ((Py_UCS2*)buf1) + start, end - start, 10678 buf2, len2, PY_SSIZE_T_MAX 10679 ); 10680 break; 10681 case PyUnicode_4BYTE_KIND: 10682 iresult = ucs4lib_count( 10683 ((Py_UCS4*)buf1) + start, end - start, 10684 buf2, len2, PY_SSIZE_T_MAX 10685 ); 10686 break; 10687 default: 10688 assert(0); iresult = 0; 10689 } 10690 10691 result = PyLong_FromSsize_t(iresult); 10692 10693 if (kind2 != kind) 10694 PyMem_Free(buf2); 10695 10696 Py_DECREF(substring); 10697 10698 return result; 10699} 10700 10701PyDoc_STRVAR(encode__doc__, 10702 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10703\n\ 10704Encode S using the codec registered for encoding. Default encoding\n\ 10705is 'utf-8'. 
errors may be given to set a different error\n\ 10706handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10707a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10708'xmlcharrefreplace' as well as any other name registered with\n\ 10709codecs.register_error that can handle UnicodeEncodeErrors."); 10710 10711static PyObject * 10712unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 10713{ 10714 static char *kwlist[] = {"encoding", "errors", 0}; 10715 char *encoding = NULL; 10716 char *errors = NULL; 10717 10718 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10719 kwlist, &encoding, &errors)) 10720 return NULL; 10721 return PyUnicode_AsEncodedString(self, encoding, errors); 10722} 10723 10724PyDoc_STRVAR(expandtabs__doc__, 10725 "S.expandtabs([tabsize]) -> str\n\ 10726\n\ 10727Return a copy of S where all tab characters are expanded using spaces.\n\ 10728If tabsize is not given, a tab size of 8 characters is assumed."); 10729 10730static PyObject* 10731unicode_expandtabs(PyObject *self, PyObject *args) 10732{ 10733 Py_ssize_t i, j, line_pos, src_len, incr; 10734 Py_UCS4 ch; 10735 PyObject *u; 10736 void *src_data, *dest_data; 10737 int tabsize = 8; 10738 int kind; 10739 int found; 10740 10741 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10742 return NULL; 10743 10744 if (PyUnicode_READY(self) == -1) 10745 return NULL; 10746 10747 /* First pass: determine size of output string */ 10748 src_len = PyUnicode_GET_LENGTH(self); 10749 i = j = line_pos = 0; 10750 kind = PyUnicode_KIND(self); 10751 src_data = PyUnicode_DATA(self); 10752 found = 0; 10753 for (; i < src_len; i++) { 10754 ch = PyUnicode_READ(kind, src_data, i); 10755 if (ch == '\t') { 10756 found = 1; 10757 if (tabsize > 0) { 10758 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10759 if (j > PY_SSIZE_T_MAX - incr) 10760 goto overflow; 10761 line_pos += incr; 10762 j += incr; 10763 } 10764 } 10765 else { 10766 if (j 
> PY_SSIZE_T_MAX - 1) 10767 goto overflow; 10768 line_pos++; 10769 j++; 10770 if (ch == '\n' || ch == '\r') 10771 line_pos = 0; 10772 } 10773 } 10774 if (!found) 10775 return unicode_result_unchanged(self); 10776 10777 /* Second pass: create output string and fill it */ 10778 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10779 if (!u) 10780 return NULL; 10781 dest_data = PyUnicode_DATA(u); 10782 10783 i = j = line_pos = 0; 10784 10785 for (; i < src_len; i++) { 10786 ch = PyUnicode_READ(kind, src_data, i); 10787 if (ch == '\t') { 10788 if (tabsize > 0) { 10789 incr = tabsize - (line_pos % tabsize); 10790 line_pos += incr; 10791 FILL(kind, dest_data, ' ', j, incr); 10792 j += incr; 10793 } 10794 } 10795 else { 10796 line_pos++; 10797 PyUnicode_WRITE(kind, dest_data, j, ch); 10798 j++; 10799 if (ch == '\n' || ch == '\r') 10800 line_pos = 0; 10801 } 10802 } 10803 assert (j == PyUnicode_GET_LENGTH(u)); 10804 return unicode_result(u); 10805 10806 overflow: 10807 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10808 return NULL; 10809} 10810 10811PyDoc_STRVAR(find__doc__, 10812 "S.find(sub[, start[, end]]) -> int\n\ 10813\n\ 10814Return the lowest index in S where substring sub is found,\n\ 10815such that sub is contained within S[start:end]. 
Optional\n\ 10816arguments start and end are interpreted as in slice notation.\n\ 10817\n\ 10818Return -1 on failure."); 10819 10820static PyObject * 10821unicode_find(PyObject *self, PyObject *args) 10822{ 10823 PyObject *substring; 10824 Py_ssize_t start; 10825 Py_ssize_t end; 10826 Py_ssize_t result; 10827 10828 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10829 &start, &end)) 10830 return NULL; 10831 10832 if (PyUnicode_READY(self) == -1) 10833 return NULL; 10834 if (PyUnicode_READY(substring) == -1) 10835 return NULL; 10836 10837 result = any_find_slice(1, self, substring, start, end); 10838 10839 Py_DECREF(substring); 10840 10841 if (result == -2) 10842 return NULL; 10843 10844 return PyLong_FromSsize_t(result); 10845} 10846 10847static PyObject * 10848unicode_getitem(PyObject *self, Py_ssize_t index) 10849{ 10850 void *data; 10851 enum PyUnicode_Kind kind; 10852 Py_UCS4 ch; 10853 PyObject *res; 10854 10855 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 10856 PyErr_BadArgument(); 10857 return NULL; 10858 } 10859 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 10860 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10861 return NULL; 10862 } 10863 kind = PyUnicode_KIND(self); 10864 data = PyUnicode_DATA(self); 10865 ch = PyUnicode_READ(kind, data, index); 10866 if (ch < 256) 10867 return get_latin1_char(ch); 10868 10869 res = PyUnicode_New(1, ch); 10870 if (res == NULL) 10871 return NULL; 10872 kind = PyUnicode_KIND(res); 10873 data = PyUnicode_DATA(res); 10874 PyUnicode_WRITE(kind, data, 0, ch); 10875 assert(_PyUnicode_CheckConsistency(res, 1)); 10876 return res; 10877} 10878 10879/* Believe it or not, this produces the same value for ASCII strings 10880 as bytes_hash(). 
*/ 10881static Py_hash_t 10882unicode_hash(PyObject *self) 10883{ 10884 Py_ssize_t len; 10885 Py_uhash_t x; 10886 10887#ifdef Py_DEBUG 10888 assert(_Py_HashSecret_Initialized); 10889#endif 10890 if (_PyUnicode_HASH(self) != -1) 10891 return _PyUnicode_HASH(self); 10892 if (PyUnicode_READY(self) == -1) 10893 return -1; 10894 len = PyUnicode_GET_LENGTH(self); 10895 /* 10896 We make the hash of the empty string be 0, rather than using 10897 (prefix ^ suffix), since this slightly obfuscates the hash secret 10898 */ 10899 if (len == 0) { 10900 _PyUnicode_HASH(self) = 0; 10901 return 0; 10902 } 10903 10904 /* The hash function as a macro, gets expanded three times below. */ 10905#define HASH(P) \ 10906 x ^= (Py_uhash_t) *P << 7; \ 10907 while (--len >= 0) \ 10908 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \ 10909 10910 x = (Py_uhash_t) _Py_HashSecret.prefix; 10911 switch (PyUnicode_KIND(self)) { 10912 case PyUnicode_1BYTE_KIND: { 10913 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 10914 HASH(c); 10915 break; 10916 } 10917 case PyUnicode_2BYTE_KIND: { 10918 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 10919 HASH(s); 10920 break; 10921 } 10922 default: { 10923 Py_UCS4 *l; 10924 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 10925 "Impossible switch case in unicode_hash"); 10926 l = PyUnicode_4BYTE_DATA(self); 10927 HASH(l); 10928 break; 10929 } 10930 } 10931 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self); 10932 x ^= (Py_uhash_t) _Py_HashSecret.suffix; 10933 10934 if (x == -1) 10935 x = -2; 10936 _PyUnicode_HASH(self) = x; 10937 return x; 10938} 10939#undef HASH 10940 10941PyDoc_STRVAR(index__doc__, 10942 "S.index(sub[, start[, end]]) -> int\n\ 10943\n\ 10944Like S.find() but raise ValueError when the substring is not found."); 10945 10946static PyObject * 10947unicode_index(PyObject *self, PyObject *args) 10948{ 10949 Py_ssize_t result; 10950 PyObject *substring; 10951 Py_ssize_t start; 10952 Py_ssize_t end; 10953 10954 if 
(!stringlib_parse_args_finds_unicode("index", args, &substring, 10955 &start, &end)) 10956 return NULL; 10957 10958 if (PyUnicode_READY(self) == -1) 10959 return NULL; 10960 if (PyUnicode_READY(substring) == -1) 10961 return NULL; 10962 10963 result = any_find_slice(1, self, substring, start, end); 10964 10965 Py_DECREF(substring); 10966 10967 if (result == -2) 10968 return NULL; 10969 10970 if (result < 0) { 10971 PyErr_SetString(PyExc_ValueError, "substring not found"); 10972 return NULL; 10973 } 10974 10975 return PyLong_FromSsize_t(result); 10976} 10977 10978PyDoc_STRVAR(islower__doc__, 10979 "S.islower() -> bool\n\ 10980\n\ 10981Return True if all cased characters in S are lowercase and there is\n\ 10982at least one cased character in S, False otherwise."); 10983 10984static PyObject* 10985unicode_islower(PyObject *self) 10986{ 10987 Py_ssize_t i, length; 10988 int kind; 10989 void *data; 10990 int cased; 10991 10992 if (PyUnicode_READY(self) == -1) 10993 return NULL; 10994 length = PyUnicode_GET_LENGTH(self); 10995 kind = PyUnicode_KIND(self); 10996 data = PyUnicode_DATA(self); 10997 10998 /* Shortcut for single character strings */ 10999 if (length == 1) 11000 return PyBool_FromLong( 11001 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11002 11003 /* Special case for empty strings */ 11004 if (length == 0) 11005 return PyBool_FromLong(0); 11006 11007 cased = 0; 11008 for (i = 0; i < length; i++) { 11009 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11010 11011 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11012 return PyBool_FromLong(0); 11013 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11014 cased = 1; 11015 } 11016 return PyBool_FromLong(cased); 11017} 11018 11019PyDoc_STRVAR(isupper__doc__, 11020 "S.isupper() -> bool\n\ 11021\n\ 11022Return True if all cased characters in S are uppercase and there is\n\ 11023at least one cased character in S, False otherwise."); 11024 11025static PyObject* 11026unicode_isupper(PyObject *self) 11027{ 11028 
Py_ssize_t i, length; 11029 int kind; 11030 void *data; 11031 int cased; 11032 11033 if (PyUnicode_READY(self) == -1) 11034 return NULL; 11035 length = PyUnicode_GET_LENGTH(self); 11036 kind = PyUnicode_KIND(self); 11037 data = PyUnicode_DATA(self); 11038 11039 /* Shortcut for single character strings */ 11040 if (length == 1) 11041 return PyBool_FromLong( 11042 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11043 11044 /* Special case for empty strings */ 11045 if (length == 0) 11046 return PyBool_FromLong(0); 11047 11048 cased = 0; 11049 for (i = 0; i < length; i++) { 11050 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11051 11052 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11053 return PyBool_FromLong(0); 11054 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11055 cased = 1; 11056 } 11057 return PyBool_FromLong(cased); 11058} 11059 11060PyDoc_STRVAR(istitle__doc__, 11061 "S.istitle() -> bool\n\ 11062\n\ 11063Return True if S is a titlecased string and there is at least one\n\ 11064character in S, i.e. 
upper- and titlecase characters may only\n\ 11065follow uncased characters and lowercase characters only cased ones.\n\ 11066Return False otherwise."); 11067 11068static PyObject* 11069unicode_istitle(PyObject *self) 11070{ 11071 Py_ssize_t i, length; 11072 int kind; 11073 void *data; 11074 int cased, previous_is_cased; 11075 11076 if (PyUnicode_READY(self) == -1) 11077 return NULL; 11078 length = PyUnicode_GET_LENGTH(self); 11079 kind = PyUnicode_KIND(self); 11080 data = PyUnicode_DATA(self); 11081 11082 /* Shortcut for single character strings */ 11083 if (length == 1) { 11084 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11085 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11086 (Py_UNICODE_ISUPPER(ch) != 0)); 11087 } 11088 11089 /* Special case for empty strings */ 11090 if (length == 0) 11091 return PyBool_FromLong(0); 11092 11093 cased = 0; 11094 previous_is_cased = 0; 11095 for (i = 0; i < length; i++) { 11096 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11097 11098 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11099 if (previous_is_cased) 11100 return PyBool_FromLong(0); 11101 previous_is_cased = 1; 11102 cased = 1; 11103 } 11104 else if (Py_UNICODE_ISLOWER(ch)) { 11105 if (!previous_is_cased) 11106 return PyBool_FromLong(0); 11107 previous_is_cased = 1; 11108 cased = 1; 11109 } 11110 else 11111 previous_is_cased = 0; 11112 } 11113 return PyBool_FromLong(cased); 11114} 11115 11116PyDoc_STRVAR(isspace__doc__, 11117 "S.isspace() -> bool\n\ 11118\n\ 11119Return True if all characters in S are whitespace\n\ 11120and there is at least one character in S, False otherwise."); 11121 11122static PyObject* 11123unicode_isspace(PyObject *self) 11124{ 11125 Py_ssize_t i, length; 11126 int kind; 11127 void *data; 11128 11129 if (PyUnicode_READY(self) == -1) 11130 return NULL; 11131 length = PyUnicode_GET_LENGTH(self); 11132 kind = PyUnicode_KIND(self); 11133 data = PyUnicode_DATA(self); 11134 11135 /* Shortcut for single character strings */ 11136 
if (length == 1) 11137 return PyBool_FromLong( 11138 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11139 11140 /* Special case for empty strings */ 11141 if (length == 0) 11142 return PyBool_FromLong(0); 11143 11144 for (i = 0; i < length; i++) { 11145 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11146 if (!Py_UNICODE_ISSPACE(ch)) 11147 return PyBool_FromLong(0); 11148 } 11149 return PyBool_FromLong(1); 11150} 11151 11152PyDoc_STRVAR(isalpha__doc__, 11153 "S.isalpha() -> bool\n\ 11154\n\ 11155Return True if all characters in S are alphabetic\n\ 11156and there is at least one character in S, False otherwise."); 11157 11158static PyObject* 11159unicode_isalpha(PyObject *self) 11160{ 11161 Py_ssize_t i, length; 11162 int kind; 11163 void *data; 11164 11165 if (PyUnicode_READY(self) == -1) 11166 return NULL; 11167 length = PyUnicode_GET_LENGTH(self); 11168 kind = PyUnicode_KIND(self); 11169 data = PyUnicode_DATA(self); 11170 11171 /* Shortcut for single character strings */ 11172 if (length == 1) 11173 return PyBool_FromLong( 11174 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11175 11176 /* Special case for empty strings */ 11177 if (length == 0) 11178 return PyBool_FromLong(0); 11179 11180 for (i = 0; i < length; i++) { 11181 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11182 return PyBool_FromLong(0); 11183 } 11184 return PyBool_FromLong(1); 11185} 11186 11187PyDoc_STRVAR(isalnum__doc__, 11188 "S.isalnum() -> bool\n\ 11189\n\ 11190Return True if all characters in S are alphanumeric\n\ 11191and there is at least one character in S, False otherwise."); 11192 11193static PyObject* 11194unicode_isalnum(PyObject *self) 11195{ 11196 int kind; 11197 void *data; 11198 Py_ssize_t len, i; 11199 11200 if (PyUnicode_READY(self) == -1) 11201 return NULL; 11202 11203 kind = PyUnicode_KIND(self); 11204 data = PyUnicode_DATA(self); 11205 len = PyUnicode_GET_LENGTH(self); 11206 11207 /* Shortcut for single character strings */ 11208 if (len == 1) { 11209 
const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11210 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11211 } 11212 11213 /* Special case for empty strings */ 11214 if (len == 0) 11215 return PyBool_FromLong(0); 11216 11217 for (i = 0; i < len; i++) { 11218 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11219 if (!Py_UNICODE_ISALNUM(ch)) 11220 return PyBool_FromLong(0); 11221 } 11222 return PyBool_FromLong(1); 11223} 11224 11225PyDoc_STRVAR(isdecimal__doc__, 11226 "S.isdecimal() -> bool\n\ 11227\n\ 11228Return True if there are only decimal characters in S,\n\ 11229False otherwise."); 11230 11231static PyObject* 11232unicode_isdecimal(PyObject *self) 11233{ 11234 Py_ssize_t i, length; 11235 int kind; 11236 void *data; 11237 11238 if (PyUnicode_READY(self) == -1) 11239 return NULL; 11240 length = PyUnicode_GET_LENGTH(self); 11241 kind = PyUnicode_KIND(self); 11242 data = PyUnicode_DATA(self); 11243 11244 /* Shortcut for single character strings */ 11245 if (length == 1) 11246 return PyBool_FromLong( 11247 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11248 11249 /* Special case for empty strings */ 11250 if (length == 0) 11251 return PyBool_FromLong(0); 11252 11253 for (i = 0; i < length; i++) { 11254 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11255 return PyBool_FromLong(0); 11256 } 11257 return PyBool_FromLong(1); 11258} 11259 11260PyDoc_STRVAR(isdigit__doc__, 11261 "S.isdigit() -> bool\n\ 11262\n\ 11263Return True if all characters in S are digits\n\ 11264and there is at least one character in S, False otherwise."); 11265 11266static PyObject* 11267unicode_isdigit(PyObject *self) 11268{ 11269 Py_ssize_t i, length; 11270 int kind; 11271 void *data; 11272 11273 if (PyUnicode_READY(self) == -1) 11274 return NULL; 11275 length = PyUnicode_GET_LENGTH(self); 11276 kind = PyUnicode_KIND(self); 11277 data = PyUnicode_DATA(self); 11278 11279 /* Shortcut for single character strings */ 11280 if (length == 1) { 11281 const Py_UCS4 ch = 
PyUnicode_READ(kind, data, 0); 11282 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11283 } 11284 11285 /* Special case for empty strings */ 11286 if (length == 0) 11287 return PyBool_FromLong(0); 11288 11289 for (i = 0; i < length; i++) { 11290 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11291 return PyBool_FromLong(0); 11292 } 11293 return PyBool_FromLong(1); 11294} 11295 11296PyDoc_STRVAR(isnumeric__doc__, 11297 "S.isnumeric() -> bool\n\ 11298\n\ 11299Return True if there are only numeric characters in S,\n\ 11300False otherwise."); 11301 11302static PyObject* 11303unicode_isnumeric(PyObject *self) 11304{ 11305 Py_ssize_t i, length; 11306 int kind; 11307 void *data; 11308 11309 if (PyUnicode_READY(self) == -1) 11310 return NULL; 11311 length = PyUnicode_GET_LENGTH(self); 11312 kind = PyUnicode_KIND(self); 11313 data = PyUnicode_DATA(self); 11314 11315 /* Shortcut for single character strings */ 11316 if (length == 1) 11317 return PyBool_FromLong( 11318 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11319 11320 /* Special case for empty strings */ 11321 if (length == 0) 11322 return PyBool_FromLong(0); 11323 11324 for (i = 0; i < length; i++) { 11325 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11326 return PyBool_FromLong(0); 11327 } 11328 return PyBool_FromLong(1); 11329} 11330 11331int 11332PyUnicode_IsIdentifier(PyObject *self) 11333{ 11334 int kind; 11335 void *data; 11336 Py_ssize_t i; 11337 Py_UCS4 first; 11338 11339 if (PyUnicode_READY(self) == -1) { 11340 Py_FatalError("identifier not ready"); 11341 return 0; 11342 } 11343 11344 /* Special case for empty strings */ 11345 if (PyUnicode_GET_LENGTH(self) == 0) 11346 return 0; 11347 kind = PyUnicode_KIND(self); 11348 data = PyUnicode_DATA(self); 11349 11350 /* PEP 3131 says that the first character must be in 11351 XID_Start and subsequent characters in XID_Continue, 11352 and for the ASCII range, the 2.x rules apply (i.e 11353 start with letters and underscore, continue with 
11354 letters, digits, underscore). However, given the current 11355 definition of XID_Start and XID_Continue, it is sufficient 11356 to check just for these, except that _ must be allowed 11357 as starting an identifier. */ 11358 first = PyUnicode_READ(kind, data, 0); 11359 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11360 return 0; 11361 11362 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11363 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11364 return 0; 11365 return 1; 11366} 11367 11368PyDoc_STRVAR(isidentifier__doc__, 11369 "S.isidentifier() -> bool\n\ 11370\n\ 11371Return True if S is a valid identifier according\n\ 11372to the language definition."); 11373 11374static PyObject* 11375unicode_isidentifier(PyObject *self) 11376{ 11377 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11378} 11379 11380PyDoc_STRVAR(isprintable__doc__, 11381 "S.isprintable() -> bool\n\ 11382\n\ 11383Return True if all characters in S are considered\n\ 11384printable in repr() or S is empty, False otherwise."); 11385 11386static PyObject* 11387unicode_isprintable(PyObject *self) 11388{ 11389 Py_ssize_t i, length; 11390 int kind; 11391 void *data; 11392 11393 if (PyUnicode_READY(self) == -1) 11394 return NULL; 11395 length = PyUnicode_GET_LENGTH(self); 11396 kind = PyUnicode_KIND(self); 11397 data = PyUnicode_DATA(self); 11398 11399 /* Shortcut for single character strings */ 11400 if (length == 1) 11401 return PyBool_FromLong( 11402 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11403 11404 for (i = 0; i < length; i++) { 11405 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11406 Py_RETURN_FALSE; 11407 } 11408 } 11409 Py_RETURN_TRUE; 11410} 11411 11412PyDoc_STRVAR(join__doc__, 11413 "S.join(iterable) -> str\n\ 11414\n\ 11415Return a string which is the concatenation of the strings in the\n\ 11416iterable. 
The separator between elements is S."); 11417 11418static PyObject* 11419unicode_join(PyObject *self, PyObject *data) 11420{ 11421 return PyUnicode_Join(self, data); 11422} 11423 11424static Py_ssize_t 11425unicode_length(PyObject *self) 11426{ 11427 if (PyUnicode_READY(self) == -1) 11428 return -1; 11429 return PyUnicode_GET_LENGTH(self); 11430} 11431 11432PyDoc_STRVAR(ljust__doc__, 11433 "S.ljust(width[, fillchar]) -> str\n\ 11434\n\ 11435Return S left-justified in a Unicode string of length width. Padding is\n\ 11436done using the specified fill character (default is a space)."); 11437 11438static PyObject * 11439unicode_ljust(PyObject *self, PyObject *args) 11440{ 11441 Py_ssize_t width; 11442 Py_UCS4 fillchar = ' '; 11443 11444 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11445 return NULL; 11446 11447 if (PyUnicode_READY(self) == -1) 11448 return NULL; 11449 11450 if (PyUnicode_GET_LENGTH(self) >= width) 11451 return unicode_result_unchanged(self); 11452 11453 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11454} 11455 11456PyDoc_STRVAR(lower__doc__, 11457 "S.lower() -> str\n\ 11458\n\ 11459Return a copy of the string S converted to lowercase."); 11460 11461static PyObject* 11462unicode_lower(PyObject *self) 11463{ 11464 if (PyUnicode_READY(self) == -1) 11465 return NULL; 11466 if (PyUnicode_IS_ASCII(self)) 11467 return ascii_upper_or_lower(self, 1); 11468 return case_operation(self, do_lower); 11469} 11470 11471#define LEFTSTRIP 0 11472#define RIGHTSTRIP 1 11473#define BOTHSTRIP 2 11474 11475/* Arrays indexed by above */ 11476static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11477 11478#define STRIPNAME(i) (stripformat[i]+3) 11479 11480/* externally visible for str.strip(unicode) */ 11481PyObject * 11482_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11483{ 11484 void *data; 11485 int kind; 11486 Py_ssize_t i, j, len; 11487 BLOOM_MASK sepmask; 11488 11489 if 
(PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11490 return NULL; 11491 11492 kind = PyUnicode_KIND(self); 11493 data = PyUnicode_DATA(self); 11494 len = PyUnicode_GET_LENGTH(self); 11495 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11496 PyUnicode_DATA(sepobj), 11497 PyUnicode_GET_LENGTH(sepobj)); 11498 11499 i = 0; 11500 if (striptype != RIGHTSTRIP) { 11501 while (i < len && 11502 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11503 i++; 11504 } 11505 } 11506 11507 j = len; 11508 if (striptype != LEFTSTRIP) { 11509 do { 11510 j--; 11511 } while (j >= i && 11512 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11513 j++; 11514 } 11515 11516 return PyUnicode_Substring(self, i, j); 11517} 11518 11519PyObject* 11520PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11521{ 11522 unsigned char *data; 11523 int kind; 11524 Py_ssize_t length; 11525 11526 if (PyUnicode_READY(self) == -1) 11527 return NULL; 11528 11529 length = PyUnicode_GET_LENGTH(self); 11530 end = Py_MIN(end, length); 11531 11532 if (start == 0 && end == length) 11533 return unicode_result_unchanged(self); 11534 11535 if (start < 0 || end < 0) { 11536 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11537 return NULL; 11538 } 11539 if (start >= length || end < start) { 11540 Py_INCREF(unicode_empty); 11541 return unicode_empty; 11542 } 11543 11544 length = end - start; 11545 if (PyUnicode_IS_ASCII(self)) { 11546 data = PyUnicode_1BYTE_DATA(self); 11547 return _PyUnicode_FromASCII((char*)(data + start), length); 11548 } 11549 else { 11550 kind = PyUnicode_KIND(self); 11551 data = PyUnicode_1BYTE_DATA(self); 11552 return PyUnicode_FromKindAndData(kind, 11553 data + kind * start, 11554 length); 11555 } 11556} 11557 11558static PyObject * 11559do_strip(PyObject *self, int striptype) 11560{ 11561 int kind; 11562 void *data; 11563 Py_ssize_t len, i, j; 11564 11565 if (PyUnicode_READY(self) == -1) 11566 return NULL; 11567 
11568 kind = PyUnicode_KIND(self); 11569 data = PyUnicode_DATA(self); 11570 len = PyUnicode_GET_LENGTH(self); 11571 11572 i = 0; 11573 if (striptype != RIGHTSTRIP) { 11574 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11575 i++; 11576 } 11577 } 11578 11579 j = len; 11580 if (striptype != LEFTSTRIP) { 11581 do { 11582 j--; 11583 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11584 j++; 11585 } 11586 11587 return PyUnicode_Substring(self, i, j); 11588} 11589 11590 11591static PyObject * 11592do_argstrip(PyObject *self, int striptype, PyObject *args) 11593{ 11594 PyObject *sep = NULL; 11595 11596 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11597 return NULL; 11598 11599 if (sep != NULL && sep != Py_None) { 11600 if (PyUnicode_Check(sep)) 11601 return _PyUnicode_XStrip(self, striptype, sep); 11602 else { 11603 PyErr_Format(PyExc_TypeError, 11604 "%s arg must be None or str", 11605 STRIPNAME(striptype)); 11606 return NULL; 11607 } 11608 } 11609 11610 return do_strip(self, striptype); 11611} 11612 11613 11614PyDoc_STRVAR(strip__doc__, 11615 "S.strip([chars]) -> str\n\ 11616\n\ 11617Return a copy of the string S with leading and trailing\n\ 11618whitespace removed.\n\ 11619If chars is given and not None, remove characters in chars instead."); 11620 11621static PyObject * 11622unicode_strip(PyObject *self, PyObject *args) 11623{ 11624 if (PyTuple_GET_SIZE(args) == 0) 11625 return do_strip(self, BOTHSTRIP); /* Common case */ 11626 else 11627 return do_argstrip(self, BOTHSTRIP, args); 11628} 11629 11630 11631PyDoc_STRVAR(lstrip__doc__, 11632 "S.lstrip([chars]) -> str\n\ 11633\n\ 11634Return a copy of the string S with leading whitespace removed.\n\ 11635If chars is given and not None, remove characters in chars instead."); 11636 11637static PyObject * 11638unicode_lstrip(PyObject *self, PyObject *args) 11639{ 11640 if (PyTuple_GET_SIZE(args) == 0) 11641 return do_strip(self, LEFTSTRIP); /* Common case */ 
11642 else 11643 return do_argstrip(self, LEFTSTRIP, args); 11644} 11645 11646 11647PyDoc_STRVAR(rstrip__doc__, 11648 "S.rstrip([chars]) -> str\n\ 11649\n\ 11650Return a copy of the string S with trailing whitespace removed.\n\ 11651If chars is given and not None, remove characters in chars instead."); 11652 11653static PyObject * 11654unicode_rstrip(PyObject *self, PyObject *args) 11655{ 11656 if (PyTuple_GET_SIZE(args) == 0) 11657 return do_strip(self, RIGHTSTRIP); /* Common case */ 11658 else 11659 return do_argstrip(self, RIGHTSTRIP, args); 11660} 11661 11662 11663static PyObject* 11664unicode_repeat(PyObject *str, Py_ssize_t len) 11665{ 11666 PyObject *u; 11667 Py_ssize_t nchars, n; 11668 11669 if (len < 1) { 11670 Py_INCREF(unicode_empty); 11671 return unicode_empty; 11672 } 11673 11674 /* no repeat, return original string */ 11675 if (len == 1) 11676 return unicode_result_unchanged(str); 11677 11678 if (PyUnicode_READY(str) == -1) 11679 return NULL; 11680 11681 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11682 PyErr_SetString(PyExc_OverflowError, 11683 "repeated string is too long"); 11684 return NULL; 11685 } 11686 nchars = len * PyUnicode_GET_LENGTH(str); 11687 11688 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11689 if (!u) 11690 return NULL; 11691 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11692 11693 if (PyUnicode_GET_LENGTH(str) == 1) { 11694 const int kind = PyUnicode_KIND(str); 11695 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11696 if (kind == PyUnicode_1BYTE_KIND) { 11697 void *to = PyUnicode_DATA(u); 11698 memset(to, (unsigned char)fill_char, len); 11699 } 11700 else if (kind == PyUnicode_2BYTE_KIND) { 11701 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 11702 for (n = 0; n < len; ++n) 11703 ucs2[n] = fill_char; 11704 } else { 11705 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 11706 assert(kind == PyUnicode_4BYTE_KIND); 11707 for (n = 0; n < len; ++n) 11708 ucs4[n] = fill_char; 11709 } 11710 } 
11711 else { 11712 /* number of characters copied this far */ 11713 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11714 const Py_ssize_t char_size = PyUnicode_KIND(str); 11715 char *to = (char *) PyUnicode_DATA(u); 11716 Py_MEMCPY(to, PyUnicode_DATA(str), 11717 PyUnicode_GET_LENGTH(str) * char_size); 11718 while (done < nchars) { 11719 n = (done <= nchars-done) ? done : nchars-done; 11720 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11721 done += n; 11722 } 11723 } 11724 11725 assert(_PyUnicode_CheckConsistency(u, 1)); 11726 return u; 11727} 11728 11729PyObject * 11730PyUnicode_Replace(PyObject *obj, 11731 PyObject *subobj, 11732 PyObject *replobj, 11733 Py_ssize_t maxcount) 11734{ 11735 PyObject *self; 11736 PyObject *str1; 11737 PyObject *str2; 11738 PyObject *result; 11739 11740 self = PyUnicode_FromObject(obj); 11741 if (self == NULL) 11742 return NULL; 11743 str1 = PyUnicode_FromObject(subobj); 11744 if (str1 == NULL) { 11745 Py_DECREF(self); 11746 return NULL; 11747 } 11748 str2 = PyUnicode_FromObject(replobj); 11749 if (str2 == NULL) { 11750 Py_DECREF(self); 11751 Py_DECREF(str1); 11752 return NULL; 11753 } 11754 if (PyUnicode_READY(self) == -1 || 11755 PyUnicode_READY(str1) == -1 || 11756 PyUnicode_READY(str2) == -1) 11757 result = NULL; 11758 else 11759 result = replace(self, str1, str2, maxcount); 11760 Py_DECREF(self); 11761 Py_DECREF(str1); 11762 Py_DECREF(str2); 11763 return result; 11764} 11765 11766PyDoc_STRVAR(replace__doc__, 11767 "S.replace(old, new[, count]) -> str\n\ 11768\n\ 11769Return a copy of S with all occurrences of substring\n\ 11770old replaced by new. 
If the optional argument count is\n\ 11771given, only the first count occurrences are replaced."); 11772 11773static PyObject* 11774unicode_replace(PyObject *self, PyObject *args) 11775{ 11776 PyObject *str1; 11777 PyObject *str2; 11778 Py_ssize_t maxcount = -1; 11779 PyObject *result; 11780 11781 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11782 return NULL; 11783 if (PyUnicode_READY(self) == -1) 11784 return NULL; 11785 str1 = PyUnicode_FromObject(str1); 11786 if (str1 == NULL) 11787 return NULL; 11788 str2 = PyUnicode_FromObject(str2); 11789 if (str2 == NULL) { 11790 Py_DECREF(str1); 11791 return NULL; 11792 } 11793 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) 11794 result = NULL; 11795 else 11796 result = replace(self, str1, str2, maxcount); 11797 11798 Py_DECREF(str1); 11799 Py_DECREF(str2); 11800 return result; 11801} 11802 11803static PyObject * 11804unicode_repr(PyObject *unicode) 11805{ 11806 PyObject *repr; 11807 Py_ssize_t isize; 11808 Py_ssize_t osize, squote, dquote, i, o; 11809 Py_UCS4 max, quote; 11810 int ikind, okind; 11811 void *idata, *odata; 11812 11813 if (PyUnicode_READY(unicode) == -1) 11814 return NULL; 11815 11816 isize = PyUnicode_GET_LENGTH(unicode); 11817 idata = PyUnicode_DATA(unicode); 11818 11819 /* Compute length of output, quote characters, and 11820 maximum character */ 11821 osize = 2; /* quotes */ 11822 max = 127; 11823 squote = dquote = 0; 11824 ikind = PyUnicode_KIND(unicode); 11825 for (i = 0; i < isize; i++) { 11826 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11827 switch (ch) { 11828 case '\'': squote++; osize++; break; 11829 case '"': dquote++; osize++; break; 11830 case '\\': case '\t': case '\r': case '\n': 11831 osize += 2; break; 11832 default: 11833 /* Fast-path ASCII */ 11834 if (ch < ' ' || ch == 0x7f) 11835 osize += 4; /* \xHH */ 11836 else if (ch < 0x7f) 11837 osize++; 11838 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11839 osize++; 11840 max = ch > max ? 
ch : max; 11841 } 11842 else if (ch < 0x100) 11843 osize += 4; /* \xHH */ 11844 else if (ch < 0x10000) 11845 osize += 6; /* \uHHHH */ 11846 else 11847 osize += 10; /* \uHHHHHHHH */ 11848 } 11849 } 11850 11851 quote = '\''; 11852 if (squote) { 11853 if (dquote) 11854 /* Both squote and dquote present. Use squote, 11855 and escape them */ 11856 osize += squote; 11857 else 11858 quote = '"'; 11859 } 11860 11861 repr = PyUnicode_New(osize, max); 11862 if (repr == NULL) 11863 return NULL; 11864 okind = PyUnicode_KIND(repr); 11865 odata = PyUnicode_DATA(repr); 11866 11867 PyUnicode_WRITE(okind, odata, 0, quote); 11868 PyUnicode_WRITE(okind, odata, osize-1, quote); 11869 11870 for (i = 0, o = 1; i < isize; i++) { 11871 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11872 11873 /* Escape quotes and backslashes */ 11874 if ((ch == quote) || (ch == '\\')) { 11875 PyUnicode_WRITE(okind, odata, o++, '\\'); 11876 PyUnicode_WRITE(okind, odata, o++, ch); 11877 continue; 11878 } 11879 11880 /* Map special whitespace to '\t', \n', '\r' */ 11881 if (ch == '\t') { 11882 PyUnicode_WRITE(okind, odata, o++, '\\'); 11883 PyUnicode_WRITE(okind, odata, o++, 't'); 11884 } 11885 else if (ch == '\n') { 11886 PyUnicode_WRITE(okind, odata, o++, '\\'); 11887 PyUnicode_WRITE(okind, odata, o++, 'n'); 11888 } 11889 else if (ch == '\r') { 11890 PyUnicode_WRITE(okind, odata, o++, '\\'); 11891 PyUnicode_WRITE(okind, odata, o++, 'r'); 11892 } 11893 11894 /* Map non-printable US ASCII to '\xhh' */ 11895 else if (ch < ' ' || ch == 0x7F) { 11896 PyUnicode_WRITE(okind, odata, o++, '\\'); 11897 PyUnicode_WRITE(okind, odata, o++, 'x'); 11898 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 11899 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 11900 } 11901 11902 /* Copy ASCII characters as-is */ 11903 else if (ch < 0x7F) { 11904 PyUnicode_WRITE(okind, odata, o++, ch); 11905 } 11906 11907 /* Non-ASCII characters */ 11908 else { 11909 /* Map Unicode whitespace and control 
characters 11910 (categories Z* and C* except ASCII space) 11911 */ 11912 if (!Py_UNICODE_ISPRINTABLE(ch)) { 11913 PyUnicode_WRITE(okind, odata, o++, '\\'); 11914 /* Map 8-bit characters to '\xhh' */ 11915 if (ch <= 0xff) { 11916 PyUnicode_WRITE(okind, odata, o++, 'x'); 11917 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 11918 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 11919 } 11920 /* Map 16-bit characters to '\uxxxx' */ 11921 else if (ch <= 0xffff) { 11922 PyUnicode_WRITE(okind, odata, o++, 'u'); 11923 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 11924 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 11925 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 11926 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 11927 } 11928 /* Map 21-bit characters to '\U00xxxxxx' */ 11929 else { 11930 PyUnicode_WRITE(okind, odata, o++, 'U'); 11931 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 11932 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 11933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 11934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 11935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 11936 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 11937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 11938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 11939 } 11940 } 11941 /* Copy characters as-is */ 11942 else { 11943 PyUnicode_WRITE(okind, odata, o++, ch); 11944 } 11945 } 11946 } 11947 /* Closing quote already added at the beginning */ 11948 assert(_PyUnicode_CheckConsistency(repr, 1)); 11949 return repr; 11950} 11951 11952PyDoc_STRVAR(rfind__doc__, 11953 "S.rfind(sub[, start[, end]]) -> int\n\ 11954\n\ 11955Return the highest index in S where substring sub is found,\n\ 11956such that sub is contained 
within S[start:end]. Optional\n\ 11957arguments start and end are interpreted as in slice notation.\n\ 11958\n\ 11959Return -1 on failure."); 11960 11961static PyObject * 11962unicode_rfind(PyObject *self, PyObject *args) 11963{ 11964 PyObject *substring; 11965 Py_ssize_t start; 11966 Py_ssize_t end; 11967 Py_ssize_t result; 11968 11969 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 11970 &start, &end)) 11971 return NULL; 11972 11973 if (PyUnicode_READY(self) == -1) 11974 return NULL; 11975 if (PyUnicode_READY(substring) == -1) 11976 return NULL; 11977 11978 result = any_find_slice(-1, self, substring, start, end); 11979 11980 Py_DECREF(substring); 11981 11982 if (result == -2) 11983 return NULL; 11984 11985 return PyLong_FromSsize_t(result); 11986} 11987 11988PyDoc_STRVAR(rindex__doc__, 11989 "S.rindex(sub[, start[, end]]) -> int\n\ 11990\n\ 11991Like S.rfind() but raise ValueError when the substring is not found."); 11992 11993static PyObject * 11994unicode_rindex(PyObject *self, PyObject *args) 11995{ 11996 PyObject *substring; 11997 Py_ssize_t start; 11998 Py_ssize_t end; 11999 Py_ssize_t result; 12000 12001 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12002 &start, &end)) 12003 return NULL; 12004 12005 if (PyUnicode_READY(self) == -1) 12006 return NULL; 12007 if (PyUnicode_READY(substring) == -1) 12008 return NULL; 12009 12010 result = any_find_slice(-1, self, substring, start, end); 12011 12012 Py_DECREF(substring); 12013 12014 if (result == -2) 12015 return NULL; 12016 12017 if (result < 0) { 12018 PyErr_SetString(PyExc_ValueError, "substring not found"); 12019 return NULL; 12020 } 12021 12022 return PyLong_FromSsize_t(result); 12023} 12024 12025PyDoc_STRVAR(rjust__doc__, 12026 "S.rjust(width[, fillchar]) -> str\n\ 12027\n\ 12028Return S right-justified in a string of length width. 
Padding is\n\ 12029done using the specified fill character (default is a space)."); 12030 12031static PyObject * 12032unicode_rjust(PyObject *self, PyObject *args) 12033{ 12034 Py_ssize_t width; 12035 Py_UCS4 fillchar = ' '; 12036 12037 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12038 return NULL; 12039 12040 if (PyUnicode_READY(self) == -1) 12041 return NULL; 12042 12043 if (PyUnicode_GET_LENGTH(self) >= width) 12044 return unicode_result_unchanged(self); 12045 12046 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12047} 12048 12049PyObject * 12050PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12051{ 12052 PyObject *result; 12053 12054 s = PyUnicode_FromObject(s); 12055 if (s == NULL) 12056 return NULL; 12057 if (sep != NULL) { 12058 sep = PyUnicode_FromObject(sep); 12059 if (sep == NULL) { 12060 Py_DECREF(s); 12061 return NULL; 12062 } 12063 } 12064 12065 result = split(s, sep, maxsplit); 12066 12067 Py_DECREF(s); 12068 Py_XDECREF(sep); 12069 return result; 12070} 12071 12072PyDoc_STRVAR(split__doc__, 12073 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12074\n\ 12075Return a list of the words in S, using sep as the\n\ 12076delimiter string. If maxsplit is given, at most maxsplit\n\ 12077splits are done. 
If sep is not specified or is None, any\n\ 12078whitespace string is a separator and empty strings are\n\ 12079removed from the result."); 12080 12081static PyObject* 12082unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12083{ 12084 static char *kwlist[] = {"sep", "maxsplit", 0}; 12085 PyObject *substring = Py_None; 12086 Py_ssize_t maxcount = -1; 12087 12088 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12089 kwlist, &substring, &maxcount)) 12090 return NULL; 12091 12092 if (substring == Py_None) 12093 return split(self, NULL, maxcount); 12094 else if (PyUnicode_Check(substring)) 12095 return split(self, substring, maxcount); 12096 else 12097 return PyUnicode_Split(self, substring, maxcount); 12098} 12099 12100PyObject * 12101PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12102{ 12103 PyObject* str_obj; 12104 PyObject* sep_obj; 12105 PyObject* out; 12106 int kind1, kind2, kind; 12107 void *buf1 = NULL, *buf2 = NULL; 12108 Py_ssize_t len1, len2; 12109 12110 str_obj = PyUnicode_FromObject(str_in); 12111 if (!str_obj) 12112 return NULL; 12113 sep_obj = PyUnicode_FromObject(sep_in); 12114 if (!sep_obj) { 12115 Py_DECREF(str_obj); 12116 return NULL; 12117 } 12118 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12119 Py_DECREF(sep_obj); 12120 Py_DECREF(str_obj); 12121 return NULL; 12122 } 12123 12124 kind1 = PyUnicode_KIND(str_obj); 12125 kind2 = PyUnicode_KIND(sep_obj); 12126 kind = Py_MAX(kind1, kind2); 12127 buf1 = PyUnicode_DATA(str_obj); 12128 if (kind1 != kind) 12129 buf1 = _PyUnicode_AsKind(str_obj, kind); 12130 if (!buf1) 12131 goto onError; 12132 buf2 = PyUnicode_DATA(sep_obj); 12133 if (kind2 != kind) 12134 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12135 if (!buf2) 12136 goto onError; 12137 len1 = PyUnicode_GET_LENGTH(str_obj); 12138 len2 = PyUnicode_GET_LENGTH(sep_obj); 12139 12140 switch (PyUnicode_KIND(str_obj)) { 12141 case PyUnicode_1BYTE_KIND: 12142 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12143 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12144 else 12145 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12146 break; 12147 case PyUnicode_2BYTE_KIND: 12148 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12149 break; 12150 case PyUnicode_4BYTE_KIND: 12151 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12152 break; 12153 default: 12154 assert(0); 12155 out = 0; 12156 } 12157 12158 Py_DECREF(sep_obj); 12159 Py_DECREF(str_obj); 12160 if (kind1 != kind) 12161 PyMem_Free(buf1); 12162 if (kind2 != kind) 12163 PyMem_Free(buf2); 12164 12165 return out; 12166 onError: 12167 Py_DECREF(sep_obj); 12168 Py_DECREF(str_obj); 12169 if (kind1 != kind && buf1) 12170 PyMem_Free(buf1); 12171 if (kind2 != kind && buf2) 12172 PyMem_Free(buf2); 12173 return NULL; 12174} 12175 12176 12177PyObject * 12178PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12179{ 12180 PyObject* str_obj; 12181 PyObject* sep_obj; 12182 PyObject* out; 12183 int kind1, kind2, kind; 12184 void *buf1 = NULL, *buf2 = NULL; 12185 Py_ssize_t len1, len2; 12186 12187 str_obj = PyUnicode_FromObject(str_in); 12188 if (!str_obj) 12189 return NULL; 12190 sep_obj = PyUnicode_FromObject(sep_in); 12191 if (!sep_obj) { 12192 Py_DECREF(str_obj); 12193 return NULL; 12194 } 12195 12196 kind1 = PyUnicode_KIND(str_in); 12197 kind2 = PyUnicode_KIND(sep_obj); 12198 kind = Py_MAX(kind1, kind2); 12199 buf1 = PyUnicode_DATA(str_in); 12200 if (kind1 != kind) 12201 buf1 = _PyUnicode_AsKind(str_in, kind); 12202 if (!buf1) 12203 goto onError; 12204 buf2 = PyUnicode_DATA(sep_obj); 12205 if (kind2 != kind) 12206 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12207 if (!buf2) 12208 goto onError; 12209 len1 = PyUnicode_GET_LENGTH(str_obj); 12210 len2 = PyUnicode_GET_LENGTH(sep_obj); 12211 12212 switch (PyUnicode_KIND(str_in)) { 12213 case PyUnicode_1BYTE_KIND: 12214 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12215 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12216 else 12217 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12218 break; 12219 case PyUnicode_2BYTE_KIND: 12220 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12221 break; 12222 case PyUnicode_4BYTE_KIND: 12223 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12224 break; 12225 default: 12226 assert(0); 12227 out = 0; 12228 } 12229 12230 Py_DECREF(sep_obj); 12231 Py_DECREF(str_obj); 12232 if (kind1 != kind) 12233 PyMem_Free(buf1); 12234 if (kind2 != kind) 12235 PyMem_Free(buf2); 12236 12237 return out; 12238 onError: 12239 Py_DECREF(sep_obj); 12240 Py_DECREF(str_obj); 12241 if (kind1 != kind && buf1) 12242 PyMem_Free(buf1); 12243 if (kind2 != kind && buf2) 12244 PyMem_Free(buf2); 12245 return NULL; 12246} 12247 12248PyDoc_STRVAR(partition__doc__, 12249 "S.partition(sep) -> (head, sep, tail)\n\ 12250\n\ 12251Search for the separator sep in S, and return the part before it,\n\ 12252the separator itself, and the part after it. If the separator is not\n\ 12253found, return S and two empty strings."); 12254 12255static PyObject* 12256unicode_partition(PyObject *self, PyObject *separator) 12257{ 12258 return PyUnicode_Partition(self, separator); 12259} 12260 12261PyDoc_STRVAR(rpartition__doc__, 12262 "S.rpartition(sep) -> (head, sep, tail)\n\ 12263\n\ 12264Search for the separator sep in S, starting at the end of S, and return\n\ 12265the part before it, the separator itself, and the part after it. 
If the\n\ 12266separator is not found, return two empty strings and S."); 12267 12268static PyObject* 12269unicode_rpartition(PyObject *self, PyObject *separator) 12270{ 12271 return PyUnicode_RPartition(self, separator); 12272} 12273 12274PyObject * 12275PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12276{ 12277 PyObject *result; 12278 12279 s = PyUnicode_FromObject(s); 12280 if (s == NULL) 12281 return NULL; 12282 if (sep != NULL) { 12283 sep = PyUnicode_FromObject(sep); 12284 if (sep == NULL) { 12285 Py_DECREF(s); 12286 return NULL; 12287 } 12288 } 12289 12290 result = rsplit(s, sep, maxsplit); 12291 12292 Py_DECREF(s); 12293 Py_XDECREF(sep); 12294 return result; 12295} 12296 12297PyDoc_STRVAR(rsplit__doc__, 12298 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12299\n\ 12300Return a list of the words in S, using sep as the\n\ 12301delimiter string, starting at the end of the string and\n\ 12302working to the front. If maxsplit is given, at most maxsplit\n\ 12303splits are done. 
If sep is not specified, any whitespace string\n\ 12304is a separator."); 12305 12306static PyObject* 12307unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12308{ 12309 static char *kwlist[] = {"sep", "maxsplit", 0}; 12310 PyObject *substring = Py_None; 12311 Py_ssize_t maxcount = -1; 12312 12313 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12314 kwlist, &substring, &maxcount)) 12315 return NULL; 12316 12317 if (substring == Py_None) 12318 return rsplit(self, NULL, maxcount); 12319 else if (PyUnicode_Check(substring)) 12320 return rsplit(self, substring, maxcount); 12321 else 12322 return PyUnicode_RSplit(self, substring, maxcount); 12323} 12324 12325PyDoc_STRVAR(splitlines__doc__, 12326 "S.splitlines([keepends]) -> list of strings\n\ 12327\n\ 12328Return a list of the lines in S, breaking at line boundaries.\n\ 12329Line breaks are not included in the resulting list unless keepends\n\ 12330is given and true."); 12331 12332static PyObject* 12333unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12334{ 12335 static char *kwlist[] = {"keepends", 0}; 12336 int keepends = 0; 12337 12338 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12339 kwlist, &keepends)) 12340 return NULL; 12341 12342 return PyUnicode_Splitlines(self, keepends); 12343} 12344 12345static 12346PyObject *unicode_str(PyObject *self) 12347{ 12348 return unicode_result_unchanged(self); 12349} 12350 12351PyDoc_STRVAR(swapcase__doc__, 12352 "S.swapcase() -> str\n\ 12353\n\ 12354Return a copy of S with uppercase characters converted to lowercase\n\ 12355and vice versa."); 12356 12357static PyObject* 12358unicode_swapcase(PyObject *self) 12359{ 12360 if (PyUnicode_READY(self) == -1) 12361 return NULL; 12362 return case_operation(self, do_swapcase); 12363} 12364 12365PyDoc_STRVAR(maketrans__doc__, 12366 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12367\n\ 12368Return a translation table usable for str.translate().\n\ 12369If there is only 
one argument, it must be a dictionary mapping Unicode\n\ 12370ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12371Character keys will be then converted to ordinals.\n\ 12372If there are two arguments, they must be strings of equal length, and\n\ 12373in the resulting dictionary, each character in x will be mapped to the\n\ 12374character at the same position in y. If there is a third argument, it\n\ 12375must be a string, whose characters will be mapped to None in the result."); 12376 12377static PyObject* 12378unicode_maketrans(PyObject *null, PyObject *args) 12379{ 12380 PyObject *x, *y = NULL, *z = NULL; 12381 PyObject *new = NULL, *key, *value; 12382 Py_ssize_t i = 0; 12383 int res; 12384 12385 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12386 return NULL; 12387 new = PyDict_New(); 12388 if (!new) 12389 return NULL; 12390 if (y != NULL) { 12391 int x_kind, y_kind, z_kind; 12392 void *x_data, *y_data, *z_data; 12393 12394 /* x must be a string too, of equal length */ 12395 if (!PyUnicode_Check(x)) { 12396 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12397 "be a string if there is a second argument"); 12398 goto err; 12399 } 12400 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12401 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12402 "arguments must have equal length"); 12403 goto err; 12404 } 12405 /* create entries for translating chars in x to those in y */ 12406 x_kind = PyUnicode_KIND(x); 12407 y_kind = PyUnicode_KIND(y); 12408 x_data = PyUnicode_DATA(x); 12409 y_data = PyUnicode_DATA(y); 12410 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12411 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12412 if (!key) 12413 goto err; 12414 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12415 if (!value) { 12416 Py_DECREF(key); 12417 goto err; 12418 } 12419 res = PyDict_SetItem(new, key, value); 12420 Py_DECREF(key); 12421 Py_DECREF(value); 12422 if (res < 
0) 12423 goto err; 12424 } 12425 /* create entries for deleting chars in z */ 12426 if (z != NULL) { 12427 z_kind = PyUnicode_KIND(z); 12428 z_data = PyUnicode_DATA(z); 12429 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 12430 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12431 if (!key) 12432 goto err; 12433 res = PyDict_SetItem(new, key, Py_None); 12434 Py_DECREF(key); 12435 if (res < 0) 12436 goto err; 12437 } 12438 } 12439 } else { 12440 int kind; 12441 void *data; 12442 12443 /* x must be a dict */ 12444 if (!PyDict_CheckExact(x)) { 12445 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12446 "to maketrans it must be a dict"); 12447 goto err; 12448 } 12449 /* copy entries into the new dict, converting string keys to int keys */ 12450 while (PyDict_Next(x, &i, &key, &value)) { 12451 if (PyUnicode_Check(key)) { 12452 /* convert string keys to integer keys */ 12453 PyObject *newkey; 12454 if (PyUnicode_GET_LENGTH(key) != 1) { 12455 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12456 "table must be of length 1"); 12457 goto err; 12458 } 12459 kind = PyUnicode_KIND(key); 12460 data = PyUnicode_DATA(key); 12461 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12462 if (!newkey) 12463 goto err; 12464 res = PyDict_SetItem(new, newkey, value); 12465 Py_DECREF(newkey); 12466 if (res < 0) 12467 goto err; 12468 } else if (PyLong_Check(key)) { 12469 /* just keep integer keys */ 12470 if (PyDict_SetItem(new, key, value) < 0) 12471 goto err; 12472 } else { 12473 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12474 "be strings or integers"); 12475 goto err; 12476 } 12477 } 12478 } 12479 return new; 12480 err: 12481 Py_DECREF(new); 12482 return NULL; 12483} 12484 12485PyDoc_STRVAR(translate__doc__, 12486 "S.translate(table) -> str\n\ 12487\n\ 12488Return a copy of the string S, where all characters have been mapped\n\ 12489through the given translation table, which must be a mapping of\n\ 12490Unicode 
ordinals to Unicode ordinals, strings, or None.\n\ 12491Unmapped characters are left untouched. Characters mapped to None\n\ 12492are deleted."); 12493 12494static PyObject* 12495unicode_translate(PyObject *self, PyObject *table) 12496{ 12497 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12498} 12499 12500PyDoc_STRVAR(upper__doc__, 12501 "S.upper() -> str\n\ 12502\n\ 12503Return a copy of S converted to uppercase."); 12504 12505static PyObject* 12506unicode_upper(PyObject *self) 12507{ 12508 if (PyUnicode_READY(self) == -1) 12509 return NULL; 12510 if (PyUnicode_IS_ASCII(self)) 12511 return ascii_upper_or_lower(self, 0); 12512 return case_operation(self, do_upper); 12513} 12514 12515PyDoc_STRVAR(zfill__doc__, 12516 "S.zfill(width) -> str\n\ 12517\n\ 12518Pad a numeric string S with zeros on the left, to fill a field\n\ 12519of the specified width. The string S is never truncated."); 12520 12521static PyObject * 12522unicode_zfill(PyObject *self, PyObject *args) 12523{ 12524 Py_ssize_t fill; 12525 PyObject *u; 12526 Py_ssize_t width; 12527 int kind; 12528 void *data; 12529 Py_UCS4 chr; 12530 12531 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12532 return NULL; 12533 12534 if (PyUnicode_READY(self) == -1) 12535 return NULL; 12536 12537 if (PyUnicode_GET_LENGTH(self) >= width) 12538 return unicode_result_unchanged(self); 12539 12540 fill = width - PyUnicode_GET_LENGTH(self); 12541 12542 u = pad(self, fill, 0, '0'); 12543 12544 if (u == NULL) 12545 return NULL; 12546 12547 kind = PyUnicode_KIND(u); 12548 data = PyUnicode_DATA(u); 12549 chr = PyUnicode_READ(kind, data, fill); 12550 12551 if (chr == '+' || chr == '-') { 12552 /* move sign to beginning of string */ 12553 PyUnicode_WRITE(kind, data, 0, chr); 12554 PyUnicode_WRITE(kind, data, fill, '0'); 12555 } 12556 12557 assert(_PyUnicode_CheckConsistency(u, 1)); 12558 return u; 12559} 12560 12561#if 0 12562static PyObject * 12563unicode__decimal2ascii(PyObject *self) 12564{ 12565 return 
PyUnicode_TransformDecimalAndSpaceToASCII(self); 12566} 12567#endif 12568 12569PyDoc_STRVAR(startswith__doc__, 12570 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12571\n\ 12572Return True if S starts with the specified prefix, False otherwise.\n\ 12573With optional start, test S beginning at that position.\n\ 12574With optional end, stop comparing S at that position.\n\ 12575prefix can also be a tuple of strings to try."); 12576 12577static PyObject * 12578unicode_startswith(PyObject *self, 12579 PyObject *args) 12580{ 12581 PyObject *subobj; 12582 PyObject *substring; 12583 Py_ssize_t start = 0; 12584 Py_ssize_t end = PY_SSIZE_T_MAX; 12585 int result; 12586 12587 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12588 return NULL; 12589 if (PyTuple_Check(subobj)) { 12590 Py_ssize_t i; 12591 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12592 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 12593 if (substring == NULL) 12594 return NULL; 12595 result = tailmatch(self, substring, start, end, -1); 12596 Py_DECREF(substring); 12597 if (result) { 12598 Py_RETURN_TRUE; 12599 } 12600 } 12601 /* nothing matched */ 12602 Py_RETURN_FALSE; 12603 } 12604 substring = PyUnicode_FromObject(subobj); 12605 if (substring == NULL) { 12606 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12607 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12608 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12609 return NULL; 12610 } 12611 result = tailmatch(self, substring, start, end, -1); 12612 Py_DECREF(substring); 12613 return PyBool_FromLong(result); 12614} 12615 12616 12617PyDoc_STRVAR(endswith__doc__, 12618 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12619\n\ 12620Return True if S ends with the specified suffix, False otherwise.\n\ 12621With optional start, test S beginning at that position.\n\ 12622With optional end, stop comparing S at that position.\n\ 12623suffix can also be a tuple of strings to try."); 12624 
12625static PyObject * 12626unicode_endswith(PyObject *self, 12627 PyObject *args) 12628{ 12629 PyObject *subobj; 12630 PyObject *substring; 12631 Py_ssize_t start = 0; 12632 Py_ssize_t end = PY_SSIZE_T_MAX; 12633 int result; 12634 12635 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12636 return NULL; 12637 if (PyTuple_Check(subobj)) { 12638 Py_ssize_t i; 12639 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12640 substring = PyUnicode_FromObject( 12641 PyTuple_GET_ITEM(subobj, i)); 12642 if (substring == NULL) 12643 return NULL; 12644 result = tailmatch(self, substring, start, end, +1); 12645 Py_DECREF(substring); 12646 if (result) { 12647 Py_RETURN_TRUE; 12648 } 12649 } 12650 Py_RETURN_FALSE; 12651 } 12652 substring = PyUnicode_FromObject(subobj); 12653 if (substring == NULL) { 12654 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12655 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12656 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12657 return NULL; 12658 } 12659 result = tailmatch(self, substring, start, end, +1); 12660 Py_DECREF(substring); 12661 return PyBool_FromLong(result); 12662} 12663 12664Py_LOCAL_INLINE(void) 12665_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 12666{ 12667 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 12668 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 12669 writer->data = PyUnicode_DATA(writer->buffer); 12670 writer->kind = PyUnicode_KIND(writer->buffer); 12671} 12672 12673void 12674_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) 12675{ 12676 memset(writer, 0, sizeof(*writer)); 12677#ifdef Py_DEBUG 12678 writer->kind = 5; /* invalid kind */ 12679#endif 12680 writer->min_length = Py_MAX(min_length, 100); 12681 writer->overallocate = (min_length > 0); 12682} 12683 12684int 12685_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 12686 Py_ssize_t length, Py_UCS4 maxchar) 12687{ 12688 Py_ssize_t newlen; 12689 PyObject *newbuffer; 
12690 12691 assert(length > 0); 12692 12693 if (length > PY_SSIZE_T_MAX - writer->pos) { 12694 PyErr_NoMemory(); 12695 return -1; 12696 } 12697 newlen = writer->pos + length; 12698 12699 if (writer->buffer == NULL) { 12700 if (writer->overallocate) { 12701 /* overallocate 25% to limit the number of resize */ 12702 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) 12703 newlen += newlen / 4; 12704 if (newlen < writer->min_length) 12705 newlen = writer->min_length; 12706 } 12707 writer->buffer = PyUnicode_New(newlen, maxchar); 12708 if (writer->buffer == NULL) 12709 return -1; 12710 _PyUnicodeWriter_Update(writer); 12711 return 0; 12712 } 12713 12714 if (newlen > writer->size) { 12715 if (writer->overallocate) { 12716 /* overallocate 25% to limit the number of resize */ 12717 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) 12718 newlen += newlen / 4; 12719 if (newlen < writer->min_length) 12720 newlen = writer->min_length; 12721 } 12722 12723 if (maxchar > writer->maxchar || writer->readonly) { 12724 /* resize + widen */ 12725 newbuffer = PyUnicode_New(newlen, maxchar); 12726 if (newbuffer == NULL) 12727 return -1; 12728 _PyUnicode_FastCopyCharacters(newbuffer, 0, 12729 writer->buffer, 0, writer->pos); 12730 Py_DECREF(writer->buffer); 12731 writer->readonly = 0; 12732 } 12733 else { 12734 newbuffer = resize_compact(writer->buffer, newlen); 12735 if (newbuffer == NULL) 12736 return -1; 12737 } 12738 writer->buffer = newbuffer; 12739 _PyUnicodeWriter_Update(writer); 12740 } 12741 else if (maxchar > writer->maxchar) { 12742 assert(!writer->readonly); 12743 newbuffer = PyUnicode_New(writer->size, maxchar); 12744 if (newbuffer == NULL) 12745 return -1; 12746 _PyUnicode_FastCopyCharacters(newbuffer, 0, 12747 writer->buffer, 0, writer->pos); 12748 Py_DECREF(writer->buffer); 12749 writer->buffer = newbuffer; 12750 _PyUnicodeWriter_Update(writer); 12751 } 12752 return 0; 12753} 12754 12755int 12756_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 12757{ 12758 Py_UCS4 
maxchar; 12759 Py_ssize_t len; 12760 12761 if (PyUnicode_READY(str) == -1) 12762 return -1; 12763 len = PyUnicode_GET_LENGTH(str); 12764 if (len == 0) 12765 return 0; 12766 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 12767 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 12768 if (writer->buffer == NULL && !writer->overallocate) { 12769 Py_INCREF(str); 12770 writer->buffer = str; 12771 _PyUnicodeWriter_Update(writer); 12772 writer->readonly = 1; 12773 writer->size = 0; 12774 writer->pos += len; 12775 return 0; 12776 } 12777 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 12778 return -1; 12779 } 12780 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 12781 str, 0, len); 12782 writer->pos += len; 12783 return 0; 12784} 12785 12786int 12787_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len) 12788{ 12789 Py_UCS4 maxchar; 12790 12791 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 12792 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 12793 return -1; 12794 unicode_write_cstr(writer->buffer, writer->pos, str, len); 12795 writer->pos += len; 12796 return 0; 12797} 12798 12799PyObject * 12800_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 12801{ 12802 if (writer->pos == 0) { 12803 Py_XDECREF(writer->buffer); 12804 Py_INCREF(unicode_empty); 12805 return unicode_empty; 12806 } 12807 if (writer->readonly) { 12808 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos); 12809 return writer->buffer; 12810 } 12811 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 12812 PyObject *newbuffer; 12813 newbuffer = resize_compact(writer->buffer, writer->pos); 12814 if (newbuffer == NULL) { 12815 Py_DECREF(writer->buffer); 12816 return NULL; 12817 } 12818 writer->buffer = newbuffer; 12819 } 12820 assert(_PyUnicode_CheckConsistency(writer->buffer, 1)); 12821 return writer->buffer; 12822} 12823 12824void 12825_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 12826{ 
12827 Py_CLEAR(writer->buffer); 12828} 12829 12830#include "stringlib/unicode_format.h" 12831 12832PyDoc_STRVAR(format__doc__, 12833 "S.format(*args, **kwargs) -> str\n\ 12834\n\ 12835Return a formatted version of S, using substitutions from args and kwargs.\n\ 12836The substitutions are identified by braces ('{' and '}')."); 12837 12838PyDoc_STRVAR(format_map__doc__, 12839 "S.format_map(mapping) -> str\n\ 12840\n\ 12841Return a formatted version of S, using substitutions from mapping.\n\ 12842The substitutions are identified by braces ('{' and '}')."); 12843 12844static PyObject * 12845unicode__format__(PyObject* self, PyObject* args) 12846{ 12847 PyObject *format_spec; 12848 _PyUnicodeWriter writer; 12849 int ret; 12850 12851 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12852 return NULL; 12853 12854 if (PyUnicode_READY(self) == -1) 12855 return NULL; 12856 _PyUnicodeWriter_Init(&writer, 0); 12857 ret = _PyUnicode_FormatAdvancedWriter(&writer, 12858 self, format_spec, 0, 12859 PyUnicode_GET_LENGTH(format_spec)); 12860 if (ret == -1) { 12861 _PyUnicodeWriter_Dealloc(&writer); 12862 return NULL; 12863 } 12864 return _PyUnicodeWriter_Finish(&writer); 12865} 12866 12867PyDoc_STRVAR(p_format__doc__, 12868 "S.__format__(format_spec) -> str\n\ 12869\n\ 12870Return a formatted version of S as described by format_spec."); 12871 12872static PyObject * 12873unicode__sizeof__(PyObject *v) 12874{ 12875 Py_ssize_t size; 12876 12877 /* If it's a compact object, account for base structure + 12878 character data. */ 12879 if (PyUnicode_IS_COMPACT_ASCII(v)) 12880 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12881 else if (PyUnicode_IS_COMPACT(v)) 12882 size = sizeof(PyCompactUnicodeObject) + 12883 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 12884 else { 12885 /* If it is a two-block object, account for base object, and 12886 for character block if present. 
*/ 12887 size = sizeof(PyUnicodeObject); 12888 if (_PyUnicode_DATA_ANY(v)) 12889 size += (PyUnicode_GET_LENGTH(v) + 1) * 12890 PyUnicode_KIND(v); 12891 } 12892 /* If the wstr pointer is present, account for it unless it is shared 12893 with the data pointer. Check if the data is not shared. */ 12894 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12895 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12896 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12897 size += PyUnicode_UTF8_LENGTH(v) + 1; 12898 12899 return PyLong_FromSsize_t(size); 12900} 12901 12902PyDoc_STRVAR(sizeof__doc__, 12903 "S.__sizeof__() -> size of S in memory, in bytes"); 12904 12905static PyObject * 12906unicode_getnewargs(PyObject *v) 12907{ 12908 PyObject *copy = _PyUnicode_Copy(v); 12909 if (!copy) 12910 return NULL; 12911 return Py_BuildValue("(N)", copy); 12912} 12913 12914static PyMethodDef unicode_methods[] = { 12915 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 12916 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 12917 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 12918 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 12919 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 12920 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 12921 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 12922 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 12923 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 12924 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 12925 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 12926 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 12927 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 12928 {"index", (PyCFunction) unicode_index, METH_VARARGS, 
index__doc__}, 12929 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 12930 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 12931 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 12932 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 12933 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 12934 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 12935 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 12936 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 12937 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 12938 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 12939 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 12940 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 12941 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 12942 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 12943 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 12944 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 12945 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 12946 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 12947 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 12948 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 12949 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 12950 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 12951 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 12952 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 12953 {"isidentifier", (PyCFunction) unicode_isidentifier, 
METH_NOARGS, isidentifier__doc__}, 12954 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 12955 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 12956 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 12957 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 12958 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 12959 {"maketrans", (PyCFunction) unicode_maketrans, 12960 METH_VARARGS | METH_STATIC, maketrans__doc__}, 12961 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 12962#if 0 12963 /* These methods are just used for debugging the implementation. */ 12964 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 12965#endif 12966 12967 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 12968 {NULL, NULL} 12969}; 12970 12971static PyObject * 12972unicode_mod(PyObject *v, PyObject *w) 12973{ 12974 if (!PyUnicode_Check(v)) 12975 Py_RETURN_NOTIMPLEMENTED; 12976 return PyUnicode_Format(v, w); 12977} 12978 12979static PyNumberMethods unicode_as_number = { 12980 0, /*nb_add*/ 12981 0, /*nb_subtract*/ 12982 0, /*nb_multiply*/ 12983 unicode_mod, /*nb_remainder*/ 12984}; 12985 12986static PySequenceMethods unicode_as_sequence = { 12987 (lenfunc) unicode_length, /* sq_length */ 12988 PyUnicode_Concat, /* sq_concat */ 12989 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 12990 (ssizeargfunc) unicode_getitem, /* sq_item */ 12991 0, /* sq_slice */ 12992 0, /* sq_ass_item */ 12993 0, /* sq_ass_slice */ 12994 PyUnicode_Contains, /* sq_contains */ 12995}; 12996 12997static PyObject* 12998unicode_subscript(PyObject* self, PyObject* item) 12999{ 13000 if (PyUnicode_READY(self) == -1) 13001 return NULL; 13002 13003 if (PyIndex_Check(item)) { 13004 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13005 if (i == -1 && PyErr_Occurred()) 13006 return NULL; 13007 if (i < 
0) 13008 i += PyUnicode_GET_LENGTH(self); 13009 return unicode_getitem(self, i); 13010 } else if (PySlice_Check(item)) { 13011 Py_ssize_t start, stop, step, slicelength, cur, i; 13012 PyObject *result; 13013 void *src_data, *dest_data; 13014 int src_kind, dest_kind; 13015 Py_UCS4 ch, max_char, kind_limit; 13016 13017 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13018 &start, &stop, &step, &slicelength) < 0) { 13019 return NULL; 13020 } 13021 13022 if (slicelength <= 0) { 13023 Py_INCREF(unicode_empty); 13024 return unicode_empty; 13025 } else if (start == 0 && step == 1 && 13026 slicelength == PyUnicode_GET_LENGTH(self)) { 13027 return unicode_result_unchanged(self); 13028 } else if (step == 1) { 13029 return PyUnicode_Substring(self, 13030 start, start + slicelength); 13031 } 13032 /* General case */ 13033 src_kind = PyUnicode_KIND(self); 13034 src_data = PyUnicode_DATA(self); 13035 if (!PyUnicode_IS_ASCII(self)) { 13036 kind_limit = kind_maxchar_limit(src_kind); 13037 max_char = 0; 13038 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13039 ch = PyUnicode_READ(src_kind, src_data, cur); 13040 if (ch > max_char) { 13041 max_char = ch; 13042 if (max_char >= kind_limit) 13043 break; 13044 } 13045 } 13046 } 13047 else 13048 max_char = 127; 13049 result = PyUnicode_New(slicelength, max_char); 13050 if (result == NULL) 13051 return NULL; 13052 dest_kind = PyUnicode_KIND(result); 13053 dest_data = PyUnicode_DATA(result); 13054 13055 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13056 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13057 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13058 } 13059 assert(_PyUnicode_CheckConsistency(result, 1)); 13060 return result; 13061 } else { 13062 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13063 return NULL; 13064 } 13065} 13066 13067static PyMappingMethods unicode_as_mapping = { 13068 (lenfunc)unicode_length, /* mp_length */ 13069 
(binaryfunc)unicode_subscript, /* mp_subscript */ 13070 (objobjargproc)0, /* mp_ass_subscript */ 13071}; 13072 13073 13074/* Helpers for PyUnicode_Format() */ 13075 13076struct unicode_formatter_t { 13077 PyObject *args; 13078 int args_owned; 13079 Py_ssize_t arglen, argidx; 13080 PyObject *dict; 13081 13082 enum PyUnicode_Kind fmtkind; 13083 Py_ssize_t fmtcnt, fmtpos; 13084 void *fmtdata; 13085 PyObject *fmtstr; 13086 13087 _PyUnicodeWriter writer; 13088}; 13089 13090struct unicode_format_arg_t { 13091 Py_UCS4 ch; 13092 int flags; 13093 Py_ssize_t width; 13094 int prec; 13095 int sign; 13096}; 13097 13098static PyObject * 13099unicode_format_getnextarg(struct unicode_formatter_t *ctx) 13100{ 13101 Py_ssize_t argidx = ctx->argidx; 13102 13103 if (argidx < ctx->arglen) { 13104 ctx->argidx++; 13105 if (ctx->arglen < 0) 13106 return ctx->args; 13107 else 13108 return PyTuple_GetItem(ctx->args, argidx); 13109 } 13110 PyErr_SetString(PyExc_TypeError, 13111 "not enough arguments for format string"); 13112 return NULL; 13113} 13114 13115/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13116 13117/* Format a float into the writer if the writer is not NULL, or into *p_output 13118 otherwise. 13119 13120 Return 0 on success, raise an exception and return -1 on error. 
*/ 13121static int 13122formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 13123 PyObject **p_output, 13124 _PyUnicodeWriter *writer) 13125{ 13126 char *p; 13127 double x; 13128 Py_ssize_t len; 13129 int prec; 13130 int dtoa_flags; 13131 13132 x = PyFloat_AsDouble(v); 13133 if (x == -1.0 && PyErr_Occurred()) 13134 return -1; 13135 13136 prec = arg->prec; 13137 if (prec < 0) 13138 prec = 6; 13139 13140 if (arg->flags & F_ALT) 13141 dtoa_flags = Py_DTSF_ALT; 13142 else 13143 dtoa_flags = 0; 13144 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 13145 if (p == NULL) 13146 return -1; 13147 len = strlen(p); 13148 if (writer) { 13149 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) { 13150 PyMem_Free(p); 13151 return -1; 13152 } 13153 unicode_write_cstr(writer->buffer, writer->pos, p, len); 13154 writer->pos += len; 13155 } 13156 else 13157 *p_output = _PyUnicode_FromASCII(p, len); 13158 PyMem_Free(p); 13159 return 0; 13160} 13161 13162/* formatlong() emulates the format codes d, u, o, x and X, and 13163 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13164 * Python's regular ints. 13165 * Return value: a new PyUnicodeObject*, or NULL if error. 13166 * The output string is of the form 13167 * "-"? ("0x" | "0X")? digit+ 13168 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13169 * set in flags. The case of hex digits will be correct, 13170 * There will be at least prec digits, zero-filled on the left if 13171 * necessary to get that many. 13172 * val object to be converted 13173 * flags bitmask of format flags; only F_ALT is looked at 13174 * prec minimum number of digits; 0-fill on left if needed 13175 * type a character in [duoxX]; u acts the same as d 13176 * 13177 * CAUTION: o, x and X conversions on regular ints can never 13178 * produce a '-' sign, but can for Python's unbounded ints. 
13179 */ 13180static PyObject* 13181formatlong(PyObject *val, struct unicode_format_arg_t *arg) 13182{ 13183 PyObject *result = NULL; 13184 char *buf; 13185 Py_ssize_t i; 13186 int sign; /* 1 if '-', else 0 */ 13187 int len; /* number of characters */ 13188 Py_ssize_t llen; 13189 int numdigits; /* len == numnondigits + numdigits */ 13190 int numnondigits = 0; 13191 int prec = arg->prec; 13192 int type = arg->ch; 13193 13194 /* Avoid exceeding SSIZE_T_MAX */ 13195 if (prec > INT_MAX-3) { 13196 PyErr_SetString(PyExc_OverflowError, 13197 "precision too large"); 13198 return NULL; 13199 } 13200 13201 assert(PyLong_Check(val)); 13202 13203 switch (type) { 13204 default: 13205 assert(!"'type' not in [diuoxX]"); 13206 case 'd': 13207 case 'i': 13208 case 'u': 13209 /* Special-case boolean: we want 0/1 */ 13210 if (PyBool_Check(val)) 13211 result = PyNumber_ToBase(val, 10); 13212 else 13213 result = Py_TYPE(val)->tp_str(val); 13214 break; 13215 case 'o': 13216 numnondigits = 2; 13217 result = PyNumber_ToBase(val, 8); 13218 break; 13219 case 'x': 13220 case 'X': 13221 numnondigits = 2; 13222 result = PyNumber_ToBase(val, 16); 13223 break; 13224 } 13225 if (!result) 13226 return NULL; 13227 13228 assert(unicode_modifiable(result)); 13229 assert(PyUnicode_IS_READY(result)); 13230 assert(PyUnicode_IS_ASCII(result)); 13231 13232 /* To modify the string in-place, there can only be one reference. 
*/ 13233 if (Py_REFCNT(result) != 1) { 13234 PyErr_BadInternalCall(); 13235 return NULL; 13236 } 13237 buf = PyUnicode_DATA(result); 13238 llen = PyUnicode_GET_LENGTH(result); 13239 if (llen > INT_MAX) { 13240 PyErr_SetString(PyExc_ValueError, 13241 "string too large in _PyBytes_FormatLong"); 13242 return NULL; 13243 } 13244 len = (int)llen; 13245 sign = buf[0] == '-'; 13246 numnondigits += sign; 13247 numdigits = len - numnondigits; 13248 assert(numdigits > 0); 13249 13250 /* Get rid of base marker unless F_ALT */ 13251 if (((arg->flags & F_ALT) == 0 && 13252 (type == 'o' || type == 'x' || type == 'X'))) { 13253 assert(buf[sign] == '0'); 13254 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 13255 buf[sign+1] == 'o'); 13256 numnondigits -= 2; 13257 buf += 2; 13258 len -= 2; 13259 if (sign) 13260 buf[0] = '-'; 13261 assert(len == numnondigits + numdigits); 13262 assert(numdigits > 0); 13263 } 13264 13265 /* Fill with leading zeroes to meet minimum width. */ 13266 if (prec > numdigits) { 13267 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 13268 numnondigits + prec); 13269 char *b1; 13270 if (!r1) { 13271 Py_DECREF(result); 13272 return NULL; 13273 } 13274 b1 = PyBytes_AS_STRING(r1); 13275 for (i = 0; i < numnondigits; ++i) 13276 *b1++ = *buf++; 13277 for (i = 0; i < prec - numdigits; i++) 13278 *b1++ = '0'; 13279 for (i = 0; i < numdigits; i++) 13280 *b1++ = *buf++; 13281 *b1 = '\0'; 13282 Py_DECREF(result); 13283 result = r1; 13284 buf = PyBytes_AS_STRING(result); 13285 len = numnondigits + prec; 13286 } 13287 13288 /* Fix up case for hex conversions. */ 13289 if (type == 'X') { 13290 /* Need to convert all lower case letters to upper case. 13291 and need to convert 0x to 0X (and -0x to -0X). 
*/ 13292 for (i = 0; i < len; i++) 13293 if (buf[i] >= 'a' && buf[i] <= 'x') 13294 buf[i] -= 'a'-'A'; 13295 } 13296 if (!PyUnicode_Check(result) 13297 || buf != PyUnicode_DATA(result)) { 13298 PyObject *unicode; 13299 unicode = _PyUnicode_FromASCII(buf, len); 13300 Py_DECREF(result); 13301 result = unicode; 13302 } 13303 else if (len != PyUnicode_GET_LENGTH(result)) { 13304 if (PyUnicode_Resize(&result, len) < 0) 13305 Py_CLEAR(result); 13306 } 13307 return result; 13308} 13309 13310/* Format an integer. 13311 * Return 1 if the number has been formatted into the writer, 13312 * 0 if the number has been formatted into *p_output 13313 * -1 and raise an exception on error */ 13314static int 13315mainformatlong(PyObject *v, 13316 struct unicode_format_arg_t *arg, 13317 PyObject **p_output, 13318 _PyUnicodeWriter *writer) 13319{ 13320 PyObject *iobj, *res; 13321 char type = (char)arg->ch; 13322 13323 if (!PyNumber_Check(v)) 13324 goto wrongtype; 13325 13326 if (!PyLong_Check(v)) { 13327 iobj = PyNumber_Long(v); 13328 if (iobj == NULL) { 13329 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13330 goto wrongtype; 13331 return -1; 13332 } 13333 assert(PyLong_Check(iobj)); 13334 } 13335 else { 13336 iobj = v; 13337 Py_INCREF(iobj); 13338 } 13339 13340 if (PyLong_CheckExact(v) 13341 && arg->width == -1 && arg->prec == -1 13342 && !(arg->flags & (F_SIGN | F_BLANK)) 13343 && type != 'X') 13344 { 13345 /* Fast path */ 13346 int alternate = arg->flags & F_ALT; 13347 int base; 13348 13349 switch(type) 13350 { 13351 default: 13352 assert(0 && "'type' not in [diuoxX]"); 13353 case 'd': 13354 case 'i': 13355 case 'u': 13356 base = 10; 13357 break; 13358 case 'o': 13359 base = 8; 13360 break; 13361 case 'x': 13362 case 'X': 13363 base = 16; 13364 break; 13365 } 13366 13367 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 13368 Py_DECREF(iobj); 13369 return -1; 13370 } 13371 Py_DECREF(iobj); 13372 return 1; 13373 } 13374 13375 res = formatlong(iobj, arg); 13376 
Py_DECREF(iobj); 13377 if (res == NULL) 13378 return -1; 13379 *p_output = res; 13380 return 0; 13381 13382wrongtype: 13383 PyErr_Format(PyExc_TypeError, 13384 "%%%c format: a number is required, " 13385 "not %.200s", 13386 type, Py_TYPE(v)->tp_name); 13387 return -1; 13388} 13389 13390static Py_UCS4 13391formatchar(PyObject *v) 13392{ 13393 /* presume that the buffer is at least 3 characters long */ 13394 if (PyUnicode_Check(v)) { 13395 if (PyUnicode_GET_LENGTH(v) == 1) { 13396 return PyUnicode_READ_CHAR(v, 0); 13397 } 13398 goto onError; 13399 } 13400 else { 13401 /* Integer input truncated to a character */ 13402 long x; 13403 x = PyLong_AsLong(v); 13404 if (x == -1 && PyErr_Occurred()) 13405 goto onError; 13406 13407 if (x < 0 || x > MAX_UNICODE) { 13408 PyErr_SetString(PyExc_OverflowError, 13409 "%c arg not in range(0x110000)"); 13410 return (Py_UCS4) -1; 13411 } 13412 13413 return (Py_UCS4) x; 13414 } 13415 13416 onError: 13417 PyErr_SetString(PyExc_TypeError, 13418 "%c requires int or char"); 13419 return (Py_UCS4) -1; 13420} 13421 13422/* Parse options of an argument: flags, width, precision. 13423 Handle also "%(name)" syntax. 13424 13425 Return 0 if the argument has been formatted into arg->str. 13426 Return 1 if the argument has been written into ctx->writer, 13427 Raise an exception and return -1 on error. */ 13428static int 13429unicode_format_arg_parse(struct unicode_formatter_t *ctx, 13430 struct unicode_format_arg_t *arg) 13431{ 13432#define FORMAT_READ(ctx) \ 13433 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 13434 13435 PyObject *v; 13436 13437 arg->ch = FORMAT_READ(ctx); 13438 if (arg->ch == '(') { 13439 /* Get argument value from a dictionary. Example: "%(name)s". 
*/ 13440 Py_ssize_t keystart; 13441 Py_ssize_t keylen; 13442 PyObject *key; 13443 int pcount = 1; 13444 13445 if (ctx->dict == NULL) { 13446 PyErr_SetString(PyExc_TypeError, 13447 "format requires a mapping"); 13448 return -1; 13449 } 13450 ++ctx->fmtpos; 13451 --ctx->fmtcnt; 13452 keystart = ctx->fmtpos; 13453 /* Skip over balanced parentheses */ 13454 while (pcount > 0 && --ctx->fmtcnt >= 0) { 13455 arg->ch = FORMAT_READ(ctx); 13456 if (arg->ch == ')') 13457 --pcount; 13458 else if (arg->ch == '(') 13459 ++pcount; 13460 ctx->fmtpos++; 13461 } 13462 keylen = ctx->fmtpos - keystart - 1; 13463 if (ctx->fmtcnt < 0 || pcount > 0) { 13464 PyErr_SetString(PyExc_ValueError, 13465 "incomplete format key"); 13466 return -1; 13467 } 13468 key = PyUnicode_Substring(ctx->fmtstr, 13469 keystart, keystart + keylen); 13470 if (key == NULL) 13471 return -1; 13472 if (ctx->args_owned) { 13473 Py_DECREF(ctx->args); 13474 ctx->args_owned = 0; 13475 } 13476 ctx->args = PyObject_GetItem(ctx->dict, key); 13477 Py_DECREF(key); 13478 if (ctx->args == NULL) 13479 return -1; 13480 ctx->args_owned = 1; 13481 ctx->arglen = -1; 13482 ctx->argidx = -2; 13483 } 13484 13485 /* Parse flags. Example: "%+i" => flags=F_SIGN. */ 13486 arg->flags = 0; 13487 while (--ctx->fmtcnt >= 0) { 13488 arg->ch = FORMAT_READ(ctx); 13489 ctx->fmtpos++; 13490 switch (arg->ch) { 13491 case '-': arg->flags |= F_LJUST; continue; 13492 case '+': arg->flags |= F_SIGN; continue; 13493 case ' ': arg->flags |= F_BLANK; continue; 13494 case '#': arg->flags |= F_ALT; continue; 13495 case '0': arg->flags |= F_ZERO; continue; 13496 } 13497 break; 13498 } 13499 13500 /* Parse width. 
Example: "%10s" => width=10 */ 13501 arg->width = -1; 13502 if (arg->ch == '*') { 13503 v = unicode_format_getnextarg(ctx); 13504 if (v == NULL) 13505 return -1; 13506 if (!PyLong_Check(v)) { 13507 PyErr_SetString(PyExc_TypeError, 13508 "* wants int"); 13509 return -1; 13510 } 13511 arg->width = PyLong_AsLong(v); 13512 if (arg->width == -1 && PyErr_Occurred()) 13513 return -1; 13514 if (arg->width < 0) { 13515 arg->flags |= F_LJUST; 13516 arg->width = -arg->width; 13517 } 13518 if (--ctx->fmtcnt >= 0) { 13519 arg->ch = FORMAT_READ(ctx); 13520 ctx->fmtpos++; 13521 } 13522 } 13523 else if (arg->ch >= '0' && arg->ch <= '9') { 13524 arg->width = arg->ch - '0'; 13525 while (--ctx->fmtcnt >= 0) { 13526 arg->ch = FORMAT_READ(ctx); 13527 ctx->fmtpos++; 13528 if (arg->ch < '0' || arg->ch > '9') 13529 break; 13530 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 13531 mixing signed and unsigned comparison. Since arg->ch is between 13532 '0' and '9', casting to int is safe. */ 13533 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 13534 PyErr_SetString(PyExc_ValueError, 13535 "width too big"); 13536 return -1; 13537 } 13538 arg->width = arg->width*10 + (arg->ch - '0'); 13539 } 13540 } 13541 13542 /* Parse precision. 
Example: "%.3f" => prec=3 */ 13543 arg->prec = -1; 13544 if (arg->ch == '.') { 13545 arg->prec = 0; 13546 if (--ctx->fmtcnt >= 0) { 13547 arg->ch = FORMAT_READ(ctx); 13548 ctx->fmtpos++; 13549 } 13550 if (arg->ch == '*') { 13551 v = unicode_format_getnextarg(ctx); 13552 if (v == NULL) 13553 return -1; 13554 if (!PyLong_Check(v)) { 13555 PyErr_SetString(PyExc_TypeError, 13556 "* wants int"); 13557 return -1; 13558 } 13559 arg->prec = PyLong_AsLong(v); 13560 if (arg->prec == -1 && PyErr_Occurred()) 13561 return -1; 13562 if (arg->prec < 0) 13563 arg->prec = 0; 13564 if (--ctx->fmtcnt >= 0) { 13565 arg->ch = FORMAT_READ(ctx); 13566 ctx->fmtpos++; 13567 } 13568 } 13569 else if (arg->ch >= '0' && arg->ch <= '9') { 13570 arg->prec = arg->ch - '0'; 13571 while (--ctx->fmtcnt >= 0) { 13572 arg->ch = FORMAT_READ(ctx); 13573 ctx->fmtpos++; 13574 if (arg->ch < '0' || arg->ch > '9') 13575 break; 13576 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 13577 PyErr_SetString(PyExc_ValueError, 13578 "precision too big"); 13579 return -1; 13580 } 13581 arg->prec = arg->prec*10 + (arg->ch - '0'); 13582 } 13583 } 13584 } 13585 13586 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 13587 if (ctx->fmtcnt >= 0) { 13588 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 13589 if (--ctx->fmtcnt >= 0) { 13590 arg->ch = FORMAT_READ(ctx); 13591 ctx->fmtpos++; 13592 } 13593 } 13594 } 13595 if (ctx->fmtcnt < 0) { 13596 PyErr_SetString(PyExc_ValueError, 13597 "incomplete format"); 13598 return -1; 13599 } 13600 return 0; 13601 13602#undef FORMAT_READ 13603} 13604 13605/* Format one argument. Supported conversion specifiers: 13606 13607 - "s", "r", "a": any type 13608 - "i", "d", "u", "o", "x", "X": int 13609 - "e", "E", "f", "F", "g", "G": float 13610 - "c": int or str (1 character) 13611 13612 Return 0 if the argument has been formatted into *p_str, 13613 1 if the argument has been written into ctx->writer, 13614 -1 on error. 
*/ 13615static int 13616unicode_format_arg_format(struct unicode_formatter_t *ctx, 13617 struct unicode_format_arg_t *arg, 13618 PyObject **p_str) 13619{ 13620 PyObject *v; 13621 _PyUnicodeWriter *writer = &ctx->writer; 13622 13623 if (ctx->fmtcnt == 0) 13624 ctx->writer.overallocate = 0; 13625 13626 if (arg->ch == '%') { 13627 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1) 13628 return -1; 13629 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%'); 13630 writer->pos += 1; 13631 return 1; 13632 } 13633 13634 v = unicode_format_getnextarg(ctx); 13635 if (v == NULL) 13636 return -1; 13637 13638 arg->sign = 0; 13639 13640 switch (arg->ch) { 13641 13642 case 's': 13643 case 'r': 13644 case 'a': 13645 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { 13646 /* Fast path */ 13647 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) 13648 return -1; 13649 return 1; 13650 } 13651 13652 if (PyUnicode_CheckExact(v) && arg->ch == 's') { 13653 *p_str = v; 13654 Py_INCREF(*p_str); 13655 } 13656 else { 13657 if (arg->ch == 's') 13658 *p_str = PyObject_Str(v); 13659 else if (arg->ch == 'r') 13660 *p_str = PyObject_Repr(v); 13661 else 13662 *p_str = PyObject_ASCII(v); 13663 } 13664 break; 13665 13666 case 'i': 13667 case 'd': 13668 case 'u': 13669 case 'o': 13670 case 'x': 13671 case 'X': 13672 { 13673 int ret = mainformatlong(v, arg, p_str, writer); 13674 if (ret != 0) 13675 return ret; 13676 arg->sign = 1; 13677 break; 13678 } 13679 13680 case 'e': 13681 case 'E': 13682 case 'f': 13683 case 'F': 13684 case 'g': 13685 case 'G': 13686 if (arg->width == -1 && arg->prec == -1 13687 && !(arg->flags & (F_SIGN | F_BLANK))) 13688 { 13689 /* Fast path */ 13690 if (formatfloat(v, arg, NULL, writer) == -1) 13691 return -1; 13692 return 1; 13693 } 13694 13695 arg->sign = 1; 13696 if (formatfloat(v, arg, p_str, NULL) == -1) 13697 return -1; 13698 break; 13699 13700 case 'c': 13701 { 13702 Py_UCS4 ch = formatchar(v); 13703 if (ch == (Py_UCS4) -1) 
13704 return -1; 13705 if (arg->width == -1 && arg->prec == -1) { 13706 /* Fast path */ 13707 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1) 13708 return -1; 13709 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13710 writer->pos += 1; 13711 return 1; 13712 } 13713 *p_str = PyUnicode_FromOrdinal(ch); 13714 break; 13715 } 13716 13717 default: 13718 PyErr_Format(PyExc_ValueError, 13719 "unsupported format character '%c' (0x%x) " 13720 "at index %zd", 13721 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', 13722 (int)arg->ch, 13723 ctx->fmtpos - 1); 13724 return -1; 13725 } 13726 if (*p_str == NULL) 13727 return -1; 13728 assert (PyUnicode_Check(*p_str)); 13729 return 0; 13730} 13731 13732static int 13733unicode_format_arg_output(struct unicode_formatter_t *ctx, 13734 struct unicode_format_arg_t *arg, 13735 PyObject *str) 13736{ 13737 Py_ssize_t len; 13738 enum PyUnicode_Kind kind; 13739 void *pbuf; 13740 Py_ssize_t pindex; 13741 Py_UCS4 signchar; 13742 Py_ssize_t buflen; 13743 Py_UCS4 maxchar, bufmaxchar; 13744 Py_ssize_t sublen; 13745 _PyUnicodeWriter *writer = &ctx->writer; 13746 Py_UCS4 fill; 13747 13748 fill = ' '; 13749 if (arg->sign && arg->flags & F_ZERO) 13750 fill = '0'; 13751 13752 if (PyUnicode_READY(str) == -1) 13753 return -1; 13754 13755 len = PyUnicode_GET_LENGTH(str); 13756 if ((arg->width == -1 || arg->width <= len) 13757 && (arg->prec == -1 || arg->prec >= len) 13758 && !(arg->flags & (F_SIGN | F_BLANK))) 13759 { 13760 /* Fast path */ 13761 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) 13762 return -1; 13763 return 0; 13764 } 13765 13766 /* Truncate the string for "s", "r" and "a" formats 13767 if the precision is set */ 13768 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { 13769 if (arg->prec >= 0 && len > arg->prec) 13770 len = arg->prec; 13771 } 13772 13773 /* Adjust sign and width */ 13774 kind = PyUnicode_KIND(str); 13775 pbuf = PyUnicode_DATA(str); 13776 pindex = 0; 13777 signchar = '\0'; 13778 if (arg->sign) 
{ 13779 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 13780 if (ch == '-' || ch == '+') { 13781 signchar = ch; 13782 len--; 13783 pindex++; 13784 } 13785 else if (arg->flags & F_SIGN) 13786 signchar = '+'; 13787 else if (arg->flags & F_BLANK) 13788 signchar = ' '; 13789 else 13790 arg->sign = 0; 13791 } 13792 if (arg->width < len) 13793 arg->width = len; 13794 13795 /* Prepare the writer */ 13796 bufmaxchar = 127; 13797 if (!(arg->flags & F_LJUST)) { 13798 if (arg->sign) { 13799 if ((arg->width-1) > len) 13800 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); 13801 } 13802 else { 13803 if (arg->width > len) 13804 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); 13805 } 13806 } 13807 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); 13808 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar); 13809 buflen = arg->width; 13810 if (arg->sign && len == arg->width) 13811 buflen++; 13812 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1) 13813 return -1; 13814 13815 /* Write the sign if needed */ 13816 if (arg->sign) { 13817 if (fill != ' ') { 13818 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 13819 writer->pos += 1; 13820 } 13821 if (arg->width > len) 13822 arg->width--; 13823 } 13824 13825 /* Write the numeric prefix for "x", "X" and "o" formats 13826 if the alternate form is used. 13827 For example, write "0x" for the "%#x" format. 
*/ 13828 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 13829 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13830 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); 13831 if (fill != ' ') { 13832 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 13833 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 13834 writer->pos += 2; 13835 pindex += 2; 13836 } 13837 arg->width -= 2; 13838 if (arg->width < 0) 13839 arg->width = 0; 13840 len -= 2; 13841 } 13842 13843 /* Pad left with the fill character if needed */ 13844 if (arg->width > len && !(arg->flags & F_LJUST)) { 13845 sublen = arg->width - len; 13846 FILL(writer->kind, writer->data, fill, writer->pos, sublen); 13847 writer->pos += sublen; 13848 arg->width = len; 13849 } 13850 13851 /* If padding with spaces: write sign if needed and/or numeric prefix if 13852 the alternate form is used */ 13853 if (fill == ' ') { 13854 if (arg->sign) { 13855 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 13856 writer->pos += 1; 13857 } 13858 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 13859 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13860 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); 13861 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 13862 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 13863 writer->pos += 2; 13864 pindex += 2; 13865 } 13866 } 13867 13868 /* Write characters */ 13869 if (len) { 13870 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13871 str, pindex, len); 13872 writer->pos += len; 13873 } 13874 13875 /* Pad right with the fill character if needed */ 13876 if (arg->width > len) { 13877 sublen = arg->width - len; 13878 FILL(writer->kind, writer->data, ' ', writer->pos, sublen); 13879 writer->pos += sublen; 13880 } 13881 return 0; 13882} 13883 13884/* Helper of PyUnicode_Format(): format one arg. 
13885 Return 0 on success, raise an exception and return -1 on error. */ 13886static int 13887unicode_format_arg(struct unicode_formatter_t *ctx) 13888{ 13889 struct unicode_format_arg_t arg; 13890 PyObject *str; 13891 int ret; 13892 13893 ret = unicode_format_arg_parse(ctx, &arg); 13894 if (ret == -1) 13895 return -1; 13896 13897 ret = unicode_format_arg_format(ctx, &arg, &str); 13898 if (ret == -1) 13899 return -1; 13900 13901 if (ret != 1) { 13902 ret = unicode_format_arg_output(ctx, &arg, str); 13903 Py_DECREF(str); 13904 if (ret == -1) 13905 return -1; 13906 } 13907 13908 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') { 13909 PyErr_SetString(PyExc_TypeError, 13910 "not all arguments converted during string formatting"); 13911 return -1; 13912 } 13913 return 0; 13914} 13915 13916PyObject * 13917PyUnicode_Format(PyObject *format, PyObject *args) 13918{ 13919 struct unicode_formatter_t ctx; 13920 13921 if (format == NULL || args == NULL) { 13922 PyErr_BadInternalCall(); 13923 return NULL; 13924 } 13925 13926 ctx.fmtstr = PyUnicode_FromObject(format); 13927 if (ctx.fmtstr == NULL) 13928 return NULL; 13929 if (PyUnicode_READY(ctx.fmtstr) == -1) { 13930 Py_DECREF(ctx.fmtstr); 13931 return NULL; 13932 } 13933 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 13934 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 13935 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 13936 ctx.fmtpos = 0; 13937 13938 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100); 13939 13940 if (PyTuple_Check(args)) { 13941 ctx.arglen = PyTuple_Size(args); 13942 ctx.argidx = 0; 13943 } 13944 else { 13945 ctx.arglen = -1; 13946 ctx.argidx = -2; 13947 } 13948 ctx.args_owned = 0; 13949 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) 13950 ctx.dict = args; 13951 else 13952 ctx.dict = NULL; 13953 ctx.args = args; 13954 13955 while (--ctx.fmtcnt >= 0) { 13956 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 13957 Py_ssize_t nonfmtpos, sublen; 13958 
Py_UCS4 maxchar; 13959 13960 nonfmtpos = ctx.fmtpos++; 13961 while (ctx.fmtcnt >= 0 && 13962 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 13963 ctx.fmtpos++; 13964 ctx.fmtcnt--; 13965 } 13966 if (ctx.fmtcnt < 0) { 13967 ctx.fmtpos--; 13968 ctx.writer.overallocate = 0; 13969 } 13970 sublen = ctx.fmtpos - nonfmtpos; 13971 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr, 13972 nonfmtpos, nonfmtpos + sublen); 13973 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1) 13974 goto onError; 13975 13976 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos, 13977 ctx.fmtstr, nonfmtpos, sublen); 13978 ctx.writer.pos += sublen; 13979 } 13980 else { 13981 ctx.fmtpos++; 13982 if (unicode_format_arg(&ctx) == -1) 13983 goto onError; 13984 } 13985 } 13986 13987 if (ctx.argidx < ctx.arglen && !ctx.dict) { 13988 PyErr_SetString(PyExc_TypeError, 13989 "not all arguments converted during string formatting"); 13990 goto onError; 13991 } 13992 13993 if (ctx.args_owned) { 13994 Py_DECREF(ctx.args); 13995 } 13996 Py_DECREF(ctx.fmtstr); 13997 return _PyUnicodeWriter_Finish(&ctx.writer); 13998 13999 onError: 14000 Py_DECREF(ctx.fmtstr); 14001 _PyUnicodeWriter_Dealloc(&ctx.writer); 14002 if (ctx.args_owned) { 14003 Py_DECREF(ctx.args); 14004 } 14005 return NULL; 14006} 14007 14008static PyObject * 14009unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 14010 14011static PyObject * 14012unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14013{ 14014 PyObject *x = NULL; 14015 static char *kwlist[] = {"object", "encoding", "errors", 0}; 14016 char *encoding = NULL; 14017 char *errors = NULL; 14018 14019 if (type != &PyUnicode_Type) 14020 return unicode_subtype_new(type, args, kwds); 14021 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 14022 kwlist, &x, &encoding, &errors)) 14023 return NULL; 14024 if (x == NULL) { 14025 Py_INCREF(unicode_empty); 14026 return unicode_empty; 14027 } 14028 if (encoding == NULL && 
errors == NULL) 14029 return PyObject_Str(x); 14030 else 14031 return PyUnicode_FromEncodedObject(x, encoding, errors); 14032} 14033 14034static PyObject * 14035unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14036{ 14037 PyObject *unicode, *self; 14038 Py_ssize_t length, char_size; 14039 int share_wstr, share_utf8; 14040 unsigned int kind; 14041 void *data; 14042 14043 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 14044 14045 unicode = unicode_new(&PyUnicode_Type, args, kwds); 14046 if (unicode == NULL) 14047 return NULL; 14048 assert(_PyUnicode_CHECK(unicode)); 14049 if (PyUnicode_READY(unicode) == -1) { 14050 Py_DECREF(unicode); 14051 return NULL; 14052 } 14053 14054 self = type->tp_alloc(type, 0); 14055 if (self == NULL) { 14056 Py_DECREF(unicode); 14057 return NULL; 14058 } 14059 kind = PyUnicode_KIND(unicode); 14060 length = PyUnicode_GET_LENGTH(unicode); 14061 14062 _PyUnicode_LENGTH(self) = length; 14063#ifdef Py_DEBUG 14064 _PyUnicode_HASH(self) = -1; 14065#else 14066 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14067#endif 14068 _PyUnicode_STATE(self).interned = 0; 14069 _PyUnicode_STATE(self).kind = kind; 14070 _PyUnicode_STATE(self).compact = 0; 14071 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14072 _PyUnicode_STATE(self).ready = 1; 14073 _PyUnicode_WSTR(self) = NULL; 14074 _PyUnicode_UTF8_LENGTH(self) = 0; 14075 _PyUnicode_UTF8(self) = NULL; 14076 _PyUnicode_WSTR_LENGTH(self) = 0; 14077 _PyUnicode_DATA_ANY(self) = NULL; 14078 14079 share_utf8 = 0; 14080 share_wstr = 0; 14081 if (kind == PyUnicode_1BYTE_KIND) { 14082 char_size = 1; 14083 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14084 share_utf8 = 1; 14085 } 14086 else if (kind == PyUnicode_2BYTE_KIND) { 14087 char_size = 2; 14088 if (sizeof(wchar_t) == 2) 14089 share_wstr = 1; 14090 } 14091 else { 14092 assert(kind == PyUnicode_4BYTE_KIND); 14093 char_size = 4; 14094 if (sizeof(wchar_t) == 4) 14095 share_wstr = 1; 14096 } 14097 14098 /* Ensure we 
won't overflow the length. */ 14099 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14100 PyErr_NoMemory(); 14101 goto onError; 14102 } 14103 data = PyObject_MALLOC((length + 1) * char_size); 14104 if (data == NULL) { 14105 PyErr_NoMemory(); 14106 goto onError; 14107 } 14108 14109 _PyUnicode_DATA_ANY(self) = data; 14110 if (share_utf8) { 14111 _PyUnicode_UTF8_LENGTH(self) = length; 14112 _PyUnicode_UTF8(self) = data; 14113 } 14114 if (share_wstr) { 14115 _PyUnicode_WSTR_LENGTH(self) = length; 14116 _PyUnicode_WSTR(self) = (wchar_t *)data; 14117 } 14118 14119 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14120 kind * (length + 1)); 14121 assert(_PyUnicode_CheckConsistency(self, 1)); 14122#ifdef Py_DEBUG 14123 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14124#endif 14125 Py_DECREF(unicode); 14126 return self; 14127 14128onError: 14129 Py_DECREF(unicode); 14130 Py_DECREF(self); 14131 return NULL; 14132} 14133 14134PyDoc_STRVAR(unicode_doc, 14135"str(object='') -> str\n\ 14136str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 14137\n\ 14138Create a new string object from the given object. 
If encoding or\n\ 14139errors is specified, then the object must expose a data buffer\n\ 14140that will be decoded using the given encoding and error handler.\n\ 14141Otherwise, returns the result of object.__str__() (if defined)\n\ 14142or repr(object).\n\ 14143encoding defaults to sys.getdefaultencoding().\n\ 14144errors defaults to 'strict'."); 14145 14146static PyObject *unicode_iter(PyObject *seq); 14147 14148PyTypeObject PyUnicode_Type = { 14149 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14150 "str", /* tp_name */ 14151 sizeof(PyUnicodeObject), /* tp_size */ 14152 0, /* tp_itemsize */ 14153 /* Slots */ 14154 (destructor)unicode_dealloc, /* tp_dealloc */ 14155 0, /* tp_print */ 14156 0, /* tp_getattr */ 14157 0, /* tp_setattr */ 14158 0, /* tp_reserved */ 14159 unicode_repr, /* tp_repr */ 14160 &unicode_as_number, /* tp_as_number */ 14161 &unicode_as_sequence, /* tp_as_sequence */ 14162 &unicode_as_mapping, /* tp_as_mapping */ 14163 (hashfunc) unicode_hash, /* tp_hash*/ 14164 0, /* tp_call*/ 14165 (reprfunc) unicode_str, /* tp_str */ 14166 PyObject_GenericGetAttr, /* tp_getattro */ 14167 0, /* tp_setattro */ 14168 0, /* tp_as_buffer */ 14169 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14170 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14171 unicode_doc, /* tp_doc */ 14172 0, /* tp_traverse */ 14173 0, /* tp_clear */ 14174 PyUnicode_RichCompare, /* tp_richcompare */ 14175 0, /* tp_weaklistoffset */ 14176 unicode_iter, /* tp_iter */ 14177 0, /* tp_iternext */ 14178 unicode_methods, /* tp_methods */ 14179 0, /* tp_members */ 14180 0, /* tp_getset */ 14181 &PyBaseObject_Type, /* tp_base */ 14182 0, /* tp_dict */ 14183 0, /* tp_descr_get */ 14184 0, /* tp_descr_set */ 14185 0, /* tp_dictoffset */ 14186 0, /* tp_init */ 14187 0, /* tp_alloc */ 14188 unicode_new, /* tp_new */ 14189 PyObject_Del, /* tp_free */ 14190}; 14191 14192/* Initialize the Unicode implementation */ 14193 14194int _PyUnicode_Init(void) 14195{ 14196 int i; 14197 14198 /* XXX - move this array to 
unicodectype.c ? */ 14199 Py_UCS2 linebreak[] = { 14200 0x000A, /* LINE FEED */ 14201 0x000D, /* CARRIAGE RETURN */ 14202 0x001C, /* FILE SEPARATOR */ 14203 0x001D, /* GROUP SEPARATOR */ 14204 0x001E, /* RECORD SEPARATOR */ 14205 0x0085, /* NEXT LINE */ 14206 0x2028, /* LINE SEPARATOR */ 14207 0x2029, /* PARAGRAPH SEPARATOR */ 14208 }; 14209 14210 /* Init the implementation */ 14211 unicode_empty = PyUnicode_New(0, 0); 14212 if (!unicode_empty) 14213 Py_FatalError("Can't create empty string"); 14214 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); 14215 14216 for (i = 0; i < 256; i++) 14217 unicode_latin1[i] = NULL; 14218 if (PyType_Ready(&PyUnicode_Type) < 0) 14219 Py_FatalError("Can't initialize 'unicode'"); 14220 14221 /* initialize the linebreak bloom filter */ 14222 bloom_linebreak = make_bloom_mask( 14223 PyUnicode_2BYTE_KIND, linebreak, 14224 Py_ARRAY_LENGTH(linebreak)); 14225 14226 PyType_Ready(&EncodingMapType); 14227 14228 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 14229 Py_FatalError("Can't initialize field name iterator type"); 14230 14231 if (PyType_Ready(&PyFormatterIter_Type) < 0) 14232 Py_FatalError("Can't initialize formatter iter type"); 14233 14234#ifdef HAVE_MBCS 14235 winver.dwOSVersionInfoSize = sizeof(winver); 14236 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 14237 PyErr_SetFromWindowsErr(0); 14238 return -1; 14239 } 14240#endif 14241 return 0; 14242} 14243 14244/* Finalize the Unicode implementation */ 14245 14246int 14247PyUnicode_ClearFreeList(void) 14248{ 14249 return 0; 14250} 14251 14252void 14253_PyUnicode_Fini(void) 14254{ 14255 int i; 14256 14257 Py_XDECREF(unicode_empty); 14258 unicode_empty = NULL; 14259 14260 for (i = 0; i < 256; i++) { 14261 if (unicode_latin1[i]) { 14262 Py_DECREF(unicode_latin1[i]); 14263 unicode_latin1[i] = NULL; 14264 } 14265 } 14266 _PyUnicode_ClearStaticStrings(); 14267 (void)PyUnicode_ClearFreeList(); 14268} 14269 14270void 14271PyUnicode_InternInPlace(PyObject **p) 14272{ 14273 register PyObject 
*s = *p; 14274 PyObject *t; 14275#ifdef Py_DEBUG 14276 assert(s != NULL); 14277 assert(_PyUnicode_CHECK(s)); 14278#else 14279 if (s == NULL || !PyUnicode_Check(s)) 14280 return; 14281#endif 14282 /* If it's a subclass, we don't really know what putting 14283 it in the interned dict might do. */ 14284 if (!PyUnicode_CheckExact(s)) 14285 return; 14286 if (PyUnicode_CHECK_INTERNED(s)) 14287 return; 14288 if (interned == NULL) { 14289 interned = PyDict_New(); 14290 if (interned == NULL) { 14291 PyErr_Clear(); /* Don't leave an exception */ 14292 return; 14293 } 14294 } 14295 /* It might be that the GetItem call fails even 14296 though the key is present in the dictionary, 14297 namely when this happens during a stack overflow. */ 14298 Py_ALLOW_RECURSION 14299 t = PyDict_GetItem(interned, s); 14300 Py_END_ALLOW_RECURSION 14301 14302 if (t) { 14303 Py_INCREF(t); 14304 Py_DECREF(*p); 14305 *p = t; 14306 return; 14307 } 14308 14309 PyThreadState_GET()->recursion_critical = 1; 14310 if (PyDict_SetItem(interned, s, s) < 0) { 14311 PyErr_Clear(); 14312 PyThreadState_GET()->recursion_critical = 0; 14313 return; 14314 } 14315 PyThreadState_GET()->recursion_critical = 0; 14316 /* The two references in interned are not counted by refcnt. 
14317 The deallocator will take care of this */ 14318 Py_REFCNT(s) -= 2; 14319 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 14320} 14321 14322void 14323PyUnicode_InternImmortal(PyObject **p) 14324{ 14325 PyUnicode_InternInPlace(p); 14326 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 14327 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 14328 Py_INCREF(*p); 14329 } 14330} 14331 14332PyObject * 14333PyUnicode_InternFromString(const char *cp) 14334{ 14335 PyObject *s = PyUnicode_FromString(cp); 14336 if (s == NULL) 14337 return NULL; 14338 PyUnicode_InternInPlace(&s); 14339 return s; 14340} 14341 14342void 14343_Py_ReleaseInternedUnicodeStrings(void) 14344{ 14345 PyObject *keys; 14346 PyObject *s; 14347 Py_ssize_t i, n; 14348 Py_ssize_t immortal_size = 0, mortal_size = 0; 14349 14350 if (interned == NULL || !PyDict_Check(interned)) 14351 return; 14352 keys = PyDict_Keys(interned); 14353 if (keys == NULL || !PyList_Check(keys)) { 14354 PyErr_Clear(); 14355 return; 14356 } 14357 14358 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 14359 detector, interned unicode strings are not forcibly deallocated; 14360 rather, we give them their stolen references back, and then clear 14361 and DECREF the interned dict. 
*/ 14362 14363 n = PyList_GET_SIZE(keys); 14364 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 14365 n); 14366 for (i = 0; i < n; i++) { 14367 s = PyList_GET_ITEM(keys, i); 14368 if (PyUnicode_READY(s) == -1) { 14369 assert(0 && "could not ready string"); 14370 fprintf(stderr, "could not ready string\n"); 14371 } 14372 switch (PyUnicode_CHECK_INTERNED(s)) { 14373 case SSTATE_NOT_INTERNED: 14374 /* XXX Shouldn't happen */ 14375 break; 14376 case SSTATE_INTERNED_IMMORTAL: 14377 Py_REFCNT(s) += 1; 14378 immortal_size += PyUnicode_GET_LENGTH(s); 14379 break; 14380 case SSTATE_INTERNED_MORTAL: 14381 Py_REFCNT(s) += 2; 14382 mortal_size += PyUnicode_GET_LENGTH(s); 14383 break; 14384 default: 14385 Py_FatalError("Inconsistent interned string state."); 14386 } 14387 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 14388 } 14389 fprintf(stderr, "total size of all interned strings: " 14390 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 14391 "mortal/immortal\n", mortal_size, immortal_size); 14392 Py_DECREF(keys); 14393 PyDict_Clear(interned); 14394 Py_DECREF(interned); 14395 interned = NULL; 14396} 14397 14398 14399/********************* Unicode Iterator **************************/ 14400 14401typedef struct { 14402 PyObject_HEAD 14403 Py_ssize_t it_index; 14404 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 14405} unicodeiterobject; 14406 14407static void 14408unicodeiter_dealloc(unicodeiterobject *it) 14409{ 14410 _PyObject_GC_UNTRACK(it); 14411 Py_XDECREF(it->it_seq); 14412 PyObject_GC_Del(it); 14413} 14414 14415static int 14416unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 14417{ 14418 Py_VISIT(it->it_seq); 14419 return 0; 14420} 14421 14422static PyObject * 14423unicodeiter_next(unicodeiterobject *it) 14424{ 14425 PyObject *seq, *item; 14426 14427 assert(it != NULL); 14428 seq = it->it_seq; 14429 if (seq == NULL) 14430 return NULL; 14431 assert(_PyUnicode_CHECK(seq)); 14432 14433 if (it->it_index < 
PyUnicode_GET_LENGTH(seq)) { 14434 int kind = PyUnicode_KIND(seq); 14435 void *data = PyUnicode_DATA(seq); 14436 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 14437 item = PyUnicode_FromOrdinal(chr); 14438 if (item != NULL) 14439 ++it->it_index; 14440 return item; 14441 } 14442 14443 Py_DECREF(seq); 14444 it->it_seq = NULL; 14445 return NULL; 14446} 14447 14448static PyObject * 14449unicodeiter_len(unicodeiterobject *it) 14450{ 14451 Py_ssize_t len = 0; 14452 if (it->it_seq) 14453 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 14454 return PyLong_FromSsize_t(len); 14455} 14456 14457PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 14458 14459static PyObject * 14460unicodeiter_reduce(unicodeiterobject *it) 14461{ 14462 if (it->it_seq != NULL) { 14463 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 14464 it->it_seq, it->it_index); 14465 } else { 14466 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 14467 if (u == NULL) 14468 return NULL; 14469 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 14470 } 14471} 14472 14473PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 14474 14475static PyObject * 14476unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 14477{ 14478 Py_ssize_t index = PyLong_AsSsize_t(state); 14479 if (index == -1 && PyErr_Occurred()) 14480 return NULL; 14481 if (index < 0) 14482 index = 0; 14483 it->it_index = index; 14484 Py_RETURN_NONE; 14485} 14486 14487PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 14488 14489static PyMethodDef unicodeiter_methods[] = { 14490 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 14491 length_hint_doc}, 14492 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 14493 reduce_doc}, 14494 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 14495 setstate_doc}, 14496 {NULL, NULL} /* sentinel */ 14497}; 14498 14499PyTypeObject PyUnicodeIter_Type = { 14500 
PyVarObject_HEAD_INIT(&PyType_Type, 0) 14501 "str_iterator", /* tp_name */ 14502 sizeof(unicodeiterobject), /* tp_basicsize */ 14503 0, /* tp_itemsize */ 14504 /* methods */ 14505 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 14506 0, /* tp_print */ 14507 0, /* tp_getattr */ 14508 0, /* tp_setattr */ 14509 0, /* tp_reserved */ 14510 0, /* tp_repr */ 14511 0, /* tp_as_number */ 14512 0, /* tp_as_sequence */ 14513 0, /* tp_as_mapping */ 14514 0, /* tp_hash */ 14515 0, /* tp_call */ 14516 0, /* tp_str */ 14517 PyObject_GenericGetAttr, /* tp_getattro */ 14518 0, /* tp_setattro */ 14519 0, /* tp_as_buffer */ 14520 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 14521 0, /* tp_doc */ 14522 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 14523 0, /* tp_clear */ 14524 0, /* tp_richcompare */ 14525 0, /* tp_weaklistoffset */ 14526 PyObject_SelfIter, /* tp_iter */ 14527 (iternextfunc)unicodeiter_next, /* tp_iternext */ 14528 unicodeiter_methods, /* tp_methods */ 14529 0, 14530}; 14531 14532static PyObject * 14533unicode_iter(PyObject *seq) 14534{ 14535 unicodeiterobject *it; 14536 14537 if (!PyUnicode_Check(seq)) { 14538 PyErr_BadInternalCall(); 14539 return NULL; 14540 } 14541 if (PyUnicode_READY(seq) == -1) 14542 return NULL; 14543 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 14544 if (it == NULL) 14545 return NULL; 14546 it->it_index = 0; 14547 Py_INCREF(seq); 14548 it->it_seq = seq; 14549 _PyObject_GC_TRACK(it); 14550 return (PyObject *)it; 14551} 14552 14553 14554size_t 14555Py_UNICODE_strlen(const Py_UNICODE *u) 14556{ 14557 int res = 0; 14558 while(*u++) 14559 res++; 14560 return res; 14561} 14562 14563Py_UNICODE* 14564Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 14565{ 14566 Py_UNICODE *u = s1; 14567 while ((*u++ = *s2++)); 14568 return s1; 14569} 14570 14571Py_UNICODE* 14572Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14573{ 14574 Py_UNICODE *u = s1; 14575 while ((*u++ = *s2++)) 14576 if (n-- == 0) 
14577 break; 14578 return s1; 14579} 14580 14581Py_UNICODE* 14582Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 14583{ 14584 Py_UNICODE *u1 = s1; 14585 u1 += Py_UNICODE_strlen(u1); 14586 Py_UNICODE_strcpy(u1, s2); 14587 return s1; 14588} 14589 14590int 14591Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 14592{ 14593 while (*s1 && *s2 && *s1 == *s2) 14594 s1++, s2++; 14595 if (*s1 && *s2) 14596 return (*s1 < *s2) ? -1 : +1; 14597 if (*s1) 14598 return 1; 14599 if (*s2) 14600 return -1; 14601 return 0; 14602} 14603 14604int 14605Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14606{ 14607 register Py_UNICODE u1, u2; 14608 for (; n != 0; n--) { 14609 u1 = *s1; 14610 u2 = *s2; 14611 if (u1 != u2) 14612 return (u1 < u2) ? -1 : +1; 14613 if (u1 == '\0') 14614 return 0; 14615 s1++; 14616 s2++; 14617 } 14618 return 0; 14619} 14620 14621Py_UNICODE* 14622Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 14623{ 14624 const Py_UNICODE *p; 14625 for (p = s; *p; p++) 14626 if (*p == c) 14627 return (Py_UNICODE*)p; 14628 return NULL; 14629} 14630 14631Py_UNICODE* 14632Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 14633{ 14634 const Py_UNICODE *p; 14635 p = s + Py_UNICODE_strlen(s); 14636 while (p != s) { 14637 p--; 14638 if (*p == c) 14639 return (Py_UNICODE*)p; 14640 } 14641 return NULL; 14642} 14643 14644Py_UNICODE* 14645PyUnicode_AsUnicodeCopy(PyObject *unicode) 14646{ 14647 Py_UNICODE *u, *copy; 14648 Py_ssize_t len, size; 14649 14650 if (!PyUnicode_Check(unicode)) { 14651 PyErr_BadArgument(); 14652 return NULL; 14653 } 14654 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 14655 if (u == NULL) 14656 return NULL; 14657 /* Ensure we won't overflow the size. 
*/ 14658 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 14659 PyErr_NoMemory(); 14660 return NULL; 14661 } 14662 size = len + 1; /* copy the null character */ 14663 size *= sizeof(Py_UNICODE); 14664 copy = PyMem_Malloc(size); 14665 if (copy == NULL) { 14666 PyErr_NoMemory(); 14667 return NULL; 14668 } 14669 memcpy(copy, u, size); 14670 return copy; 14671} 14672 14673/* A _string module, to export formatter_parser and formatter_field_name_split 14674 to the string.Formatter class implemented in Python. */ 14675 14676static PyMethodDef _string_methods[] = { 14677 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 14678 METH_O, PyDoc_STR("split the argument as a field name")}, 14679 {"formatter_parser", (PyCFunction) formatter_parser, 14680 METH_O, PyDoc_STR("parse the argument as a format string")}, 14681 {NULL, NULL} 14682}; 14683 14684static struct PyModuleDef _string_module = { 14685 PyModuleDef_HEAD_INIT, 14686 "_string", 14687 PyDoc_STR("string helper module"), 14688 0, 14689 _string_methods, 14690 NULL, 14691 NULL, 14692 NULL, 14693 NULL 14694}; 14695 14696PyMODINIT_FUNC 14697PyInit__string(void) 14698{ 14699 return PyModule_Create(&_string_module); 14700} 14701 14702 14703#ifdef __cplusplus 14704} 14705#endif 14706