unicodeobject.c revision 6d5ad227a50c6c5a78e48a98095788953ab49512
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/* Endianness switches; defaults to little endian */ 51 52#ifdef WORDS_BIGENDIAN 53# define BYTEORDER_IS_BIG_ENDIAN 54#else 55# define BYTEORDER_IS_LITTLE_ENDIAN 56#endif 57 58/* --- Globals ------------------------------------------------------------ 59 60 The globals are initialized by the _PyUnicode_Init() API and should 61 not be used before calling that API. 62 63*/ 64 65 66#ifdef __cplusplus 67extern "C" { 68#endif 69 70/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 71#define MAX_UNICODE 0x10ffff 72 73#ifdef Py_DEBUG 74# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 75#else 76# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 77#endif 78 79#define _PyUnicode_UTF8(op) \ 80 (((PyCompactUnicodeObject*)(op))->utf8) 81#define PyUnicode_UTF8(op) \ 82 (assert(_PyUnicode_CHECK(op)), \ 83 assert(PyUnicode_IS_READY(op)), \ 84 PyUnicode_IS_COMPACT_ASCII(op) ? \ 85 ((char*)((PyASCIIObject*)(op) + 1)) : \ 86 _PyUnicode_UTF8(op)) 87#define _PyUnicode_UTF8_LENGTH(op) \ 88 (((PyCompactUnicodeObject*)(op))->utf8_length) 89#define PyUnicode_UTF8_LENGTH(op) \ 90 (assert(_PyUnicode_CHECK(op)), \ 91 assert(PyUnicode_IS_READY(op)), \ 92 PyUnicode_IS_COMPACT_ASCII(op) ? \ 93 ((PyASCIIObject*)(op))->length : \ 94 _PyUnicode_UTF8_LENGTH(op)) 95#define _PyUnicode_WSTR(op) \ 96 (((PyASCIIObject*)(op))->wstr) 97#define _PyUnicode_WSTR_LENGTH(op) \ 98 (((PyCompactUnicodeObject*)(op))->wstr_length) 99#define _PyUnicode_LENGTH(op) \ 100 (((PyASCIIObject *)(op))->length) 101#define _PyUnicode_STATE(op) \ 102 (((PyASCIIObject *)(op))->state) 103#define _PyUnicode_HASH(op) \ 104 (((PyASCIIObject *)(op))->hash) 105#define _PyUnicode_KIND(op) \ 106 (assert(_PyUnicode_CHECK(op)), \ 107 ((PyASCIIObject *)(op))->state.kind) 108#define _PyUnicode_GET_LENGTH(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 ((PyASCIIObject *)(op))->length) 111#define _PyUnicode_DATA_ANY(op) \ 112 (((PyUnicodeObject*)(op))->data.any) 113 114/* Optimized version of Py_MAX() to compute the maximum character: 115 use it when your are computing the second argument of PyUnicode_New() */ 116#define MAX_MAXCHAR(maxchar1, maxchar2) \ 117 ((maxchar1) | (maxchar2)) 118 119#undef PyUnicode_READY 120#define PyUnicode_READY(op) \ 121 (assert(_PyUnicode_CHECK(op)), \ 122 (PyUnicode_IS_READY(op) ? \ 123 0 : \ 124 _PyUnicode_Ready(op))) 125 126#define _PyUnicode_SHARE_UTF8(op) \ 127 (assert(_PyUnicode_CHECK(op)), \ 128 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 129 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 130#define _PyUnicode_SHARE_WSTR(op) \ 131 (assert(_PyUnicode_CHECK(op)), \ 132 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 133 134/* true if the Unicode object has an allocated UTF-8 memory block 135 (not shared with other data) */ 136#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 137 (assert(_PyUnicode_CHECK(op)), \ 138 (!PyUnicode_IS_COMPACT_ASCII(op) \ 139 && _PyUnicode_UTF8(op) \ 140 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 141 142/* true if the Unicode object has an allocated wstr memory block 143 (not shared with other data) */ 144#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 145 (assert(_PyUnicode_CHECK(op)), \ 146 (_PyUnicode_WSTR(op) && \ 147 (!PyUnicode_IS_READY(op) || \ 148 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 149 150/* Generic helper macro to convert characters of different types. 151 from_type and to_type have to be valid type names, begin and end 152 are pointers to the source characters which should be of type 153 "from_type *". to is a pointer of type "to_type *" and points to the 154 buffer where the result characters are written to. */ 155#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 156 do { \ 157 to_type *_to = (to_type *) to; \ 158 const from_type *_iter = (begin); \ 159 const from_type *_end = (end); \ 160 Py_ssize_t n = (_end) - (_iter); \ 161 const from_type *_unrolled_end = \ 162 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 163 while (_iter < (_unrolled_end)) { \ 164 _to[0] = (to_type) _iter[0]; \ 165 _to[1] = (to_type) _iter[1]; \ 166 _to[2] = (to_type) _iter[2]; \ 167 _to[3] = (to_type) _iter[3]; \ 168 _iter += 4; _to += 4; \ 169 } \ 170 while (_iter < (_end)) \ 171 *_to++ = (to_type) *_iter++; \ 172 } while (0) 173 174/* This dictionary holds all interned unicode strings. Note that references 175 to strings in this dictionary are *not* counted in the string's ob_refcnt. 176 When the interned string reaches a refcnt of 0 the string deallocation 177 function will delete the reference from this dictionary. 178 179 Another way to look at this is that to say that the actual reference 180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 181*/ 182static PyObject *interned; 183 184/* The empty Unicode object is shared to improve performance. */ 185static PyObject *unicode_empty; 186 187/* List of static strings. */ 188static _Py_Identifier *static_strings; 189 190/* Single character Unicode strings in the Latin-1 range are being 191 shared as well. */ 192static PyObject *unicode_latin1[256]; 193 194/* Fast detection of the most frequent whitespace characters */ 195const unsigned char _Py_ascii_whitespace[] = { 196 0, 0, 0, 0, 0, 0, 0, 0, 197/* case 0x0009: * CHARACTER TABULATION */ 198/* case 0x000A: * LINE FEED */ 199/* case 0x000B: * LINE TABULATION */ 200/* case 0x000C: * FORM FEED */ 201/* case 0x000D: * CARRIAGE RETURN */ 202 0, 1, 1, 1, 1, 1, 0, 0, 203 0, 0, 0, 0, 0, 0, 0, 0, 204/* case 0x001C: * FILE SEPARATOR */ 205/* case 0x001D: * GROUP SEPARATOR */ 206/* case 0x001E: * RECORD SEPARATOR */ 207/* case 0x001F: * UNIT SEPARATOR */ 208 0, 0, 0, 0, 1, 1, 1, 1, 209/* case 0x0020: * SPACE */ 210 1, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0, 212 0, 0, 0, 0, 0, 0, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0 223}; 224 225/* forward */ 226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 227static PyObject* get_latin1_char(unsigned char ch); 228static int unicode_modifiable(PyObject *unicode); 229 230 231static PyObject * 232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); 233static PyObject * 234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 235static PyObject * 236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 237 238static PyObject * 239unicode_encode_call_errorhandler(const char *errors, 240 PyObject **errorHandler,const char *encoding, const char *reason, 241 PyObject *unicode, PyObject **exceptionObject, 242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 243 244static void 245raise_encode_exception(PyObject **exceptionObject, 246 const char *encoding, 247 PyObject *unicode, 248 Py_ssize_t startpos, Py_ssize_t endpos, 249 const char *reason); 250 251/* Same for linebreaks */ 252static unsigned char ascii_linebreak[] = { 253 0, 0, 0, 0, 0, 0, 0, 0, 254/* 0x000A, * LINE FEED */ 255/* 0x000B, * LINE TABULATION */ 256/* 0x000C, * FORM FEED */ 257/* 0x000D, * CARRIAGE RETURN */ 258 0, 0, 1, 1, 1, 1, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 260/* 0x001C, * FILE SEPARATOR */ 261/* 0x001D, * GROUP SEPARATOR */ 262/* 0x001E, * RECORD SEPARATOR */ 263 0, 0, 0, 0, 1, 1, 1, 0, 264 0, 0, 0, 0, 0, 0, 0, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 269 0, 0, 0, 0, 0, 0, 0, 0, 270 0, 0, 0, 0, 0, 0, 0, 0, 271 0, 0, 0, 0, 0, 0, 0, 0, 272 0, 0, 0, 0, 0, 0, 0, 0, 273 0, 0, 0, 0, 0, 0, 0, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275 0, 0, 0, 0, 0, 0, 0, 0, 276 0, 0, 0, 0, 0, 0, 0, 0 277}; 278 279/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 280 This function is kept for backward compatibility with the old API. */ 281Py_UNICODE 282PyUnicode_GetMax(void) 283{ 284#ifdef Py_UNICODE_WIDE 285 return 0x10FFFF; 286#else 287 /* This is actually an illegal character, so it should 288 not be passed to unichr. */ 289 return 0xFFFF; 290#endif 291} 292 293#ifdef Py_DEBUG 294int 295_PyUnicode_CheckConsistency(PyObject *op, int check_content) 296{ 297 PyASCIIObject *ascii; 298 unsigned int kind; 299 300 assert(PyUnicode_Check(op)); 301 302 ascii = (PyASCIIObject *)op; 303 kind = ascii->state.kind; 304 305 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 306 assert(kind == PyUnicode_1BYTE_KIND); 307 assert(ascii->state.ready == 1); 308 } 309 else { 310 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 311 void *data; 312 313 if (ascii->state.compact == 1) { 314 data = compact + 1; 315 assert(kind == PyUnicode_1BYTE_KIND 316 || kind == PyUnicode_2BYTE_KIND 317 || kind == PyUnicode_4BYTE_KIND); 318 assert(ascii->state.ascii == 0); 319 assert(ascii->state.ready == 1); 320 assert (compact->utf8 != data); 321 } 322 else { 323 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 324 325 data = unicode->data.any; 326 if (kind == PyUnicode_WCHAR_KIND) { 327 assert(ascii->length == 0); 328 assert(ascii->hash == -1); 329 assert(ascii->state.compact == 0); 330 assert(ascii->state.ascii == 0); 331 assert(ascii->state.ready == 0); 332 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 333 assert(ascii->wstr != NULL); 334 assert(data == NULL); 335 assert(compact->utf8 == NULL); 336 } 337 else { 338 assert(kind == PyUnicode_1BYTE_KIND 339 || kind == PyUnicode_2BYTE_KIND 340 || kind == PyUnicode_4BYTE_KIND); 341 assert(ascii->state.compact == 0); 342 assert(ascii->state.ready == 1); 343 assert(data != NULL); 344 if (ascii->state.ascii) { 345 assert (compact->utf8 == data); 346 assert (compact->utf8_length == ascii->length); 347 } 348 else 349 assert (compact->utf8 != data); 350 } 351 } 352 if (kind != PyUnicode_WCHAR_KIND) { 353 if ( 354#if SIZEOF_WCHAR_T == 2 355 kind == PyUnicode_2BYTE_KIND 356#else 357 kind == PyUnicode_4BYTE_KIND 358#endif 359 ) 360 { 361 assert(ascii->wstr == data); 362 assert(compact->wstr_length == ascii->length); 363 } else 364 assert(ascii->wstr != data); 365 } 366 367 if (compact->utf8 == NULL) 368 assert(compact->utf8_length == 0); 369 if (ascii->wstr == NULL) 370 assert(compact->wstr_length == 0); 371 } 372 /* check that the best kind is used */ 373 if (check_content && kind != PyUnicode_WCHAR_KIND) 374 { 375 Py_ssize_t i; 376 Py_UCS4 maxchar = 0; 377 void *data; 378 Py_UCS4 ch; 379 380 data = PyUnicode_DATA(ascii); 381 for (i=0; i < ascii->length; i++) 382 { 383 ch = PyUnicode_READ(kind, data, i); 384 if (ch > maxchar) 385 maxchar = ch; 386 } 387 if (kind == PyUnicode_1BYTE_KIND) { 388 if (ascii->state.ascii == 0) { 389 assert(maxchar >= 128); 390 assert(maxchar <= 255); 391 } 392 else 393 assert(maxchar < 128); 394 } 395 else if (kind == PyUnicode_2BYTE_KIND) { 396 assert(maxchar >= 0x100); 397 assert(maxchar <= 0xFFFF); 398 } 399 else { 400 assert(maxchar >= 0x10000); 401 assert(maxchar <= MAX_UNICODE); 402 } 403 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 404 } 405 return 1; 406} 407#endif 408 409static PyObject* 410unicode_result_wchar(PyObject *unicode) 411{ 412#ifndef Py_DEBUG 413 Py_ssize_t len; 414 415 assert(Py_REFCNT(unicode) == 1); 416 417 len = _PyUnicode_WSTR_LENGTH(unicode); 418 if (len == 0) { 419 Py_INCREF(unicode_empty); 420 Py_DECREF(unicode); 421 return unicode_empty; 422 } 423 424 if (len == 1) { 425 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 426 if (ch < 256) { 427 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 428 Py_DECREF(unicode); 429 return latin1_char; 430 } 431 } 432 433 if (_PyUnicode_Ready(unicode) < 0) { 434 Py_XDECREF(unicode); 435 return NULL; 436 } 437#else 438 /* don't make the result ready in debug mode to ensure that the caller 439 makes the string ready before using it */ 440 assert(_PyUnicode_CheckConsistency(unicode, 1)); 441#endif 442 return unicode; 443} 444 445static PyObject* 446unicode_result_ready(PyObject *unicode) 447{ 448 Py_ssize_t length; 449 450 length = PyUnicode_GET_LENGTH(unicode); 451 if (length == 0) { 452 if (unicode != unicode_empty) { 453 Py_INCREF(unicode_empty); 454 Py_DECREF(unicode); 455 } 456 return unicode_empty; 457 } 458 459 if (length == 1) { 460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 461 if (ch < 256) { 462 PyObject *latin1_char = unicode_latin1[ch]; 463 if (latin1_char != NULL) { 464 if (unicode != latin1_char) { 465 Py_INCREF(latin1_char); 466 Py_DECREF(unicode); 467 } 468 return latin1_char; 469 } 470 else { 471 assert(_PyUnicode_CheckConsistency(unicode, 1)); 472 Py_INCREF(unicode); 473 unicode_latin1[ch] = unicode; 474 return unicode; 475 } 476 } 477 } 478 479 assert(_PyUnicode_CheckConsistency(unicode, 1)); 480 return unicode; 481} 482 483static PyObject* 484unicode_result(PyObject *unicode) 485{ 486 assert(_PyUnicode_CHECK(unicode)); 487 if (PyUnicode_IS_READY(unicode)) 488 return unicode_result_ready(unicode); 489 else 490 return unicode_result_wchar(unicode); 491} 492 493static PyObject* 494unicode_result_unchanged(PyObject *unicode) 495{ 496 if (PyUnicode_CheckExact(unicode)) { 497 if (PyUnicode_READY(unicode) == -1) 498 return NULL; 499 Py_INCREF(unicode); 500 return unicode; 501 } 502 else 503 /* Subtype -- return genuine unicode string with the same value. */ 504 return _PyUnicode_Copy(unicode); 505} 506 507#ifdef HAVE_MBCS 508static OSVERSIONINFOEX winver; 509#endif 510 511/* --- Bloom Filters ----------------------------------------------------- */ 512 513/* stuff to implement simple "bloom filters" for Unicode characters. 514 to keep things simple, we use a single bitmask, using the least 5 515 bits from each unicode characters as the bit index. */ 516 517/* the linebreak mask is set up by Unicode_Init below */ 518 519#if LONG_BIT >= 128 520#define BLOOM_WIDTH 128 521#elif LONG_BIT >= 64 522#define BLOOM_WIDTH 64 523#elif LONG_BIT >= 32 524#define BLOOM_WIDTH 32 525#else 526#error "LONG_BIT is smaller than 32" 527#endif 528 529#define BLOOM_MASK unsigned long 530 531static BLOOM_MASK bloom_linebreak; 532 533#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 534#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 535 536#define BLOOM_LINEBREAK(ch) \ 537 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 538 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 539 540Py_LOCAL_INLINE(BLOOM_MASK) 541make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 542{ 543 /* calculate simple bloom-style bitmask for a given unicode string */ 544 545 BLOOM_MASK mask; 546 Py_ssize_t i; 547 548 mask = 0; 549 for (i = 0; i < len; i++) 550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 551 552 return mask; 553} 554 555#define BLOOM_MEMBER(mask, chr, str) \ 556 (BLOOM(mask, chr) \ 557 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 558 559/* Compilation of templated routines */ 560 561#include "stringlib/asciilib.h" 562#include "stringlib/fastsearch.h" 563#include "stringlib/partition.h" 564#include "stringlib/split.h" 565#include "stringlib/count.h" 566#include "stringlib/find.h" 567#include "stringlib/find_max_char.h" 568#include "stringlib/localeutil.h" 569#include "stringlib/undef.h" 570 571#include "stringlib/ucs1lib.h" 572#include "stringlib/fastsearch.h" 573#include "stringlib/partition.h" 574#include "stringlib/split.h" 575#include "stringlib/count.h" 576#include "stringlib/find.h" 577#include "stringlib/find_max_char.h" 578#include "stringlib/localeutil.h" 579#include "stringlib/undef.h" 580 581#include "stringlib/ucs2lib.h" 582#include "stringlib/fastsearch.h" 583#include "stringlib/partition.h" 584#include "stringlib/split.h" 585#include "stringlib/count.h" 586#include "stringlib/find.h" 587#include "stringlib/find_max_char.h" 588#include "stringlib/localeutil.h" 589#include "stringlib/undef.h" 590 591#include "stringlib/ucs4lib.h" 592#include "stringlib/fastsearch.h" 593#include "stringlib/partition.h" 594#include "stringlib/split.h" 595#include "stringlib/count.h" 596#include "stringlib/find.h" 597#include "stringlib/find_max_char.h" 598#include "stringlib/localeutil.h" 599#include "stringlib/undef.h" 600 601#include "stringlib/unicodedefs.h" 602#include "stringlib/fastsearch.h" 603#include "stringlib/count.h" 604#include "stringlib/find.h" 605#include "stringlib/undef.h" 606 607/* --- Unicode Object ----------------------------------------------------- */ 608 609static PyObject * 610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 611 612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 613 Py_ssize_t size, Py_UCS4 ch, 614 int direction) 615{ 616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; 617 618 switch (kind) { 619 case PyUnicode_1BYTE_KIND: 620 { 621 Py_UCS1 ch1 = (Py_UCS1) ch; 622 if (ch1 == ch) 623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 624 else 625 return -1; 626 } 627 case PyUnicode_2BYTE_KIND: 628 { 629 Py_UCS2 ch2 = (Py_UCS2) ch; 630 if (ch2 == ch) 631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 632 else 633 return -1; 634 } 635 case PyUnicode_4BYTE_KIND: 636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 637 default: 638 assert(0); 639 return -1; 640 } 641} 642 643static PyObject* 644resize_compact(PyObject *unicode, Py_ssize_t length) 645{ 646 Py_ssize_t char_size; 647 Py_ssize_t struct_size; 648 Py_ssize_t new_size; 649 int share_wstr; 650 PyObject *new_unicode; 651 assert(unicode_modifiable(unicode)); 652 assert(PyUnicode_IS_READY(unicode)); 653 assert(PyUnicode_IS_COMPACT(unicode)); 654 655 char_size = PyUnicode_KIND(unicode); 656 if (PyUnicode_IS_ASCII(unicode)) 657 struct_size = sizeof(PyASCIIObject); 658 else 659 struct_size = sizeof(PyCompactUnicodeObject); 660 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 661 662 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 663 PyErr_NoMemory(); 664 return NULL; 665 } 666 new_size = (struct_size + (length + 1) * char_size); 667 668 _Py_DEC_REFTOTAL; 669 _Py_ForgetReference(unicode); 670 671 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 672 if (new_unicode == NULL) { 673 _Py_NewReference(unicode); 674 PyErr_NoMemory(); 675 return NULL; 676 } 677 unicode = new_unicode; 678 _Py_NewReference(unicode); 679 680 _PyUnicode_LENGTH(unicode) = length; 681 if (share_wstr) { 682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 683 if (!PyUnicode_IS_ASCII(unicode)) 684 _PyUnicode_WSTR_LENGTH(unicode) = length; 685 } 686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 687 length, 0); 688 assert(_PyUnicode_CheckConsistency(unicode, 0)); 689 return unicode; 690} 691 692static int 693resize_inplace(PyObject *unicode, Py_ssize_t length) 694{ 695 wchar_t *wstr; 696 Py_ssize_t new_size; 697 assert(!PyUnicode_IS_COMPACT(unicode)); 698 assert(Py_REFCNT(unicode) == 1); 699 700 if (PyUnicode_IS_READY(unicode)) { 701 Py_ssize_t char_size; 702 int share_wstr, share_utf8; 703 void *data; 704 705 data = _PyUnicode_DATA_ANY(unicode); 706 char_size = PyUnicode_KIND(unicode); 707 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 708 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 709 710 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 711 PyErr_NoMemory(); 712 return -1; 713 } 714 new_size = (length + 1) * char_size; 715 716 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 717 { 718 PyObject_DEL(_PyUnicode_UTF8(unicode)); 719 _PyUnicode_UTF8(unicode) = NULL; 720 _PyUnicode_UTF8_LENGTH(unicode) = 0; 721 } 722 723 data = (PyObject *)PyObject_REALLOC(data, new_size); 724 if (data == NULL) { 725 PyErr_NoMemory(); 726 return -1; 727 } 728 _PyUnicode_DATA_ANY(unicode) = data; 729 if (share_wstr) { 730 _PyUnicode_WSTR(unicode) = data; 731 _PyUnicode_WSTR_LENGTH(unicode) = length; 732 } 733 if (share_utf8) { 734 _PyUnicode_UTF8(unicode) = data; 735 _PyUnicode_UTF8_LENGTH(unicode) = length; 736 } 737 _PyUnicode_LENGTH(unicode) = length; 738 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 739 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 740 assert(_PyUnicode_CheckConsistency(unicode, 0)); 741 return 0; 742 } 743 } 744 assert(_PyUnicode_WSTR(unicode) != NULL); 745 746 /* check for integer overflow */ 747 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 748 PyErr_NoMemory(); 749 return -1; 750 } 751 new_size = sizeof(wchar_t) * (length + 1); 752 wstr = _PyUnicode_WSTR(unicode); 753 wstr = PyObject_REALLOC(wstr, new_size); 754 if (!wstr) { 755 PyErr_NoMemory(); 756 return -1; 757 } 758 _PyUnicode_WSTR(unicode) = wstr; 759 _PyUnicode_WSTR(unicode)[length] = 0; 760 _PyUnicode_WSTR_LENGTH(unicode) = length; 761 assert(_PyUnicode_CheckConsistency(unicode, 0)); 762 return 0; 763} 764 765static PyObject* 766resize_copy(PyObject *unicode, Py_ssize_t length) 767{ 768 Py_ssize_t copy_length; 769 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 770 PyObject *copy; 771 772 if (PyUnicode_READY(unicode) == -1) 773 return NULL; 774 775 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 776 if (copy == NULL) 777 return NULL; 778 779 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 780 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 781 return copy; 782 } 783 else { 784 PyObject *w; 785 786 w = (PyObject*)_PyUnicode_New(length); 787 if (w == NULL) 788 return NULL; 789 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 790 copy_length = Py_MIN(copy_length, length); 791 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 792 copy_length); 793 return w; 794 } 795} 796 797/* We allocate one more byte to make sure the string is 798 Ux0000 terminated; some code (e.g. new_identifier) 799 relies on that. 800 801 XXX This allocator could further be enhanced by assuring that the 802 free list never reduces its size below 1. 803 804*/ 805 806static PyUnicodeObject * 807_PyUnicode_New(Py_ssize_t length) 808{ 809 register PyUnicodeObject *unicode; 810 size_t new_size; 811 812 /* Optimization for empty strings */ 813 if (length == 0 && unicode_empty != NULL) { 814 Py_INCREF(unicode_empty); 815 return (PyUnicodeObject*)unicode_empty; 816 } 817 818 /* Ensure we won't overflow the size. */ 819 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 820 return (PyUnicodeObject *)PyErr_NoMemory(); 821 } 822 if (length < 0) { 823 PyErr_SetString(PyExc_SystemError, 824 "Negative size passed to _PyUnicode_New"); 825 return NULL; 826 } 827 828 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 829 if (unicode == NULL) 830 return NULL; 831 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 832 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 833 if (!_PyUnicode_WSTR(unicode)) { 834 Py_DECREF(unicode); 835 PyErr_NoMemory(); 836 return NULL; 837 } 838 839 /* Initialize the first element to guard against cases where 840 * the caller fails before initializing str -- unicode_resize() 841 * reads str[0], and the Keep-Alive optimization can keep memory 842 * allocated for str alive across a call to unicode_dealloc(unicode). 843 * We don't want unicode_resize to read uninitialized memory in 844 * that case. 845 */ 846 _PyUnicode_WSTR(unicode)[0] = 0; 847 _PyUnicode_WSTR(unicode)[length] = 0; 848 _PyUnicode_WSTR_LENGTH(unicode) = length; 849 _PyUnicode_HASH(unicode) = -1; 850 _PyUnicode_STATE(unicode).interned = 0; 851 _PyUnicode_STATE(unicode).kind = 0; 852 _PyUnicode_STATE(unicode).compact = 0; 853 _PyUnicode_STATE(unicode).ready = 0; 854 _PyUnicode_STATE(unicode).ascii = 0; 855 _PyUnicode_DATA_ANY(unicode) = NULL; 856 _PyUnicode_LENGTH(unicode) = 0; 857 _PyUnicode_UTF8(unicode) = NULL; 858 _PyUnicode_UTF8_LENGTH(unicode) = 0; 859 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 860 return unicode; 861} 862 863static const char* 864unicode_kind_name(PyObject *unicode) 865{ 866 /* don't check consistency: unicode_kind_name() is called from 867 _PyUnicode_Dump() */ 868 if (!PyUnicode_IS_COMPACT(unicode)) 869 { 870 if (!PyUnicode_IS_READY(unicode)) 871 return "wstr"; 872 switch (PyUnicode_KIND(unicode)) 873 { 874 case PyUnicode_1BYTE_KIND: 875 if (PyUnicode_IS_ASCII(unicode)) 876 return "legacy ascii"; 877 else 878 return "legacy latin1"; 879 case PyUnicode_2BYTE_KIND: 880 return "legacy UCS2"; 881 case PyUnicode_4BYTE_KIND: 882 return "legacy UCS4"; 883 default: 884 return "<legacy invalid kind>"; 885 } 886 } 887 assert(PyUnicode_IS_READY(unicode)); 888 switch (PyUnicode_KIND(unicode)) { 889 case PyUnicode_1BYTE_KIND: 890 if (PyUnicode_IS_ASCII(unicode)) 891 return "ascii"; 892 else 893 return "latin1"; 894 case PyUnicode_2BYTE_KIND: 895 return "UCS2"; 896 case PyUnicode_4BYTE_KIND: 897 return "UCS4"; 898 default: 899 return "<invalid compact kind>"; 900 } 901} 902 903#ifdef Py_DEBUG 904/* Functions wrapping macros for use in debugger */ 905char *_PyUnicode_utf8(void *unicode){ 906 return PyUnicode_UTF8(unicode); 907} 908 909void *_PyUnicode_compact_data(void *unicode) { 910 return _PyUnicode_COMPACT_DATA(unicode); 911} 912void *_PyUnicode_data(void *unicode){ 913 printf("obj %p\n", unicode); 914 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 915 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 916 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 917 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 918 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 919 return PyUnicode_DATA(unicode); 920} 921 922void 923_PyUnicode_Dump(PyObject *op) 924{ 925 PyASCIIObject *ascii = (PyASCIIObject *)op; 926 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 927 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 928 void *data; 929 930 if (ascii->state.compact) 931 { 932 if (ascii->state.ascii) 933 data = (ascii + 1); 934 else 935 data = (compact + 1); 936 } 937 else 938 data = unicode->data.any; 939 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 940 941 if (ascii->wstr == data) 942 printf("shared "); 943 printf("wstr=%p", ascii->wstr); 944 945 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 946 printf(" (%zu), ", compact->wstr_length); 947 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 948 printf("shared "); 949 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 950 } 951 printf(", data=%p\n", data); 952} 953#endif 954 955PyObject * 956PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 957{ 958 PyObject *obj; 959 PyCompactUnicodeObject *unicode; 960 void *data; 961 enum PyUnicode_Kind kind; 962 int is_sharing, is_ascii; 963 Py_ssize_t char_size; 964 Py_ssize_t struct_size; 965 966 /* Optimization for empty strings */ 967 if (size == 0 && unicode_empty != NULL) { 968 Py_INCREF(unicode_empty); 969 return unicode_empty; 970 } 971 972 is_ascii = 0; 973 is_sharing = 0; 974 struct_size = sizeof(PyCompactUnicodeObject); 975 if (maxchar < 128) { 976 kind = PyUnicode_1BYTE_KIND; 977 char_size = 1; 978 is_ascii = 1; 979 struct_size = sizeof(PyASCIIObject); 980 } 981 else if (maxchar < 256) { 982 kind = PyUnicode_1BYTE_KIND; 983 char_size = 1; 984 } 985 else if (maxchar < 65536) { 986 kind = PyUnicode_2BYTE_KIND; 987 char_size = 2; 988 if (sizeof(wchar_t) == 2) 989 is_sharing = 1; 990 } 991 else { 992 if (maxchar > MAX_UNICODE) { 993 PyErr_SetString(PyExc_SystemError, 994 "invalid maximum character passed to PyUnicode_New"); 995 return NULL; 996 } 997 kind = PyUnicode_4BYTE_KIND; 998 char_size = 4; 999 if (sizeof(wchar_t) == 4) 1000 is_sharing = 1; 1001 } 1002 1003 /* Ensure we won't overflow the size. */ 1004 if (size < 0) { 1005 PyErr_SetString(PyExc_SystemError, 1006 "Negative size passed to PyUnicode_New"); 1007 return NULL; 1008 } 1009 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1010 return PyErr_NoMemory(); 1011 1012 /* Duplicated allocation code from _PyObject_New() instead of a call to 1013 * PyObject_New() so we are able to allocate space for the object and 1014 * it's data buffer. 1015 */ 1016 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1017 if (obj == NULL) 1018 return PyErr_NoMemory(); 1019 obj = PyObject_INIT(obj, &PyUnicode_Type); 1020 if (obj == NULL) 1021 return NULL; 1022 1023 unicode = (PyCompactUnicodeObject *)obj; 1024 if (is_ascii) 1025 data = ((PyASCIIObject*)obj) + 1; 1026 else 1027 data = unicode + 1; 1028 _PyUnicode_LENGTH(unicode) = size; 1029 _PyUnicode_HASH(unicode) = -1; 1030 _PyUnicode_STATE(unicode).interned = 0; 1031 _PyUnicode_STATE(unicode).kind = kind; 1032 _PyUnicode_STATE(unicode).compact = 1; 1033 _PyUnicode_STATE(unicode).ready = 1; 1034 _PyUnicode_STATE(unicode).ascii = is_ascii; 1035 if (is_ascii) { 1036 ((char*)data)[size] = 0; 1037 _PyUnicode_WSTR(unicode) = NULL; 1038 } 1039 else if (kind == PyUnicode_1BYTE_KIND) { 1040 ((char*)data)[size] = 0; 1041 _PyUnicode_WSTR(unicode) = NULL; 1042 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1043 unicode->utf8 = NULL; 1044 unicode->utf8_length = 0; 1045 } 1046 else { 1047 unicode->utf8 = NULL; 1048 unicode->utf8_length = 0; 1049 if (kind == PyUnicode_2BYTE_KIND) 1050 ((Py_UCS2*)data)[size] = 0; 1051 else /* kind == PyUnicode_4BYTE_KIND */ 1052 ((Py_UCS4*)data)[size] = 0; 1053 if (is_sharing) { 1054 _PyUnicode_WSTR_LENGTH(unicode) = size; 1055 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1056 } 1057 else { 1058 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1059 _PyUnicode_WSTR(unicode) = NULL; 1060 } 1061 } 1062#ifdef Py_DEBUG 1063 /* Fill the data with invalid characters to detect bugs earlier. 1064 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, 1065 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII 1066 and U+FFFFFFFF is an invalid character in Unicode 6.0. */ 1067 memset(data, 0xff, size * kind); 1068#endif 1069 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1070 return obj; 1071} 1072 1073#if SIZEOF_WCHAR_T == 2 1074/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1075 will decode surrogate pairs, the other conversions are implemented as macros 1076 for efficiency. 1077 1078 This function assumes that unicode can hold one more code point than wstr 1079 characters for a terminating null character. */ 1080static void 1081unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1082 PyObject *unicode) 1083{ 1084 const wchar_t *iter; 1085 Py_UCS4 *ucs4_out; 1086 1087 assert(unicode != NULL); 1088 assert(_PyUnicode_CHECK(unicode)); 1089 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1090 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1091 1092 for (iter = begin; iter < end; ) { 1093 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1094 _PyUnicode_GET_LENGTH(unicode))); 1095 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1096 && (iter+1) < end 1097 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1098 { 1099 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1100 iter += 2; 1101 } 1102 else { 1103 *ucs4_out++ = *iter; 1104 iter++; 1105 } 1106 } 1107 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1108 _PyUnicode_GET_LENGTH(unicode))); 1109 1110} 1111#endif 1112 1113static int 1114unicode_check_modifiable(PyObject *unicode) 1115{ 1116 if (!unicode_modifiable(unicode)) { 1117 PyErr_SetString(PyExc_SystemError, 1118 "Cannot modify a string currently used"); 1119 return -1; 1120 } 1121 return 0; 1122} 1123 1124static int 1125_copy_characters(PyObject *to, Py_ssize_t to_start, 1126 PyObject *from, Py_ssize_t from_start, 1127 Py_ssize_t how_many, int check_maxchar) 1128{ 1129 unsigned int from_kind, to_kind; 1130 void *from_data, *to_data; 1131 1132 assert(0 <= how_many); 1133 assert(0 <= from_start); 1134 assert(0 <= to_start); 1135 assert(PyUnicode_Check(from)); 1136 assert(PyUnicode_IS_READY(from)); 1137 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1138 1139 assert(PyUnicode_Check(to)); 1140 assert(PyUnicode_IS_READY(to)); 1141 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1142 1143 if (how_many == 0) 1144 return 0; 1145 1146 from_kind = PyUnicode_KIND(from); 1147 from_data = PyUnicode_DATA(from); 1148 to_kind = PyUnicode_KIND(to); 1149 to_data = PyUnicode_DATA(to); 1150 1151#ifdef Py_DEBUG 1152 if (!check_maxchar 1153 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1154 { 1155 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1156 Py_UCS4 ch; 1157 Py_ssize_t i; 1158 for (i=0; i < how_many; i++) { 1159 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1160 assert(ch <= to_maxchar); 1161 } 1162 } 1163#endif 1164 1165 if (from_kind == to_kind) { 1166 if (check_maxchar 1167 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1168 { 1169 /* Writing Latin-1 characters into an ASCII string requires to 1170 check that all written characters are pure ASCII */ 1171 Py_UCS4 max_char; 1172 max_char = ucs1lib_find_max_char(from_data, 1173 (Py_UCS1*)from_data + how_many); 1174 if (max_char >= 128) 1175 return -1; 1176 } 1177 Py_MEMCPY((char*)to_data + to_kind * to_start, 1178 (char*)from_data + from_kind * from_start, 1179 to_kind * how_many); 1180 } 1181 else if (from_kind == PyUnicode_1BYTE_KIND 1182 && to_kind == PyUnicode_2BYTE_KIND) 1183 { 1184 _PyUnicode_CONVERT_BYTES( 1185 Py_UCS1, Py_UCS2, 1186 PyUnicode_1BYTE_DATA(from) + from_start, 1187 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1188 PyUnicode_2BYTE_DATA(to) + to_start 1189 ); 1190 } 1191 else if (from_kind == PyUnicode_1BYTE_KIND 1192 && to_kind == PyUnicode_4BYTE_KIND) 1193 { 1194 _PyUnicode_CONVERT_BYTES( 1195 Py_UCS1, Py_UCS4, 1196 PyUnicode_1BYTE_DATA(from) + from_start, 1197 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1198 PyUnicode_4BYTE_DATA(to) + to_start 1199 ); 1200 } 1201 else if (from_kind == PyUnicode_2BYTE_KIND 1202 && to_kind == PyUnicode_4BYTE_KIND) 1203 { 1204 _PyUnicode_CONVERT_BYTES( 1205 Py_UCS2, Py_UCS4, 1206 PyUnicode_2BYTE_DATA(from) + from_start, 1207 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1208 PyUnicode_4BYTE_DATA(to) + to_start 1209 ); 1210 } 1211 else { 1212 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1213 1214 if (!check_maxchar) { 1215 if (from_kind == PyUnicode_2BYTE_KIND 1216 && to_kind == PyUnicode_1BYTE_KIND) 1217 { 1218 _PyUnicode_CONVERT_BYTES( 1219 Py_UCS2, Py_UCS1, 1220 PyUnicode_2BYTE_DATA(from) + from_start, 1221 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1222 PyUnicode_1BYTE_DATA(to) + to_start 1223 ); 1224 } 1225 else if (from_kind == PyUnicode_4BYTE_KIND 1226 && to_kind == PyUnicode_1BYTE_KIND) 1227 { 1228 _PyUnicode_CONVERT_BYTES( 1229 Py_UCS4, Py_UCS1, 1230 PyUnicode_4BYTE_DATA(from) + from_start, 1231 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1232 PyUnicode_1BYTE_DATA(to) + to_start 1233 ); 1234 } 1235 else if (from_kind == PyUnicode_4BYTE_KIND 1236 && to_kind == PyUnicode_2BYTE_KIND) 1237 { 1238 _PyUnicode_CONVERT_BYTES( 1239 Py_UCS4, Py_UCS2, 1240 PyUnicode_4BYTE_DATA(from) + from_start, 1241 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1242 PyUnicode_2BYTE_DATA(to) + to_start 1243 ); 1244 } 1245 else { 1246 assert(0); 1247 return -1; 1248 } 1249 } 1250 else { 1251 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1252 Py_UCS4 ch; 1253 Py_ssize_t i; 1254 1255 for (i=0; i < how_many; i++) { 1256 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1257 if (ch > to_maxchar) 1258 return -1; 1259 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1260 } 1261 } 1262 } 1263 return 0; 1264} 1265 1266void 1267_PyUnicode_FastCopyCharacters( 1268 PyObject *to, Py_ssize_t to_start, 1269 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1270{ 1271 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1272} 1273 1274Py_ssize_t 1275PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1276 PyObject *from, Py_ssize_t from_start, 1277 Py_ssize_t how_many) 1278{ 1279 int err; 1280 1281 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1282 PyErr_BadInternalCall(); 1283 return -1; 1284 } 1285 1286 if (PyUnicode_READY(from) == -1) 1287 return -1; 1288 if (PyUnicode_READY(to) == -1) 1289 return -1; 1290 1291 if (from_start < 0) { 1292 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1293 return -1; 1294 } 1295 if (to_start < 0) { 1296 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1297 return -1; 1298 } 1299 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1300 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1301 PyErr_Format(PyExc_SystemError, 1302 "Cannot write %zi characters at %zi " 1303 "in a string of %zi characters", 1304 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1305 return -1; 1306 } 1307 1308 if (how_many == 0) 1309 return 0; 1310 1311 if (unicode_check_modifiable(to)) 1312 return -1; 1313 1314 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1315 if (err) { 1316 PyErr_Format(PyExc_SystemError, 1317 "Cannot copy %s characters " 1318 "into a string of %s characters", 1319 unicode_kind_name(from), 1320 unicode_kind_name(to)); 1321 return -1; 1322 } 1323 return how_many; 1324} 1325 1326/* Find the maximum code point and count the number of surrogate pairs so a 1327 correct string length can be computed before converting a string to UCS4. 1328 This function counts single surrogates as a character and not as a pair. 1329 1330 Return 0 on success, or -1 on error. */ 1331static int 1332find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1333 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1334{ 1335 const wchar_t *iter; 1336 Py_UCS4 ch; 1337 1338 assert(num_surrogates != NULL && maxchar != NULL); 1339 *num_surrogates = 0; 1340 *maxchar = 0; 1341 1342 for (iter = begin; iter < end; ) { 1343#if SIZEOF_WCHAR_T == 2 1344 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1345 && (iter+1) < end 1346 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1347 { 1348 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1349 ++(*num_surrogates); 1350 iter += 2; 1351 } 1352 else 1353#endif 1354 { 1355 ch = *iter; 1356 iter++; 1357 } 1358 if (ch > *maxchar) { 1359 *maxchar = ch; 1360 if (*maxchar > MAX_UNICODE) { 1361 PyErr_Format(PyExc_ValueError, 1362 "character U+%x is not in range [U+0000; U+10ffff]", 1363 ch); 1364 return -1; 1365 } 1366 } 1367 } 1368 return 0; 1369} 1370 1371int 1372_PyUnicode_Ready(PyObject *unicode) 1373{ 1374 wchar_t *end; 1375 Py_UCS4 maxchar = 0; 1376 Py_ssize_t num_surrogates; 1377#if SIZEOF_WCHAR_T == 2 1378 Py_ssize_t length_wo_surrogates; 1379#endif 1380 1381 /* _PyUnicode_Ready() is only intended for old-style API usage where 1382 strings were created using _PyObject_New() and where no canonical 1383 representation (the str field) has been set yet aka strings 1384 which are not yet ready. */ 1385 assert(_PyUnicode_CHECK(unicode)); 1386 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1387 assert(_PyUnicode_WSTR(unicode) != NULL); 1388 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1389 assert(_PyUnicode_UTF8(unicode) == NULL); 1390 /* Actually, it should neither be interned nor be anything else: */ 1391 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1392 1393 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1394 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1395 &maxchar, &num_surrogates) == -1) 1396 return -1; 1397 1398 if (maxchar < 256) { 1399 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1400 if (!_PyUnicode_DATA_ANY(unicode)) { 1401 PyErr_NoMemory(); 1402 return -1; 1403 } 1404 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1405 _PyUnicode_WSTR(unicode), end, 1406 PyUnicode_1BYTE_DATA(unicode)); 1407 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1408 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1409 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1410 if (maxchar < 128) { 1411 _PyUnicode_STATE(unicode).ascii = 1; 1412 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1413 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1414 } 1415 else { 1416 _PyUnicode_STATE(unicode).ascii = 0; 1417 _PyUnicode_UTF8(unicode) = NULL; 1418 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1419 } 1420 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1421 _PyUnicode_WSTR(unicode) = NULL; 1422 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1423 } 1424 /* In this case we might have to convert down from 4-byte native 1425 wchar_t to 2-byte unicode. */ 1426 else if (maxchar < 65536) { 1427 assert(num_surrogates == 0 && 1428 "FindMaxCharAndNumSurrogatePairs() messed up"); 1429 1430#if SIZEOF_WCHAR_T == 2 1431 /* We can share representations and are done. */ 1432 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1433 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1434 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1435 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1436 _PyUnicode_UTF8(unicode) = NULL; 1437 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1438#else 1439 /* sizeof(wchar_t) == 4 */ 1440 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1441 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1442 if (!_PyUnicode_DATA_ANY(unicode)) { 1443 PyErr_NoMemory(); 1444 return -1; 1445 } 1446 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1447 _PyUnicode_WSTR(unicode), end, 1448 PyUnicode_2BYTE_DATA(unicode)); 1449 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1450 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1451 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1452 _PyUnicode_UTF8(unicode) = NULL; 1453 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1454 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1455 _PyUnicode_WSTR(unicode) = NULL; 1456 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1457#endif 1458 } 1459 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1460 else { 1461#if SIZEOF_WCHAR_T == 2 1462 /* in case the native representation is 2-bytes, we need to allocate a 1463 new normalized 4-byte version. */ 1464 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1465 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1466 if (!_PyUnicode_DATA_ANY(unicode)) { 1467 PyErr_NoMemory(); 1468 return -1; 1469 } 1470 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1471 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1472 _PyUnicode_UTF8(unicode) = NULL; 1473 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1474 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1475 _PyUnicode_STATE(unicode).ready = 1; 1476 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1477 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1478 _PyUnicode_WSTR(unicode) = NULL; 1479 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1480#else 1481 assert(num_surrogates == 0); 1482 1483 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1484 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1485 _PyUnicode_UTF8(unicode) = NULL; 1486 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1487 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1488#endif 1489 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1490 } 1491 _PyUnicode_STATE(unicode).ready = 1; 1492 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1493 return 0; 1494} 1495 1496static void 1497unicode_dealloc(register PyObject *unicode) 1498{ 1499 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1500 case SSTATE_NOT_INTERNED: 1501 break; 1502 1503 case SSTATE_INTERNED_MORTAL: 1504 /* revive dead object temporarily for DelItem */ 1505 Py_REFCNT(unicode) = 3; 1506 if (PyDict_DelItem(interned, unicode) != 0) 1507 Py_FatalError( 1508 "deletion of interned string failed"); 1509 break; 1510 1511 case SSTATE_INTERNED_IMMORTAL: 1512 Py_FatalError("Immortal interned string died."); 1513 1514 default: 1515 Py_FatalError("Inconsistent interned string state."); 1516 } 1517 1518 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1519 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1520 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1521 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1522 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1523 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1524 1525 Py_TYPE(unicode)->tp_free(unicode); 1526} 1527 1528#ifdef Py_DEBUG 1529static int 1530unicode_is_singleton(PyObject *unicode) 1531{ 1532 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1533 if (unicode == unicode_empty) 1534 return 1; 1535 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1536 { 1537 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1538 if (ch < 256 && unicode_latin1[ch] == unicode) 1539 return 1; 1540 } 1541 return 0; 1542} 1543#endif 1544 1545static int 1546unicode_modifiable(PyObject *unicode) 1547{ 1548 assert(_PyUnicode_CHECK(unicode)); 1549 if (Py_REFCNT(unicode) != 1) 1550 return 0; 1551 if (_PyUnicode_HASH(unicode) != -1) 1552 return 0; 1553 if (PyUnicode_CHECK_INTERNED(unicode)) 1554 return 0; 1555 if (!PyUnicode_CheckExact(unicode)) 1556 return 0; 1557#ifdef Py_DEBUG 1558 /* singleton refcount is greater than 1 */ 1559 assert(!unicode_is_singleton(unicode)); 1560#endif 1561 return 1; 1562} 1563 1564static int 1565unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1566{ 1567 PyObject *unicode; 1568 Py_ssize_t old_length; 1569 1570 assert(p_unicode != NULL); 1571 unicode = *p_unicode; 1572 1573 assert(unicode != NULL); 1574 assert(PyUnicode_Check(unicode)); 1575 assert(0 <= length); 1576 1577 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1578 old_length = PyUnicode_WSTR_LENGTH(unicode); 1579 else 1580 old_length = PyUnicode_GET_LENGTH(unicode); 1581 if (old_length == length) 1582 return 0; 1583 1584 if (length == 0) { 1585 Py_DECREF(*p_unicode); 1586 *p_unicode = unicode_empty; 1587 Py_INCREF(*p_unicode); 1588 return 0; 1589 } 1590 1591 if (!unicode_modifiable(unicode)) { 1592 PyObject *copy = resize_copy(unicode, length); 1593 if (copy == NULL) 1594 return -1; 1595 Py_DECREF(*p_unicode); 1596 *p_unicode = copy; 1597 return 0; 1598 } 1599 1600 if (PyUnicode_IS_COMPACT(unicode)) { 1601 PyObject *new_unicode = resize_compact(unicode, length); 1602 if (new_unicode == NULL) 1603 return -1; 1604 *p_unicode = new_unicode; 1605 return 0; 1606 } 1607 return resize_inplace(unicode, length); 1608} 1609 1610int 1611PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1612{ 1613 PyObject *unicode; 1614 if (p_unicode == NULL) { 1615 PyErr_BadInternalCall(); 1616 return -1; 1617 } 1618 unicode = *p_unicode; 1619 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1620 { 1621 PyErr_BadInternalCall(); 1622 return -1; 1623 } 1624 return unicode_resize(p_unicode, length); 1625} 1626 1627static int 1628unicode_widen(PyObject **p_unicode, Py_ssize_t length, 1629 unsigned int maxchar) 1630{ 1631 PyObject *result; 1632 assert(PyUnicode_IS_READY(*p_unicode)); 1633 assert(length <= PyUnicode_GET_LENGTH(*p_unicode)); 1634 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode)) 1635 return 0; 1636 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode), 1637 maxchar); 1638 if (result == NULL) 1639 return -1; 1640 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length); 1641 Py_DECREF(*p_unicode); 1642 *p_unicode = result; 1643 return 0; 1644} 1645 1646static int 1647unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos, 1648 Py_UCS4 ch) 1649{ 1650 assert(ch <= MAX_UNICODE); 1651 if (unicode_widen(p_unicode, *pos, ch) < 0) 1652 return -1; 1653 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode), 1654 PyUnicode_DATA(*p_unicode), 1655 (*pos)++, ch); 1656 return 0; 1657} 1658 1659/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1660 1661 WARNING: The function doesn't copy the terminating null character and 1662 doesn't check the maximum character (may write a latin1 character in an 1663 ASCII string). */ 1664static void 1665unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1666 const char *str, Py_ssize_t len) 1667{ 1668 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1669 void *data = PyUnicode_DATA(unicode); 1670 const char *end = str + len; 1671 1672 switch (kind) { 1673 case PyUnicode_1BYTE_KIND: { 1674 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1675 memcpy((char *) data + index, str, len); 1676 break; 1677 } 1678 case PyUnicode_2BYTE_KIND: { 1679 Py_UCS2 *start = (Py_UCS2 *)data + index; 1680 Py_UCS2 *ucs2 = start; 1681 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1682 1683 for (; str < end; ++ucs2, ++str) 1684 *ucs2 = (Py_UCS2)*str; 1685 1686 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1687 break; 1688 } 1689 default: { 1690 Py_UCS4 *start = (Py_UCS4 *)data + index; 1691 Py_UCS4 *ucs4 = start; 1692 assert(kind == PyUnicode_4BYTE_KIND); 1693 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1694 1695 for (; str < end; ++ucs4, ++str) 1696 *ucs4 = (Py_UCS4)*str; 1697 1698 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1699 } 1700 } 1701} 1702 1703 1704static PyObject* 1705get_latin1_char(unsigned char ch) 1706{ 1707 PyObject *unicode = unicode_latin1[ch]; 1708 if (!unicode) { 1709 unicode = PyUnicode_New(1, ch); 1710 if (!unicode) 1711 return NULL; 1712 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1713 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1714 unicode_latin1[ch] = unicode; 1715 } 1716 Py_INCREF(unicode); 1717 return unicode; 1718} 1719 1720PyObject * 1721PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1722{ 1723 PyObject *unicode; 1724 Py_UCS4 maxchar = 0; 1725 Py_ssize_t num_surrogates; 1726 1727 if (u == NULL) 1728 return (PyObject*)_PyUnicode_New(size); 1729 1730 /* If the Unicode data is known at construction time, we can apply 1731 some optimizations which share commonly used objects. */ 1732 1733 /* Optimization for empty strings */ 1734 if (size == 0 && unicode_empty != NULL) { 1735 Py_INCREF(unicode_empty); 1736 return unicode_empty; 1737 } 1738 1739 /* Single character Unicode objects in the Latin-1 range are 1740 shared when using this constructor */ 1741 if (size == 1 && *u < 256) 1742 return get_latin1_char((unsigned char)*u); 1743 1744 /* If not empty and not single character, copy the Unicode data 1745 into the new object */ 1746 if (find_maxchar_surrogates(u, u + size, 1747 &maxchar, &num_surrogates) == -1) 1748 return NULL; 1749 1750 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1751 if (!unicode) 1752 return NULL; 1753 1754 switch (PyUnicode_KIND(unicode)) { 1755 case PyUnicode_1BYTE_KIND: 1756 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1757 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1758 break; 1759 case PyUnicode_2BYTE_KIND: 1760#if Py_UNICODE_SIZE == 2 1761 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1762#else 1763 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1764 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1765#endif 1766 break; 1767 case PyUnicode_4BYTE_KIND: 1768#if SIZEOF_WCHAR_T == 2 1769 /* This is the only case which has to process surrogates, thus 1770 a simple copy loop is not enough and we need a function. */ 1771 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1772#else 1773 assert(num_surrogates == 0); 1774 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1775#endif 1776 break; 1777 default: 1778 assert(0 && "Impossible state"); 1779 } 1780 1781 return unicode_result(unicode); 1782} 1783 1784PyObject * 1785PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1786{ 1787 if (size < 0) { 1788 PyErr_SetString(PyExc_SystemError, 1789 "Negative size passed to PyUnicode_FromStringAndSize"); 1790 return NULL; 1791 } 1792 if (u != NULL) 1793 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1794 else 1795 return (PyObject *)_PyUnicode_New(size); 1796} 1797 1798PyObject * 1799PyUnicode_FromString(const char *u) 1800{ 1801 size_t size = strlen(u); 1802 if (size > PY_SSIZE_T_MAX) { 1803 PyErr_SetString(PyExc_OverflowError, "input too long"); 1804 return NULL; 1805 } 1806 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1807} 1808 1809PyObject * 1810_PyUnicode_FromId(_Py_Identifier *id) 1811{ 1812 if (!id->object) { 1813 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1814 strlen(id->string), 1815 NULL, NULL); 1816 if (!id->object) 1817 return NULL; 1818 PyUnicode_InternInPlace(&id->object); 1819 assert(!id->next); 1820 id->next = static_strings; 1821 static_strings = id; 1822 } 1823 return id->object; 1824} 1825 1826void 1827_PyUnicode_ClearStaticStrings() 1828{ 1829 _Py_Identifier *i; 1830 for (i = static_strings; i; i = i->next) { 1831 Py_DECREF(i->object); 1832 i->object = NULL; 1833 i->next = NULL; 1834 } 1835} 1836 1837/* Internal function, doesn't check maximum character */ 1838 1839PyObject* 1840_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1841{ 1842 const unsigned char *s = (const unsigned char *)buffer; 1843 PyObject *unicode; 1844 if (size == 1) { 1845#ifdef Py_DEBUG 1846 assert(s[0] < 128); 1847#endif 1848 return get_latin1_char(s[0]); 1849 } 1850 unicode = PyUnicode_New(size, 127); 1851 if (!unicode) 1852 return NULL; 1853 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1854 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1855 return unicode; 1856} 1857 1858static Py_UCS4 1859kind_maxchar_limit(unsigned int kind) 1860{ 1861 switch (kind) { 1862 case PyUnicode_1BYTE_KIND: 1863 return 0x80; 1864 case PyUnicode_2BYTE_KIND: 1865 return 0x100; 1866 case PyUnicode_4BYTE_KIND: 1867 return 0x10000; 1868 default: 1869 assert(0 && "invalid kind"); 1870 return MAX_UNICODE; 1871 } 1872} 1873 1874Py_LOCAL_INLINE(Py_UCS4) 1875align_maxchar(Py_UCS4 maxchar) 1876{ 1877 if (maxchar <= 127) 1878 return 127; 1879 else if (maxchar <= 255) 1880 return 255; 1881 else if (maxchar <= 65535) 1882 return 65535; 1883 else 1884 return MAX_UNICODE; 1885} 1886 1887static PyObject* 1888_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1889{ 1890 PyObject *res; 1891 unsigned char max_char; 1892 1893 if (size == 0) { 1894 Py_INCREF(unicode_empty); 1895 return unicode_empty; 1896 } 1897 assert(size > 0); 1898 if (size == 1) 1899 return get_latin1_char(u[0]); 1900 1901 max_char = ucs1lib_find_max_char(u, u + size); 1902 res = PyUnicode_New(size, max_char); 1903 if (!res) 1904 return NULL; 1905 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1906 assert(_PyUnicode_CheckConsistency(res, 1)); 1907 return res; 1908} 1909 1910static PyObject* 1911_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1912{ 1913 PyObject *res; 1914 Py_UCS2 max_char; 1915 1916 if (size == 0) { 1917 Py_INCREF(unicode_empty); 1918 return unicode_empty; 1919 } 1920 assert(size > 0); 1921 if (size == 1) { 1922 Py_UCS4 ch = u[0]; 1923 if (ch < 256) 1924 return get_latin1_char((unsigned char)ch); 1925 1926 res = PyUnicode_New(1, ch); 1927 if (res == NULL) 1928 return NULL; 1929 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1930 assert(_PyUnicode_CheckConsistency(res, 1)); 1931 return res; 1932 } 1933 1934 max_char = ucs2lib_find_max_char(u, u + size); 1935 res = PyUnicode_New(size, max_char); 1936 if (!res) 1937 return NULL; 1938 if (max_char >= 256) 1939 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1940 else { 1941 _PyUnicode_CONVERT_BYTES( 1942 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1943 } 1944 assert(_PyUnicode_CheckConsistency(res, 1)); 1945 return res; 1946} 1947 1948static PyObject* 1949_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1950{ 1951 PyObject *res; 1952 Py_UCS4 max_char; 1953 1954 if (size == 0) { 1955 Py_INCREF(unicode_empty); 1956 return unicode_empty; 1957 } 1958 assert(size > 0); 1959 if (size == 1) { 1960 Py_UCS4 ch = u[0]; 1961 if (ch < 256) 1962 return get_latin1_char((unsigned char)ch); 1963 1964 res = PyUnicode_New(1, ch); 1965 if (res == NULL) 1966 return NULL; 1967 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1968 assert(_PyUnicode_CheckConsistency(res, 1)); 1969 return res; 1970 } 1971 1972 max_char = ucs4lib_find_max_char(u, u + size); 1973 res = PyUnicode_New(size, max_char); 1974 if (!res) 1975 return NULL; 1976 if (max_char < 256) 1977 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 1978 PyUnicode_1BYTE_DATA(res)); 1979 else if (max_char < 0x10000) 1980 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 1981 PyUnicode_2BYTE_DATA(res)); 1982 else 1983 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1984 assert(_PyUnicode_CheckConsistency(res, 1)); 1985 return res; 1986} 1987 1988PyObject* 1989PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1990{ 1991 if (size < 0) { 1992 PyErr_SetString(PyExc_ValueError, "size must be positive"); 1993 return NULL; 1994 } 1995 switch (kind) { 1996 case PyUnicode_1BYTE_KIND: 1997 return _PyUnicode_FromUCS1(buffer, size); 1998 case PyUnicode_2BYTE_KIND: 1999 return _PyUnicode_FromUCS2(buffer, size); 2000 case PyUnicode_4BYTE_KIND: 2001 return _PyUnicode_FromUCS4(buffer, size); 2002 default: 2003 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2004 return NULL; 2005 } 2006} 2007 2008Py_UCS4 2009_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2010{ 2011 enum PyUnicode_Kind kind; 2012 void *startptr, *endptr; 2013 2014 assert(PyUnicode_IS_READY(unicode)); 2015 assert(0 <= start); 2016 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2017 assert(start <= end); 2018 2019 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2020 return PyUnicode_MAX_CHAR_VALUE(unicode); 2021 2022 if (start == end) 2023 return 127; 2024 2025 if (PyUnicode_IS_ASCII(unicode)) 2026 return 127; 2027 2028 kind = PyUnicode_KIND(unicode); 2029 startptr = PyUnicode_DATA(unicode); 2030 endptr = (char *)startptr + end * kind; 2031 startptr = (char *)startptr + start * kind; 2032 switch(kind) { 2033 case PyUnicode_1BYTE_KIND: 2034 return ucs1lib_find_max_char(startptr, endptr); 2035 case PyUnicode_2BYTE_KIND: 2036 return ucs2lib_find_max_char(startptr, endptr); 2037 case PyUnicode_4BYTE_KIND: 2038 return ucs4lib_find_max_char(startptr, endptr); 2039 default: 2040 assert(0); 2041 return 0; 2042 } 2043} 2044 2045/* Ensure that a string uses the most efficient storage, if it is not the 2046 case: create a new string with of the right kind. Write NULL into *p_unicode 2047 on error. */ 2048static void 2049unicode_adjust_maxchar(PyObject **p_unicode) 2050{ 2051 PyObject *unicode, *copy; 2052 Py_UCS4 max_char; 2053 Py_ssize_t len; 2054 unsigned int kind; 2055 2056 assert(p_unicode != NULL); 2057 unicode = *p_unicode; 2058 assert(PyUnicode_IS_READY(unicode)); 2059 if (PyUnicode_IS_ASCII(unicode)) 2060 return; 2061 2062 len = PyUnicode_GET_LENGTH(unicode); 2063 kind = PyUnicode_KIND(unicode); 2064 if (kind == PyUnicode_1BYTE_KIND) { 2065 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2066 max_char = ucs1lib_find_max_char(u, u + len); 2067 if (max_char >= 128) 2068 return; 2069 } 2070 else if (kind == PyUnicode_2BYTE_KIND) { 2071 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2072 max_char = ucs2lib_find_max_char(u, u + len); 2073 if (max_char >= 256) 2074 return; 2075 } 2076 else { 2077 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2078 assert(kind == PyUnicode_4BYTE_KIND); 2079 max_char = ucs4lib_find_max_char(u, u + len); 2080 if (max_char >= 0x10000) 2081 return; 2082 } 2083 copy = PyUnicode_New(len, max_char); 2084 if (copy != NULL) 2085 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2086 Py_DECREF(unicode); 2087 *p_unicode = copy; 2088} 2089 2090PyObject* 2091_PyUnicode_Copy(PyObject *unicode) 2092{ 2093 Py_ssize_t length; 2094 PyObject *copy; 2095 2096 if (!PyUnicode_Check(unicode)) { 2097 PyErr_BadInternalCall(); 2098 return NULL; 2099 } 2100 if (PyUnicode_READY(unicode) == -1) 2101 return NULL; 2102 2103 length = PyUnicode_GET_LENGTH(unicode); 2104 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2105 if (!copy) 2106 return NULL; 2107 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2108 2109 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2110 length * PyUnicode_KIND(unicode)); 2111 assert(_PyUnicode_CheckConsistency(copy, 1)); 2112 return copy; 2113} 2114 2115 2116/* Widen Unicode objects to larger buffers. Don't write terminating null 2117 character. Return NULL on error. */ 2118 2119void* 2120_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2121{ 2122 Py_ssize_t len; 2123 void *result; 2124 unsigned int skind; 2125 2126 if (PyUnicode_READY(s) == -1) 2127 return NULL; 2128 2129 len = PyUnicode_GET_LENGTH(s); 2130 skind = PyUnicode_KIND(s); 2131 if (skind >= kind) { 2132 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2133 return NULL; 2134 } 2135 switch (kind) { 2136 case PyUnicode_2BYTE_KIND: 2137 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 2138 if (!result) 2139 return PyErr_NoMemory(); 2140 assert(skind == PyUnicode_1BYTE_KIND); 2141 _PyUnicode_CONVERT_BYTES( 2142 Py_UCS1, Py_UCS2, 2143 PyUnicode_1BYTE_DATA(s), 2144 PyUnicode_1BYTE_DATA(s) + len, 2145 result); 2146 return result; 2147 case PyUnicode_4BYTE_KIND: 2148 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 2149 if (!result) 2150 return PyErr_NoMemory(); 2151 if (skind == PyUnicode_2BYTE_KIND) { 2152 _PyUnicode_CONVERT_BYTES( 2153 Py_UCS2, Py_UCS4, 2154 PyUnicode_2BYTE_DATA(s), 2155 PyUnicode_2BYTE_DATA(s) + len, 2156 result); 2157 } 2158 else { 2159 assert(skind == PyUnicode_1BYTE_KIND); 2160 _PyUnicode_CONVERT_BYTES( 2161 Py_UCS1, Py_UCS4, 2162 PyUnicode_1BYTE_DATA(s), 2163 PyUnicode_1BYTE_DATA(s) + len, 2164 result); 2165 } 2166 return result; 2167 default: 2168 break; 2169 } 2170 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2171 return NULL; 2172} 2173 2174static Py_UCS4* 2175as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2176 int copy_null) 2177{ 2178 int kind; 2179 void *data; 2180 Py_ssize_t len, targetlen; 2181 if (PyUnicode_READY(string) == -1) 2182 return NULL; 2183 kind = PyUnicode_KIND(string); 2184 data = PyUnicode_DATA(string); 2185 len = PyUnicode_GET_LENGTH(string); 2186 targetlen = len; 2187 if (copy_null) 2188 targetlen++; 2189 if (!target) { 2190 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 2191 PyErr_NoMemory(); 2192 return NULL; 2193 } 2194 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 2195 if (!target) { 2196 PyErr_NoMemory(); 2197 return NULL; 2198 } 2199 } 2200 else { 2201 if (targetsize < targetlen) { 2202 PyErr_Format(PyExc_SystemError, 2203 "string is longer than the buffer"); 2204 if (copy_null && 0 < targetsize) 2205 target[0] = 0; 2206 return NULL; 2207 } 2208 } 2209 if (kind == PyUnicode_1BYTE_KIND) { 2210 Py_UCS1 *start = (Py_UCS1 *) data; 2211 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2212 } 2213 else if (kind == PyUnicode_2BYTE_KIND) { 2214 Py_UCS2 *start = (Py_UCS2 *) data; 2215 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2216 } 2217 else { 2218 assert(kind == PyUnicode_4BYTE_KIND); 2219 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2220 } 2221 if (copy_null) 2222 target[len] = 0; 2223 return target; 2224} 2225 2226Py_UCS4* 2227PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2228 int copy_null) 2229{ 2230 if (target == NULL || targetsize < 0) { 2231 PyErr_BadInternalCall(); 2232 return NULL; 2233 } 2234 return as_ucs4(string, target, targetsize, copy_null); 2235} 2236 2237Py_UCS4* 2238PyUnicode_AsUCS4Copy(PyObject *string) 2239{ 2240 return as_ucs4(string, NULL, 0, 1); 2241} 2242 2243#ifdef HAVE_WCHAR_H 2244 2245PyObject * 2246PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2247{ 2248 if (w == NULL) { 2249 if (size == 0) { 2250 Py_INCREF(unicode_empty); 2251 return unicode_empty; 2252 } 2253 PyErr_BadInternalCall(); 2254 return NULL; 2255 } 2256 2257 if (size == -1) { 2258 size = wcslen(w); 2259 } 2260 2261 return PyUnicode_FromUnicode(w, size); 2262} 2263 2264#endif /* HAVE_WCHAR_H */ 2265 2266static void 2267makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2268 int zeropad, int width, int precision, char c) 2269{ 2270 *fmt++ = '%'; 2271 if (width) { 2272 if (zeropad) 2273 *fmt++ = '0'; 2274 fmt += sprintf(fmt, "%d", width); 2275 } 2276 if (precision) 2277 fmt += sprintf(fmt, ".%d", precision); 2278 if (longflag) 2279 *fmt++ = 'l'; 2280 else if (longlongflag) { 2281 /* longlongflag should only ever be nonzero on machines with 2282 HAVE_LONG_LONG defined */ 2283#ifdef HAVE_LONG_LONG 2284 char *f = PY_FORMAT_LONG_LONG; 2285 while (*f) 2286 *fmt++ = *f++; 2287#else 2288 /* we shouldn't ever get here */ 2289 assert(0); 2290 *fmt++ = 'l'; 2291#endif 2292 } 2293 else if (size_tflag) { 2294 char *f = PY_FORMAT_SIZE_T; 2295 while (*f) 2296 *fmt++ = *f++; 2297 } 2298 *fmt++ = c; 2299 *fmt = '\0'; 2300} 2301 2302/* helper for PyUnicode_FromFormatV() */ 2303 2304static const char* 2305parse_format_flags(const char *f, 2306 int *p_width, int *p_precision, 2307 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 2308{ 2309 int width, precision, longflag, longlongflag, size_tflag; 2310 2311 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2312 f++; 2313 width = 0; 2314 while (Py_ISDIGIT((unsigned)*f)) 2315 width = (width*10) + *f++ - '0'; 2316 precision = 0; 2317 if (*f == '.') { 2318 f++; 2319 while (Py_ISDIGIT((unsigned)*f)) 2320 precision = (precision*10) + *f++ - '0'; 2321 if (*f == '%') { 2322 /* "%.3%s" => f points to "3" */ 2323 f--; 2324 } 2325 } 2326 if (*f == '\0') { 2327 /* bogus format "%.1" => go backward, f points to "1" */ 2328 f--; 2329 } 2330 if (p_width != NULL) 2331 *p_width = width; 2332 if (p_precision != NULL) 2333 *p_precision = precision; 2334 2335 /* Handle %ld, %lu, %lld and %llu. */ 2336 longflag = 0; 2337 longlongflag = 0; 2338 size_tflag = 0; 2339 2340 if (*f == 'l') { 2341 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2342 longflag = 1; 2343 ++f; 2344 } 2345#ifdef HAVE_LONG_LONG 2346 else if (f[1] == 'l' && 2347 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2348 longlongflag = 1; 2349 f += 2; 2350 } 2351#endif 2352 } 2353 /* handle the size_t flag. */ 2354 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2355 size_tflag = 1; 2356 ++f; 2357 } 2358 if (p_longflag != NULL) 2359 *p_longflag = longflag; 2360 if (p_longlongflag != NULL) 2361 *p_longlongflag = longlongflag; 2362 if (p_size_tflag != NULL) 2363 *p_size_tflag = size_tflag; 2364 return f; 2365} 2366 2367/* maximum number of characters required for output of %ld. 21 characters 2368 allows for 64-bit integers (in decimal) and an optional sign. */ 2369#define MAX_LONG_CHARS 21 2370/* maximum number of characters required for output of %lld. 2371 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2372 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2373#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2374 2375PyObject * 2376PyUnicode_FromFormatV(const char *format, va_list vargs) 2377{ 2378 va_list count; 2379 Py_ssize_t callcount = 0; 2380 PyObject **callresults = NULL; 2381 PyObject **callresult = NULL; 2382 Py_ssize_t n = 0; 2383 int width = 0; 2384 int precision = 0; 2385 int zeropad; 2386 const char* f; 2387 PyObject *string; 2388 /* used by sprintf */ 2389 char fmt[61]; /* should be enough for %0width.precisionlld */ 2390 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 2391 Py_UCS4 argmaxchar; 2392 Py_ssize_t numbersize = 0; 2393 char *numberresults = NULL; 2394 char *numberresult = NULL; 2395 Py_ssize_t i; 2396 int kind; 2397 void *data; 2398 2399 Py_VA_COPY(count, vargs); 2400 /* step 1: count the number of %S/%R/%A/%s format specifications 2401 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 2402 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 2403 * result in an array) 2404 * also estimate a upper bound for all the number formats in the string, 2405 * numbers will be formatted in step 3 and be kept in a '\0'-separated 2406 * buffer before putting everything together. */ 2407 for (f = format; *f; f++) { 2408 if (*f == '%') { 2409 int longlongflag; 2410 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 2411 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 2412 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 2413 ++callcount; 2414 2415 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 2416#ifdef HAVE_LONG_LONG 2417 if (longlongflag) { 2418 if (width < MAX_LONG_LONG_CHARS) 2419 width = MAX_LONG_LONG_CHARS; 2420 } 2421 else 2422#endif 2423 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 2424 including sign. Decimal takes the most space. This 2425 isn't enough for octal. If a width is specified we 2426 need more (which we allocate later). */ 2427 if (width < MAX_LONG_CHARS) 2428 width = MAX_LONG_CHARS; 2429 2430 /* account for the size + '\0' to separate numbers 2431 inside of the numberresults buffer */ 2432 numbersize += (width + 1); 2433 } 2434 } 2435 else if ((unsigned char)*f > 127) { 2436 PyErr_Format(PyExc_ValueError, 2437 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2438 "string, got a non-ASCII byte: 0x%02x", 2439 (unsigned char)*f); 2440 return NULL; 2441 } 2442 } 2443 /* step 2: allocate memory for the results of 2444 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 2445 if (callcount) { 2446 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 2447 if (!callresults) { 2448 PyErr_NoMemory(); 2449 return NULL; 2450 } 2451 callresult = callresults; 2452 } 2453 /* step 2.5: allocate memory for the results of formating numbers */ 2454 if (numbersize) { 2455 numberresults = PyObject_Malloc(numbersize); 2456 if (!numberresults) { 2457 PyErr_NoMemory(); 2458 goto fail; 2459 } 2460 numberresult = numberresults; 2461 } 2462 2463 /* step 3: format numbers and figure out how large a buffer we need */ 2464 for (f = format; *f; f++) { 2465 if (*f == '%') { 2466 const char* p; 2467 int longflag; 2468 int longlongflag; 2469 int size_tflag; 2470 int numprinted; 2471 2472 p = f; 2473 zeropad = (f[1] == '0'); 2474 f = parse_format_flags(f, &width, &precision, 2475 &longflag, &longlongflag, &size_tflag); 2476 switch (*f) { 2477 case 'c': 2478 { 2479 Py_UCS4 ordinal = va_arg(count, int); 2480 maxchar = MAX_MAXCHAR(maxchar, ordinal); 2481 n++; 2482 break; 2483 } 2484 case '%': 2485 n++; 2486 break; 2487 case 'i': 2488 case 'd': 2489 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2490 width, precision, *f); 2491 if (longflag) 2492 numprinted = sprintf(numberresult, fmt, 2493 va_arg(count, long)); 2494#ifdef HAVE_LONG_LONG 2495 else if (longlongflag) 2496 numprinted = sprintf(numberresult, fmt, 2497 va_arg(count, PY_LONG_LONG)); 2498#endif 2499 else if (size_tflag) 2500 numprinted = sprintf(numberresult, fmt, 2501 va_arg(count, Py_ssize_t)); 2502 else 2503 numprinted = sprintf(numberresult, fmt, 2504 va_arg(count, int)); 2505 n += numprinted; 2506 /* advance by +1 to skip over the '\0' */ 2507 numberresult += (numprinted + 1); 2508 assert(*(numberresult - 1) == '\0'); 2509 assert(*(numberresult - 2) != '\0'); 2510 assert(numprinted >= 0); 2511 assert(numberresult <= numberresults + numbersize); 2512 break; 2513 case 'u': 2514 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2515 width, precision, 'u'); 2516 if (longflag) 2517 numprinted = sprintf(numberresult, fmt, 2518 va_arg(count, unsigned long)); 2519#ifdef HAVE_LONG_LONG 2520 else if (longlongflag) 2521 numprinted = sprintf(numberresult, fmt, 2522 va_arg(count, unsigned PY_LONG_LONG)); 2523#endif 2524 else if (size_tflag) 2525 numprinted = sprintf(numberresult, fmt, 2526 va_arg(count, size_t)); 2527 else 2528 numprinted = sprintf(numberresult, fmt, 2529 va_arg(count, unsigned int)); 2530 n += numprinted; 2531 numberresult += (numprinted + 1); 2532 assert(*(numberresult - 1) == '\0'); 2533 assert(*(numberresult - 2) != '\0'); 2534 assert(numprinted >= 0); 2535 assert(numberresult <= numberresults + numbersize); 2536 break; 2537 case 'x': 2538 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2539 numprinted = sprintf(numberresult, fmt, va_arg(count, int)); 2540 n += numprinted; 2541 numberresult += (numprinted + 1); 2542 assert(*(numberresult - 1) == '\0'); 2543 assert(*(numberresult - 2) != '\0'); 2544 assert(numprinted >= 0); 2545 assert(numberresult <= numberresults + numbersize); 2546 break; 2547 case 'p': 2548 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2549 /* %p is ill-defined: ensure leading 0x. */ 2550 if (numberresult[1] == 'X') 2551 numberresult[1] = 'x'; 2552 else if (numberresult[1] != 'x') { 2553 memmove(numberresult + 2, numberresult, 2554 strlen(numberresult) + 1); 2555 numberresult[0] = '0'; 2556 numberresult[1] = 'x'; 2557 numprinted += 2; 2558 } 2559 n += numprinted; 2560 numberresult += (numprinted + 1); 2561 assert(*(numberresult - 1) == '\0'); 2562 assert(*(numberresult - 2) != '\0'); 2563 assert(numprinted >= 0); 2564 assert(numberresult <= numberresults + numbersize); 2565 break; 2566 case 's': 2567 { 2568 /* UTF-8 */ 2569 const char *s = va_arg(count, const char*); 2570 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); 2571 if (!str) 2572 goto fail; 2573 /* since PyUnicode_DecodeUTF8 returns already flexible 2574 unicode objects, there is no need to call ready on them */ 2575 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2576 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2577 n += PyUnicode_GET_LENGTH(str); 2578 /* Remember the str and switch to the next slot */ 2579 *callresult++ = str; 2580 break; 2581 } 2582 case 'U': 2583 { 2584 PyObject *obj = va_arg(count, PyObject *); 2585 assert(obj && _PyUnicode_CHECK(obj)); 2586 if (PyUnicode_READY(obj) == -1) 2587 goto fail; 2588 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2589 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2590 n += PyUnicode_GET_LENGTH(obj); 2591 break; 2592 } 2593 case 'V': 2594 { 2595 PyObject *obj = va_arg(count, PyObject *); 2596 const char *str = va_arg(count, const char *); 2597 PyObject *str_obj; 2598 assert(obj || str); 2599 assert(!obj || _PyUnicode_CHECK(obj)); 2600 if (obj) { 2601 if (PyUnicode_READY(obj) == -1) 2602 goto fail; 2603 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2604 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2605 n += PyUnicode_GET_LENGTH(obj); 2606 *callresult++ = NULL; 2607 } 2608 else { 2609 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); 2610 if (!str_obj) 2611 goto fail; 2612 if (PyUnicode_READY(str_obj) == -1) { 2613 Py_DECREF(str_obj); 2614 goto fail; 2615 } 2616 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2617 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2618 n += PyUnicode_GET_LENGTH(str_obj); 2619 *callresult++ = str_obj; 2620 } 2621 break; 2622 } 2623 case 'S': 2624 { 2625 PyObject *obj = va_arg(count, PyObject *); 2626 PyObject *str; 2627 assert(obj); 2628 str = PyObject_Str(obj); 2629 if (!str) 2630 goto fail; 2631 if (PyUnicode_READY(str) == -1) { 2632 Py_DECREF(str); 2633 goto fail; 2634 } 2635 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2636 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2637 n += PyUnicode_GET_LENGTH(str); 2638 /* Remember the str and switch to the next slot */ 2639 *callresult++ = str; 2640 break; 2641 } 2642 case 'R': 2643 { 2644 PyObject *obj = va_arg(count, PyObject *); 2645 PyObject *repr; 2646 assert(obj); 2647 repr = PyObject_Repr(obj); 2648 if (!repr) 2649 goto fail; 2650 if (PyUnicode_READY(repr) == -1) { 2651 Py_DECREF(repr); 2652 goto fail; 2653 } 2654 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2655 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2656 n += PyUnicode_GET_LENGTH(repr); 2657 /* Remember the repr and switch to the next slot */ 2658 *callresult++ = repr; 2659 break; 2660 } 2661 case 'A': 2662 { 2663 PyObject *obj = va_arg(count, PyObject *); 2664 PyObject *ascii; 2665 assert(obj); 2666 ascii = PyObject_ASCII(obj); 2667 if (!ascii) 2668 goto fail; 2669 if (PyUnicode_READY(ascii) == -1) { 2670 Py_DECREF(ascii); 2671 goto fail; 2672 } 2673 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2674 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2675 n += PyUnicode_GET_LENGTH(ascii); 2676 /* Remember the repr and switch to the next slot */ 2677 *callresult++ = ascii; 2678 break; 2679 } 2680 default: 2681 /* if we stumble upon an unknown 2682 formatting code, copy the rest of 2683 the format string to the output 2684 string. (we cannot just skip the 2685 code, since there's no way to know 2686 what's in the argument list) */ 2687 n += strlen(p); 2688 goto expand; 2689 } 2690 } else 2691 n++; 2692 } 2693 expand: 2694 /* step 4: fill the buffer */ 2695 /* Since we've analyzed how much space we need, 2696 we don't have to resize the string. 2697 There can be no errors beyond this point. */ 2698 string = PyUnicode_New(n, maxchar); 2699 if (!string) 2700 goto fail; 2701 kind = PyUnicode_KIND(string); 2702 data = PyUnicode_DATA(string); 2703 callresult = callresults; 2704 numberresult = numberresults; 2705 2706 for (i = 0, f = format; *f; f++) { 2707 if (*f == '%') { 2708 const char* p; 2709 2710 p = f; 2711 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2712 /* checking for == because the last argument could be a empty 2713 string, which causes i to point to end, the assert at the end of 2714 the loop */ 2715 assert(i <= PyUnicode_GET_LENGTH(string)); 2716 2717 switch (*f) { 2718 case 'c': 2719 { 2720 const int ordinal = va_arg(vargs, int); 2721 PyUnicode_WRITE(kind, data, i++, ordinal); 2722 break; 2723 } 2724 case 'i': 2725 case 'd': 2726 case 'u': 2727 case 'x': 2728 case 'p': 2729 { 2730 Py_ssize_t len; 2731 /* unused, since we already have the result */ 2732 if (*f == 'p') 2733 (void) va_arg(vargs, void *); 2734 else 2735 (void) va_arg(vargs, int); 2736 /* extract the result from numberresults and append. */ 2737 len = strlen(numberresult); 2738 unicode_write_cstr(string, i, numberresult, len); 2739 /* skip over the separating '\0' */ 2740 i += len; 2741 numberresult += len; 2742 assert(*numberresult == '\0'); 2743 numberresult++; 2744 assert(numberresult <= numberresults + numbersize); 2745 break; 2746 } 2747 case 's': 2748 { 2749 /* unused, since we already have the result */ 2750 Py_ssize_t size; 2751 (void) va_arg(vargs, char *); 2752 size = PyUnicode_GET_LENGTH(*callresult); 2753 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2754 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); 2755 i += size; 2756 /* We're done with the unicode()/repr() => forget it */ 2757 Py_DECREF(*callresult); 2758 /* switch to next unicode()/repr() result */ 2759 ++callresult; 2760 break; 2761 } 2762 case 'U': 2763 { 2764 PyObject *obj = va_arg(vargs, PyObject *); 2765 Py_ssize_t size; 2766 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2767 size = PyUnicode_GET_LENGTH(obj); 2768 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size); 2769 i += size; 2770 break; 2771 } 2772 case 'V': 2773 { 2774 Py_ssize_t size; 2775 PyObject *obj = va_arg(vargs, PyObject *); 2776 va_arg(vargs, const char *); 2777 if (obj) { 2778 size = PyUnicode_GET_LENGTH(obj); 2779 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2780 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size); 2781 i += size; 2782 } else { 2783 size = PyUnicode_GET_LENGTH(*callresult); 2784 assert(PyUnicode_KIND(*callresult) <= 2785 PyUnicode_KIND(string)); 2786 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); 2787 i += size; 2788 Py_DECREF(*callresult); 2789 } 2790 ++callresult; 2791 break; 2792 } 2793 case 'S': 2794 case 'R': 2795 case 'A': 2796 { 2797 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); 2798 /* unused, since we already have the result */ 2799 (void) va_arg(vargs, PyObject *); 2800 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2801 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); 2802 i += size; 2803 /* We're done with the unicode()/repr() => forget it */ 2804 Py_DECREF(*callresult); 2805 /* switch to next unicode()/repr() result */ 2806 ++callresult; 2807 break; 2808 } 2809 case '%': 2810 PyUnicode_WRITE(kind, data, i++, '%'); 2811 break; 2812 default: 2813 { 2814 Py_ssize_t len = strlen(p); 2815 unicode_write_cstr(string, i, p, len); 2816 i += len; 2817 assert(i == PyUnicode_GET_LENGTH(string)); 2818 goto end; 2819 } 2820 } 2821 } 2822 else { 2823 assert(i < PyUnicode_GET_LENGTH(string)); 2824 PyUnicode_WRITE(kind, data, i++, *f); 2825 } 2826 } 2827 assert(i == PyUnicode_GET_LENGTH(string)); 2828 2829 end: 2830 if (callresults) 2831 PyObject_Free(callresults); 2832 if (numberresults) 2833 PyObject_Free(numberresults); 2834 return unicode_result(string); 2835 fail: 2836 if (callresults) { 2837 PyObject **callresult2 = callresults; 2838 while (callresult2 < callresult) { 2839 Py_XDECREF(*callresult2); 2840 ++callresult2; 2841 } 2842 PyObject_Free(callresults); 2843 } 2844 if (numberresults) 2845 PyObject_Free(numberresults); 2846 return NULL; 2847} 2848 2849PyObject * 2850PyUnicode_FromFormat(const char *format, ...) 2851{ 2852 PyObject* ret; 2853 va_list vargs; 2854 2855#ifdef HAVE_STDARG_PROTOTYPES 2856 va_start(vargs, format); 2857#else 2858 va_start(vargs); 2859#endif 2860 ret = PyUnicode_FromFormatV(format, vargs); 2861 va_end(vargs); 2862 return ret; 2863} 2864 2865#ifdef HAVE_WCHAR_H 2866 2867/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2868 convert a Unicode object to a wide character string. 2869 2870 - If w is NULL: return the number of wide characters (including the null 2871 character) required to convert the unicode object. Ignore size argument. 2872 2873 - Otherwise: return the number of wide characters (excluding the null 2874 character) written into w. Write at most size wide characters (including 2875 the null character). */ 2876static Py_ssize_t 2877unicode_aswidechar(PyObject *unicode, 2878 wchar_t *w, 2879 Py_ssize_t size) 2880{ 2881 Py_ssize_t res; 2882 const wchar_t *wstr; 2883 2884 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2885 if (wstr == NULL) 2886 return -1; 2887 2888 if (w != NULL) { 2889 if (size > res) 2890 size = res + 1; 2891 else 2892 res = size; 2893 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2894 return res; 2895 } 2896 else 2897 return res + 1; 2898} 2899 2900Py_ssize_t 2901PyUnicode_AsWideChar(PyObject *unicode, 2902 wchar_t *w, 2903 Py_ssize_t size) 2904{ 2905 if (unicode == NULL) { 2906 PyErr_BadInternalCall(); 2907 return -1; 2908 } 2909 return unicode_aswidechar(unicode, w, size); 2910} 2911 2912wchar_t* 2913PyUnicode_AsWideCharString(PyObject *unicode, 2914 Py_ssize_t *size) 2915{ 2916 wchar_t* buffer; 2917 Py_ssize_t buflen; 2918 2919 if (unicode == NULL) { 2920 PyErr_BadInternalCall(); 2921 return NULL; 2922 } 2923 2924 buflen = unicode_aswidechar(unicode, NULL, 0); 2925 if (buflen == -1) 2926 return NULL; 2927 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2928 PyErr_NoMemory(); 2929 return NULL; 2930 } 2931 2932 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2933 if (buffer == NULL) { 2934 PyErr_NoMemory(); 2935 return NULL; 2936 } 2937 buflen = unicode_aswidechar(unicode, buffer, buflen); 2938 if (buflen == -1) { 2939 PyMem_FREE(buffer); 2940 return NULL; 2941 } 2942 if (size != NULL) 2943 *size = buflen; 2944 return buffer; 2945} 2946 2947#endif /* HAVE_WCHAR_H */ 2948 2949PyObject * 2950PyUnicode_FromOrdinal(int ordinal) 2951{ 2952 PyObject *v; 2953 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2954 PyErr_SetString(PyExc_ValueError, 2955 "chr() arg not in range(0x110000)"); 2956 return NULL; 2957 } 2958 2959 if (ordinal < 256) 2960 return get_latin1_char(ordinal); 2961 2962 v = PyUnicode_New(1, ordinal); 2963 if (v == NULL) 2964 return NULL; 2965 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2966 assert(_PyUnicode_CheckConsistency(v, 1)); 2967 return v; 2968} 2969 2970PyObject * 2971PyUnicode_FromObject(register PyObject *obj) 2972{ 2973 /* XXX Perhaps we should make this API an alias of 2974 PyObject_Str() instead ?! */ 2975 if (PyUnicode_CheckExact(obj)) { 2976 if (PyUnicode_READY(obj) == -1) 2977 return NULL; 2978 Py_INCREF(obj); 2979 return obj; 2980 } 2981 if (PyUnicode_Check(obj)) { 2982 /* For a Unicode subtype that's not a Unicode object, 2983 return a true Unicode object with the same data. */ 2984 return _PyUnicode_Copy(obj); 2985 } 2986 PyErr_Format(PyExc_TypeError, 2987 "Can't convert '%.100s' object to str implicitly", 2988 Py_TYPE(obj)->tp_name); 2989 return NULL; 2990} 2991 2992PyObject * 2993PyUnicode_FromEncodedObject(register PyObject *obj, 2994 const char *encoding, 2995 const char *errors) 2996{ 2997 Py_buffer buffer; 2998 PyObject *v; 2999 3000 if (obj == NULL) { 3001 PyErr_BadInternalCall(); 3002 return NULL; 3003 } 3004 3005 /* Decoding bytes objects is the most common case and should be fast */ 3006 if (PyBytes_Check(obj)) { 3007 if (PyBytes_GET_SIZE(obj) == 0) { 3008 Py_INCREF(unicode_empty); 3009 v = unicode_empty; 3010 } 3011 else { 3012 v = PyUnicode_Decode( 3013 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 3014 encoding, errors); 3015 } 3016 return v; 3017 } 3018 3019 if (PyUnicode_Check(obj)) { 3020 PyErr_SetString(PyExc_TypeError, 3021 "decoding str is not supported"); 3022 return NULL; 3023 } 3024 3025 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 3026 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 3027 PyErr_Format(PyExc_TypeError, 3028 "coercing to str: need bytes, bytearray " 3029 "or buffer-like object, %.80s found", 3030 Py_TYPE(obj)->tp_name); 3031 return NULL; 3032 } 3033 3034 if (buffer.len == 0) { 3035 Py_INCREF(unicode_empty); 3036 v = unicode_empty; 3037 } 3038 else 3039 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 3040 3041 PyBuffer_Release(&buffer); 3042 return v; 3043} 3044 3045/* Convert encoding to lower case and replace '_' with '-' in order to 3046 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 3047 1 on success. */ 3048static int 3049normalize_encoding(const char *encoding, 3050 char *lower, 3051 size_t lower_len) 3052{ 3053 const char *e; 3054 char *l; 3055 char *l_end; 3056 3057 if (encoding == NULL) { 3058 strcpy(lower, "utf-8"); 3059 return 1; 3060 } 3061 e = encoding; 3062 l = lower; 3063 l_end = &lower[lower_len - 1]; 3064 while (*e) { 3065 if (l == l_end) 3066 return 0; 3067 if (Py_ISUPPER(*e)) { 3068 *l++ = Py_TOLOWER(*e++); 3069 } 3070 else if (*e == '_') { 3071 *l++ = '-'; 3072 e++; 3073 } 3074 else { 3075 *l++ = *e++; 3076 } 3077 } 3078 *l = '\0'; 3079 return 1; 3080} 3081 3082PyObject * 3083PyUnicode_Decode(const char *s, 3084 Py_ssize_t size, 3085 const char *encoding, 3086 const char *errors) 3087{ 3088 PyObject *buffer = NULL, *unicode; 3089 Py_buffer info; 3090 char lower[11]; /* Enough for any encoding shortcut */ 3091 3092 /* Shortcuts for common default encodings */ 3093 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3094 if ((strcmp(lower, "utf-8") == 0) || 3095 (strcmp(lower, "utf8") == 0)) 3096 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3097 else if ((strcmp(lower, "latin-1") == 0) || 3098 (strcmp(lower, "latin1") == 0) || 3099 (strcmp(lower, "iso-8859-1") == 0)) 3100 return PyUnicode_DecodeLatin1(s, size, errors); 3101#ifdef HAVE_MBCS 3102 else if (strcmp(lower, "mbcs") == 0) 3103 return PyUnicode_DecodeMBCS(s, size, errors); 3104#endif 3105 else if (strcmp(lower, "ascii") == 0) 3106 return PyUnicode_DecodeASCII(s, size, errors); 3107 else if (strcmp(lower, "utf-16") == 0) 3108 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3109 else if (strcmp(lower, "utf-32") == 0) 3110 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3111 } 3112 3113 /* Decode via the codec registry */ 3114 buffer = NULL; 3115 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3116 goto onError; 3117 buffer = PyMemoryView_FromBuffer(&info); 3118 if (buffer == NULL) 3119 goto onError; 3120 unicode = PyCodec_Decode(buffer, encoding, errors); 3121 if (unicode == NULL) 3122 goto onError; 3123 if (!PyUnicode_Check(unicode)) { 3124 PyErr_Format(PyExc_TypeError, 3125 "decoder did not return a str object (type=%.400s)", 3126 Py_TYPE(unicode)->tp_name); 3127 Py_DECREF(unicode); 3128 goto onError; 3129 } 3130 Py_DECREF(buffer); 3131 return unicode_result(unicode); 3132 3133 onError: 3134 Py_XDECREF(buffer); 3135 return NULL; 3136} 3137 3138PyObject * 3139PyUnicode_AsDecodedObject(PyObject *unicode, 3140 const char *encoding, 3141 const char *errors) 3142{ 3143 PyObject *v; 3144 3145 if (!PyUnicode_Check(unicode)) { 3146 PyErr_BadArgument(); 3147 goto onError; 3148 } 3149 3150 if (encoding == NULL) 3151 encoding = PyUnicode_GetDefaultEncoding(); 3152 3153 /* Decode via the codec registry */ 3154 v = PyCodec_Decode(unicode, encoding, errors); 3155 if (v == NULL) 3156 goto onError; 3157 return unicode_result(v); 3158 3159 onError: 3160 return NULL; 3161} 3162 3163PyObject * 3164PyUnicode_AsDecodedUnicode(PyObject *unicode, 3165 const char *encoding, 3166 const char *errors) 3167{ 3168 PyObject *v; 3169 3170 if (!PyUnicode_Check(unicode)) { 3171 PyErr_BadArgument(); 3172 goto onError; 3173 } 3174 3175 if (encoding == NULL) 3176 encoding = PyUnicode_GetDefaultEncoding(); 3177 3178 /* Decode via the codec registry */ 3179 v = PyCodec_Decode(unicode, encoding, errors); 3180 if (v == NULL) 3181 goto onError; 3182 if (!PyUnicode_Check(v)) { 3183 PyErr_Format(PyExc_TypeError, 3184 "decoder did not return a str object (type=%.400s)", 3185 Py_TYPE(v)->tp_name); 3186 Py_DECREF(v); 3187 goto onError; 3188 } 3189 return unicode_result(v); 3190 3191 onError: 3192 return NULL; 3193} 3194 3195PyObject * 3196PyUnicode_Encode(const Py_UNICODE *s, 3197 Py_ssize_t size, 3198 const char *encoding, 3199 const char *errors) 3200{ 3201 PyObject *v, *unicode; 3202 3203 unicode = PyUnicode_FromUnicode(s, size); 3204 if (unicode == NULL) 3205 return NULL; 3206 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3207 Py_DECREF(unicode); 3208 return v; 3209} 3210 3211PyObject * 3212PyUnicode_AsEncodedObject(PyObject *unicode, 3213 const char *encoding, 3214 const char *errors) 3215{ 3216 PyObject *v; 3217 3218 if (!PyUnicode_Check(unicode)) { 3219 PyErr_BadArgument(); 3220 goto onError; 3221 } 3222 3223 if (encoding == NULL) 3224 encoding = PyUnicode_GetDefaultEncoding(); 3225 3226 /* Encode via the codec registry */ 3227 v = PyCodec_Encode(unicode, encoding, errors); 3228 if (v == NULL) 3229 goto onError; 3230 return v; 3231 3232 onError: 3233 return NULL; 3234} 3235 3236static size_t 3237wcstombs_errorpos(const wchar_t *wstr) 3238{ 3239 size_t len; 3240#if SIZEOF_WCHAR_T == 2 3241 wchar_t buf[3]; 3242#else 3243 wchar_t buf[2]; 3244#endif 3245 char outbuf[MB_LEN_MAX]; 3246 const wchar_t *start, *previous; 3247 3248#if SIZEOF_WCHAR_T == 2 3249 buf[2] = 0; 3250#else 3251 buf[1] = 0; 3252#endif 3253 start = wstr; 3254 while (*wstr != L'\0') 3255 { 3256 previous = wstr; 3257#if SIZEOF_WCHAR_T == 2 3258 if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) 3259 && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) 3260 { 3261 buf[0] = wstr[0]; 3262 buf[1] = wstr[1]; 3263 wstr += 2; 3264 } 3265 else { 3266 buf[0] = *wstr; 3267 buf[1] = 0; 3268 wstr++; 3269 } 3270#else 3271 buf[0] = *wstr; 3272 wstr++; 3273#endif 3274 len = wcstombs(outbuf, buf, sizeof(outbuf)); 3275 if (len == (size_t)-1) 3276 return previous - start; 3277 } 3278 3279 /* failed to find the unencodable character */ 3280 return 0; 3281} 3282 3283static int 3284locale_error_handler(const char *errors, int *surrogateescape) 3285{ 3286 if (errors == NULL) { 3287 *surrogateescape = 0; 3288 return 0; 3289 } 3290 3291 if (strcmp(errors, "strict") == 0) { 3292 *surrogateescape = 0; 3293 return 0; 3294 } 3295 if (strcmp(errors, "surrogateescape") == 0) { 3296 *surrogateescape = 1; 3297 return 0; 3298 } 3299 PyErr_Format(PyExc_ValueError, 3300 "only 'strict' and 'surrogateescape' error handlers " 3301 "are supported, not '%s'", 3302 errors); 3303 return -1; 3304} 3305 3306PyObject * 3307PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3308{ 3309 Py_ssize_t wlen, wlen2; 3310 wchar_t *wstr; 3311 PyObject *bytes = NULL; 3312 char *errmsg; 3313 PyObject *reason; 3314 PyObject *exc; 3315 size_t error_pos; 3316 int surrogateescape; 3317 3318 if (locale_error_handler(errors, &surrogateescape) < 0) 3319 return NULL; 3320 3321 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3322 if (wstr == NULL) 3323 return NULL; 3324 3325 wlen2 = wcslen(wstr); 3326 if (wlen2 != wlen) { 3327 PyMem_Free(wstr); 3328 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3329 return NULL; 3330 } 3331 3332 if (surrogateescape) { 3333 /* locale encoding with surrogateescape */ 3334 char *str; 3335 3336 str = _Py_wchar2char(wstr, &error_pos); 3337 if (str == NULL) { 3338 if (error_pos == (size_t)-1) { 3339 PyErr_NoMemory(); 3340 PyMem_Free(wstr); 3341 return NULL; 3342 } 3343 else { 3344 goto encode_error; 3345 } 3346 } 3347 PyMem_Free(wstr); 3348 3349 bytes = PyBytes_FromString(str); 3350 PyMem_Free(str); 3351 } 3352 else { 3353 size_t len, len2; 3354 3355 len = wcstombs(NULL, wstr, 0); 3356 if (len == (size_t)-1) { 3357 error_pos = (size_t)-1; 3358 goto encode_error; 3359 } 3360 3361 bytes = PyBytes_FromStringAndSize(NULL, len); 3362 if (bytes == NULL) { 3363 PyMem_Free(wstr); 3364 return NULL; 3365 } 3366 3367 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3368 if (len2 == (size_t)-1 || len2 > len) { 3369 error_pos = (size_t)-1; 3370 goto encode_error; 3371 } 3372 PyMem_Free(wstr); 3373 } 3374 return bytes; 3375 3376encode_error: 3377 errmsg = strerror(errno); 3378 assert(errmsg != NULL); 3379 3380 if (error_pos == (size_t)-1) 3381 error_pos = wcstombs_errorpos(wstr); 3382 3383 PyMem_Free(wstr); 3384 Py_XDECREF(bytes); 3385 3386 if (errmsg != NULL) { 3387 size_t errlen; 3388 wstr = _Py_char2wchar(errmsg, &errlen); 3389 if (wstr != NULL) { 3390 reason = PyUnicode_FromWideChar(wstr, errlen); 3391 PyMem_Free(wstr); 3392 } else 3393 errmsg = NULL; 3394 } 3395 if (errmsg == NULL) 3396 reason = PyUnicode_FromString( 3397 "wcstombs() encountered an unencodable " 3398 "wide character"); 3399 if (reason == NULL) 3400 return NULL; 3401 3402 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3403 "locale", unicode, 3404 (Py_ssize_t)error_pos, 3405 (Py_ssize_t)(error_pos+1), 3406 reason); 3407 Py_DECREF(reason); 3408 if (exc != NULL) { 3409 PyCodec_StrictErrors(exc); 3410 Py_XDECREF(exc); 3411 } 3412 return NULL; 3413} 3414 3415PyObject * 3416PyUnicode_EncodeFSDefault(PyObject *unicode) 3417{ 3418#ifdef HAVE_MBCS 3419 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 3420#elif defined(__APPLE__) 3421 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3422#else 3423 PyInterpreterState *interp = PyThreadState_GET()->interp; 3424 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3425 cannot use it to encode and decode filenames before it is loaded. Load 3426 the Python codec requires to encode at least its own filename. Use the C 3427 version of the locale codec until the codec registry is initialized and 3428 the Python codec is loaded. 3429 3430 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3431 cannot only rely on it: check also interp->fscodec_initialized for 3432 subinterpreters. */ 3433 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3434 return PyUnicode_AsEncodedString(unicode, 3435 Py_FileSystemDefaultEncoding, 3436 "surrogateescape"); 3437 } 3438 else { 3439 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3440 } 3441#endif 3442} 3443 3444PyObject * 3445PyUnicode_AsEncodedString(PyObject *unicode, 3446 const char *encoding, 3447 const char *errors) 3448{ 3449 PyObject *v; 3450 char lower[11]; /* Enough for any encoding shortcut */ 3451 3452 if (!PyUnicode_Check(unicode)) { 3453 PyErr_BadArgument(); 3454 return NULL; 3455 } 3456 3457 /* Shortcuts for common default encodings */ 3458 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3459 if ((strcmp(lower, "utf-8") == 0) || 3460 (strcmp(lower, "utf8") == 0)) 3461 { 3462 if (errors == NULL || strcmp(errors, "strict") == 0) 3463 return _PyUnicode_AsUTF8String(unicode, NULL); 3464 else 3465 return _PyUnicode_AsUTF8String(unicode, errors); 3466 } 3467 else if ((strcmp(lower, "latin-1") == 0) || 3468 (strcmp(lower, "latin1") == 0) || 3469 (strcmp(lower, "iso-8859-1") == 0)) 3470 return _PyUnicode_AsLatin1String(unicode, errors); 3471#ifdef HAVE_MBCS 3472 else if (strcmp(lower, "mbcs") == 0) 3473 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3474#endif 3475 else if (strcmp(lower, "ascii") == 0) 3476 return _PyUnicode_AsASCIIString(unicode, errors); 3477 } 3478 3479 /* Encode via the codec registry */ 3480 v = PyCodec_Encode(unicode, encoding, errors); 3481 if (v == NULL) 3482 return NULL; 3483 3484 /* The normal path */ 3485 if (PyBytes_Check(v)) 3486 return v; 3487 3488 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3489 if (PyByteArray_Check(v)) { 3490 int error; 3491 PyObject *b; 3492 3493 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3494 "encoder %s returned bytearray instead of bytes", 3495 encoding); 3496 if (error) { 3497 Py_DECREF(v); 3498 return NULL; 3499 } 3500 3501 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3502 Py_DECREF(v); 3503 return b; 3504 } 3505 3506 PyErr_Format(PyExc_TypeError, 3507 "encoder did not return a bytes object (type=%.400s)", 3508 Py_TYPE(v)->tp_name); 3509 Py_DECREF(v); 3510 return NULL; 3511} 3512 3513PyObject * 3514PyUnicode_AsEncodedUnicode(PyObject *unicode, 3515 const char *encoding, 3516 const char *errors) 3517{ 3518 PyObject *v; 3519 3520 if (!PyUnicode_Check(unicode)) { 3521 PyErr_BadArgument(); 3522 goto onError; 3523 } 3524 3525 if (encoding == NULL) 3526 encoding = PyUnicode_GetDefaultEncoding(); 3527 3528 /* Encode via the codec registry */ 3529 v = PyCodec_Encode(unicode, encoding, errors); 3530 if (v == NULL) 3531 goto onError; 3532 if (!PyUnicode_Check(v)) { 3533 PyErr_Format(PyExc_TypeError, 3534 "encoder did not return an str object (type=%.400s)", 3535 Py_TYPE(v)->tp_name); 3536 Py_DECREF(v); 3537 goto onError; 3538 } 3539 return v; 3540 3541 onError: 3542 return NULL; 3543} 3544 3545static size_t 3546mbstowcs_errorpos(const char *str, size_t len) 3547{ 3548#ifdef HAVE_MBRTOWC 3549 const char *start = str; 3550 mbstate_t mbs; 3551 size_t converted; 3552 wchar_t ch; 3553 3554 memset(&mbs, 0, sizeof mbs); 3555 while (len) 3556 { 3557 converted = mbrtowc(&ch, (char*)str, len, &mbs); 3558 if (converted == 0) 3559 /* Reached end of string */ 3560 break; 3561 if (converted == (size_t)-1 || converted == (size_t)-2) { 3562 /* Conversion error or incomplete character */ 3563 return str - start; 3564 } 3565 else { 3566 str += converted; 3567 len -= converted; 3568 } 3569 } 3570 /* failed to find the undecodable byte sequence */ 3571 return 0; 3572#endif 3573 return 0; 3574} 3575 3576PyObject* 3577PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3578 const char *errors) 3579{ 3580 wchar_t smallbuf[256]; 3581 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3582 wchar_t *wstr; 3583 size_t wlen, wlen2; 3584 PyObject *unicode; 3585 int surrogateescape; 3586 size_t error_pos; 3587 char *errmsg; 3588 PyObject *reason, *exc; 3589 3590 if (locale_error_handler(errors, &surrogateescape) < 0) 3591 return NULL; 3592 3593 if (str[len] != '\0' || len != strlen(str)) { 3594 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3595 return NULL; 3596 } 3597 3598 if (surrogateescape) 3599 { 3600 wstr = _Py_char2wchar(str, &wlen); 3601 if (wstr == NULL) { 3602 if (wlen == (size_t)-1) 3603 PyErr_NoMemory(); 3604 else 3605 PyErr_SetFromErrno(PyExc_OSError); 3606 return NULL; 3607 } 3608 3609 unicode = PyUnicode_FromWideChar(wstr, wlen); 3610 PyMem_Free(wstr); 3611 } 3612 else { 3613#ifndef HAVE_BROKEN_MBSTOWCS 3614 wlen = mbstowcs(NULL, str, 0); 3615#else 3616 wlen = len; 3617#endif 3618 if (wlen == (size_t)-1) 3619 goto decode_error; 3620 if (wlen+1 <= smallbuf_len) { 3621 wstr = smallbuf; 3622 } 3623 else { 3624 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) 3625 return PyErr_NoMemory(); 3626 3627 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); 3628 if (!wstr) 3629 return PyErr_NoMemory(); 3630 } 3631 3632 /* This shouldn't fail now */ 3633 wlen2 = mbstowcs(wstr, str, wlen+1); 3634 if (wlen2 == (size_t)-1) { 3635 if (wstr != smallbuf) 3636 PyMem_Free(wstr); 3637 goto decode_error; 3638 } 3639#ifdef HAVE_BROKEN_MBSTOWCS 3640 assert(wlen2 == wlen); 3641#endif 3642 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3643 if (wstr != smallbuf) 3644 PyMem_Free(wstr); 3645 } 3646 return unicode; 3647 3648decode_error: 3649 errmsg = strerror(errno); 3650 assert(errmsg != NULL); 3651 3652 error_pos = mbstowcs_errorpos(str, len); 3653 if (errmsg != NULL) { 3654 size_t errlen; 3655 wstr = _Py_char2wchar(errmsg, &errlen); 3656 if (wstr != NULL) { 3657 reason = PyUnicode_FromWideChar(wstr, errlen); 3658 PyMem_Free(wstr); 3659 } else 3660 errmsg = NULL; 3661 } 3662 if (errmsg == NULL) 3663 reason = PyUnicode_FromString( 3664 "mbstowcs() encountered an invalid multibyte sequence"); 3665 if (reason == NULL) 3666 return NULL; 3667 3668 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3669 "locale", str, len, 3670 (Py_ssize_t)error_pos, 3671 (Py_ssize_t)(error_pos+1), 3672 reason); 3673 Py_DECREF(reason); 3674 if (exc != NULL) { 3675 PyCodec_StrictErrors(exc); 3676 Py_XDECREF(exc); 3677 } 3678 return NULL; 3679} 3680 3681PyObject* 3682PyUnicode_DecodeLocale(const char *str, const char *errors) 3683{ 3684 Py_ssize_t size = (Py_ssize_t)strlen(str); 3685 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3686} 3687 3688 3689PyObject* 3690PyUnicode_DecodeFSDefault(const char *s) { 3691 Py_ssize_t size = (Py_ssize_t)strlen(s); 3692 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3693} 3694 3695PyObject* 3696PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3697{ 3698#ifdef HAVE_MBCS 3699 return PyUnicode_DecodeMBCS(s, size, NULL); 3700#elif defined(__APPLE__) 3701 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3702#else 3703 PyInterpreterState *interp = PyThreadState_GET()->interp; 3704 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3705 cannot use it to encode and decode filenames before it is loaded. Load 3706 the Python codec requires to encode at least its own filename. Use the C 3707 version of the locale codec until the codec registry is initialized and 3708 the Python codec is loaded. 3709 3710 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3711 cannot only rely on it: check also interp->fscodec_initialized for 3712 subinterpreters. */ 3713 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3714 return PyUnicode_Decode(s, size, 3715 Py_FileSystemDefaultEncoding, 3716 "surrogateescape"); 3717 } 3718 else { 3719 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3720 } 3721#endif 3722} 3723 3724 3725int 3726_PyUnicode_HasNULChars(PyObject* s) 3727{ 3728 static PyObject *nul = NULL; 3729 3730 if (nul == NULL) 3731 nul = PyUnicode_FromStringAndSize("\0", 1); 3732 if (nul == NULL) 3733 return -1; 3734 return PyUnicode_Contains(s, nul); 3735} 3736 3737 3738int 3739PyUnicode_FSConverter(PyObject* arg, void* addr) 3740{ 3741 PyObject *output = NULL; 3742 Py_ssize_t size; 3743 void *data; 3744 if (arg == NULL) { 3745 Py_DECREF(*(PyObject**)addr); 3746 return 1; 3747 } 3748 if (PyBytes_Check(arg)) { 3749 output = arg; 3750 Py_INCREF(output); 3751 } 3752 else { 3753 arg = PyUnicode_FromObject(arg); 3754 if (!arg) 3755 return 0; 3756 output = PyUnicode_EncodeFSDefault(arg); 3757 Py_DECREF(arg); 3758 if (!output) 3759 return 0; 3760 if (!PyBytes_Check(output)) { 3761 Py_DECREF(output); 3762 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3763 return 0; 3764 } 3765 } 3766 size = PyBytes_GET_SIZE(output); 3767 data = PyBytes_AS_STRING(output); 3768 if (size != strlen(data)) { 3769 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3770 Py_DECREF(output); 3771 return 0; 3772 } 3773 *(PyObject**)addr = output; 3774 return Py_CLEANUP_SUPPORTED; 3775} 3776 3777 3778int 3779PyUnicode_FSDecoder(PyObject* arg, void* addr) 3780{ 3781 PyObject *output = NULL; 3782 if (arg == NULL) { 3783 Py_DECREF(*(PyObject**)addr); 3784 return 1; 3785 } 3786 if (PyUnicode_Check(arg)) { 3787 if (PyUnicode_READY(arg) == -1) 3788 return 0; 3789 output = arg; 3790 Py_INCREF(output); 3791 } 3792 else { 3793 arg = PyBytes_FromObject(arg); 3794 if (!arg) 3795 return 0; 3796 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3797 PyBytes_GET_SIZE(arg)); 3798 Py_DECREF(arg); 3799 if (!output) 3800 return 0; 3801 if (!PyUnicode_Check(output)) { 3802 Py_DECREF(output); 3803 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3804 return 0; 3805 } 3806 } 3807 if (PyUnicode_READY(output) == -1) { 3808 Py_DECREF(output); 3809 return 0; 3810 } 3811 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3812 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3813 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3814 Py_DECREF(output); 3815 return 0; 3816 } 3817 *(PyObject**)addr = output; 3818 return Py_CLEANUP_SUPPORTED; 3819} 3820 3821 3822char* 3823PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3824{ 3825 PyObject *bytes; 3826 3827 if (!PyUnicode_Check(unicode)) { 3828 PyErr_BadArgument(); 3829 return NULL; 3830 } 3831 if (PyUnicode_READY(unicode) == -1) 3832 return NULL; 3833 3834 if (PyUnicode_UTF8(unicode) == NULL) { 3835 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3836 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3837 if (bytes == NULL) 3838 return NULL; 3839 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3840 if (_PyUnicode_UTF8(unicode) == NULL) { 3841 Py_DECREF(bytes); 3842 return NULL; 3843 } 3844 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3845 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3846 PyBytes_AS_STRING(bytes), 3847 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3848 Py_DECREF(bytes); 3849 } 3850 3851 if (psize) 3852 *psize = PyUnicode_UTF8_LENGTH(unicode); 3853 return PyUnicode_UTF8(unicode); 3854} 3855 3856char* 3857PyUnicode_AsUTF8(PyObject *unicode) 3858{ 3859 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3860} 3861 3862Py_UNICODE * 3863PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3864{ 3865 const unsigned char *one_byte; 3866#if SIZEOF_WCHAR_T == 4 3867 const Py_UCS2 *two_bytes; 3868#else 3869 const Py_UCS4 *four_bytes; 3870 const Py_UCS4 *ucs4_end; 3871 Py_ssize_t num_surrogates; 3872#endif 3873 wchar_t *w; 3874 wchar_t *wchar_end; 3875 3876 if (!PyUnicode_Check(unicode)) { 3877 PyErr_BadArgument(); 3878 return NULL; 3879 } 3880 if (_PyUnicode_WSTR(unicode) == NULL) { 3881 /* Non-ASCII compact unicode object */ 3882 assert(_PyUnicode_KIND(unicode) != 0); 3883 assert(PyUnicode_IS_READY(unicode)); 3884 3885 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3886#if SIZEOF_WCHAR_T == 2 3887 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3888 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3889 num_surrogates = 0; 3890 3891 for (; four_bytes < ucs4_end; ++four_bytes) { 3892 if (*four_bytes > 0xFFFF) 3893 ++num_surrogates; 3894 } 3895 3896 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3897 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3898 if (!_PyUnicode_WSTR(unicode)) { 3899 PyErr_NoMemory(); 3900 return NULL; 3901 } 3902 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3903 3904 w = _PyUnicode_WSTR(unicode); 3905 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3906 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3907 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3908 if (*four_bytes > 0xFFFF) { 3909 assert(*four_bytes <= MAX_UNICODE); 3910 /* encode surrogate pair in this case */ 3911 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3912 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3913 } 3914 else 3915 *w = *four_bytes; 3916 3917 if (w > wchar_end) { 3918 assert(0 && "Miscalculated string end"); 3919 } 3920 } 3921 *w = 0; 3922#else 3923 /* sizeof(wchar_t) == 4 */ 3924 Py_FatalError("Impossible unicode object state, wstr and str " 3925 "should share memory already."); 3926 return NULL; 3927#endif 3928 } 3929 else { 3930 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3931 (_PyUnicode_LENGTH(unicode) + 1)); 3932 if (!_PyUnicode_WSTR(unicode)) { 3933 PyErr_NoMemory(); 3934 return NULL; 3935 } 3936 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3937 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3938 w = _PyUnicode_WSTR(unicode); 3939 wchar_end = w + _PyUnicode_LENGTH(unicode); 3940 3941 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3942 one_byte = PyUnicode_1BYTE_DATA(unicode); 3943 for (; w < wchar_end; ++one_byte, ++w) 3944 *w = *one_byte; 3945 /* null-terminate the wstr */ 3946 *w = 0; 3947 } 3948 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3949#if SIZEOF_WCHAR_T == 4 3950 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3951 for (; w < wchar_end; ++two_bytes, ++w) 3952 *w = *two_bytes; 3953 /* null-terminate the wstr */ 3954 *w = 0; 3955#else 3956 /* sizeof(wchar_t) == 2 */ 3957 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3958 _PyUnicode_WSTR(unicode) = NULL; 3959 Py_FatalError("Impossible unicode object state, wstr " 3960 "and str should share memory already."); 3961 return NULL; 3962#endif 3963 } 3964 else { 3965 assert(0 && "This should never happen."); 3966 } 3967 } 3968 } 3969 if (size != NULL) 3970 *size = PyUnicode_WSTR_LENGTH(unicode); 3971 return _PyUnicode_WSTR(unicode); 3972} 3973 3974Py_UNICODE * 3975PyUnicode_AsUnicode(PyObject *unicode) 3976{ 3977 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3978} 3979 3980 3981Py_ssize_t 3982PyUnicode_GetSize(PyObject *unicode) 3983{ 3984 if (!PyUnicode_Check(unicode)) { 3985 PyErr_BadArgument(); 3986 goto onError; 3987 } 3988 return PyUnicode_GET_SIZE(unicode); 3989 3990 onError: 3991 return -1; 3992} 3993 3994Py_ssize_t 3995PyUnicode_GetLength(PyObject *unicode) 3996{ 3997 if (!PyUnicode_Check(unicode)) { 3998 PyErr_BadArgument(); 3999 return -1; 4000 } 4001 if (PyUnicode_READY(unicode) == -1) 4002 return -1; 4003 return PyUnicode_GET_LENGTH(unicode); 4004} 4005 4006Py_UCS4 4007PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 4008{ 4009 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 4010 PyErr_BadArgument(); 4011 return (Py_UCS4)-1; 4012 } 4013 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4014 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4015 return (Py_UCS4)-1; 4016 } 4017 return PyUnicode_READ_CHAR(unicode, index); 4018} 4019 4020int 4021PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 4022{ 4023 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 4024 PyErr_BadArgument(); 4025 return -1; 4026 } 4027 assert(PyUnicode_IS_READY(unicode)); 4028 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4029 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4030 return -1; 4031 } 4032 if (unicode_check_modifiable(unicode)) 4033 return -1; 4034 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 4035 PyErr_SetString(PyExc_ValueError, "character out of range"); 4036 return -1; 4037 } 4038 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 4039 index, ch); 4040 return 0; 4041} 4042 4043const char * 4044PyUnicode_GetDefaultEncoding(void) 4045{ 4046 return "utf-8"; 4047} 4048 4049/* create or adjust a UnicodeDecodeError */ 4050static void 4051make_decode_exception(PyObject **exceptionObject, 4052 const char *encoding, 4053 const char *input, Py_ssize_t length, 4054 Py_ssize_t startpos, Py_ssize_t endpos, 4055 const char *reason) 4056{ 4057 if (*exceptionObject == NULL) { 4058 *exceptionObject = PyUnicodeDecodeError_Create( 4059 encoding, input, length, startpos, endpos, reason); 4060 } 4061 else { 4062 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 4063 goto onError; 4064 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 4065 goto onError; 4066 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4067 goto onError; 4068 } 4069 return; 4070 4071onError: 4072 Py_DECREF(*exceptionObject); 4073 *exceptionObject = NULL; 4074} 4075 4076/* error handling callback helper: 4077 build arguments, call the callback and check the arguments, 4078 if no exception occurred, copy the replacement to the output 4079 and adjust various state variables. 4080 return 0 on success, -1 on error 4081*/ 4082 4083static int 4084unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 4085 const char *encoding, const char *reason, 4086 const char **input, const char **inend, Py_ssize_t *startinpos, 4087 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4088 PyObject **output, Py_ssize_t *outpos) 4089{ 4090 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4091 4092 PyObject *restuple = NULL; 4093 PyObject *repunicode = NULL; 4094 Py_ssize_t outsize; 4095 Py_ssize_t insize; 4096 Py_ssize_t requiredsize; 4097 Py_ssize_t newpos; 4098 PyObject *inputobj = NULL; 4099 int res = -1; 4100 4101 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) 4102 outsize = PyUnicode_GET_LENGTH(*output); 4103 else 4104 outsize = _PyUnicode_WSTR_LENGTH(*output); 4105 4106 if (*errorHandler == NULL) { 4107 *errorHandler = PyCodec_LookupError(errors); 4108 if (*errorHandler == NULL) 4109 goto onError; 4110 } 4111 4112 make_decode_exception(exceptionObject, 4113 encoding, 4114 *input, *inend - *input, 4115 *startinpos, *endinpos, 4116 reason); 4117 if (*exceptionObject == NULL) 4118 goto onError; 4119 4120 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4121 if (restuple == NULL) 4122 goto onError; 4123 if (!PyTuple_Check(restuple)) { 4124 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4125 goto onError; 4126 } 4127 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4128 goto onError; 4129 if (PyUnicode_READY(repunicode) == -1) 4130 goto onError; 4131 4132 /* Copy back the bytes variables, which might have been modified by the 4133 callback */ 4134 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4135 if (!inputobj) 4136 goto onError; 4137 if (!PyBytes_Check(inputobj)) { 4138 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4139 } 4140 *input = PyBytes_AS_STRING(inputobj); 4141 insize = PyBytes_GET_SIZE(inputobj); 4142 *inend = *input + insize; 4143 /* we can DECREF safely, as the exception has another reference, 4144 so the object won't go away. */ 4145 Py_DECREF(inputobj); 4146 4147 if (newpos<0) 4148 newpos = insize+newpos; 4149 if (newpos<0 || newpos>insize) { 4150 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4151 goto onError; 4152 } 4153 4154 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) { 4155 /* need more space? (at least enough for what we 4156 have+the replacement+the rest of the string (starting 4157 at the new input position), so we won't have to check space 4158 when there are no errors in the rest of the string) */ 4159 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode); 4160 requiredsize = *outpos + replen + insize-newpos; 4161 if (requiredsize > outsize) { 4162 if (requiredsize<2*outsize) 4163 requiredsize = 2*outsize; 4164 if (unicode_resize(output, requiredsize) < 0) 4165 goto onError; 4166 } 4167 if (unicode_widen(output, *outpos, 4168 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0) 4169 goto onError; 4170 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen); 4171 *outpos += replen; 4172 } 4173 else { 4174 wchar_t *repwstr; 4175 Py_ssize_t repwlen; 4176 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4177 if (repwstr == NULL) 4178 goto onError; 4179 /* need more space? (at least enough for what we 4180 have+the replacement+the rest of the string (starting 4181 at the new input position), so we won't have to check space 4182 when there are no errors in the rest of the string) */ 4183 requiredsize = *outpos + repwlen + insize-newpos; 4184 if (requiredsize > outsize) { 4185 if (requiredsize < 2*outsize) 4186 requiredsize = 2*outsize; 4187 if (unicode_resize(output, requiredsize) < 0) 4188 goto onError; 4189 } 4190 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4191 *outpos += repwlen; 4192 } 4193 *endinpos = newpos; 4194 *inptr = *input + newpos; 4195 4196 /* we made it! */ 4197 res = 0; 4198 4199 onError: 4200 Py_XDECREF(restuple); 4201 return res; 4202} 4203 4204/* --- UTF-7 Codec -------------------------------------------------------- */ 4205 4206/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4207 4208/* Three simple macros defining base-64. */ 4209 4210/* Is c a base-64 character? */ 4211 4212#define IS_BASE64(c) \ 4213 (((c) >= 'A' && (c) <= 'Z') || \ 4214 ((c) >= 'a' && (c) <= 'z') || \ 4215 ((c) >= '0' && (c) <= '9') || \ 4216 (c) == '+' || (c) == '/') 4217 4218/* given that c is a base-64 character, what is its base-64 value? */ 4219 4220#define FROM_BASE64(c) \ 4221 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4222 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4223 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4224 (c) == '+' ? 62 : 63) 4225 4226/* What is the base-64 character of the bottom 6 bits of n? */ 4227 4228#define TO_BASE64(n) \ 4229 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4230 4231/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4232 * decoded as itself. We are permissive on decoding; the only ASCII 4233 * byte not decoding to itself is the + which begins a base64 4234 * string. */ 4235 4236#define DECODE_DIRECT(c) \ 4237 ((c) <= 127 && (c) != '+') 4238 4239/* The UTF-7 encoder treats ASCII characters differently according to 4240 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4241 * the above). See RFC2152. This array identifies these different 4242 * sets: 4243 * 0 : "Set D" 4244 * alphanumeric and '(),-./:? 4245 * 1 : "Set O" 4246 * !"#$%&*;<=>@[]^_`{|} 4247 * 2 : "whitespace" 4248 * ht nl cr sp 4249 * 3 : special (must be base64 encoded) 4250 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 4251 */ 4252 4253static 4254char utf7_category[128] = { 4255/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 4256 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4257/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 4258 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4259/* sp ! " # $ % & ' ( ) * + , - . / */ 4260 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 4261/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 4262 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4263/* @ A B C D E F G H I J K L M N O */ 4264 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4265/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4267/* ` a b c d e f g h i j k l m n o */ 4268 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4269/* p q r s t u v w x y z { | } ~ del */ 4270 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4271}; 4272 4273/* ENCODE_DIRECT: this character should be encoded as itself. The 4274 * answer depends on whether we are encoding set O as itself, and also 4275 * on whether we are encoding whitespace as itself. RFC2152 makes it 4276 * clear that the answers to these questions vary between 4277 * applications, so this code needs to be flexible. */ 4278 4279#define ENCODE_DIRECT(c, directO, directWS) \ 4280 ((c) < 128 && (c) > 0 && \ 4281 ((utf7_category[(c)] == 0) || \ 4282 (directWS && (utf7_category[(c)] == 2)) || \ 4283 (directO && (utf7_category[(c)] == 1)))) 4284 4285PyObject * 4286PyUnicode_DecodeUTF7(const char *s, 4287 Py_ssize_t size, 4288 const char *errors) 4289{ 4290 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 4291} 4292 4293/* The decoder. The only state we preserve is our read position, 4294 * i.e. how many characters we have consumed. So if we end in the 4295 * middle of a shift sequence we have to back off the read position 4296 * and the output to the beginning of the sequence, otherwise we lose 4297 * all the shift state (seen bits, number of bits seen, high 4298 * surrogate). */ 4299 4300PyObject * 4301PyUnicode_DecodeUTF7Stateful(const char *s, 4302 Py_ssize_t size, 4303 const char *errors, 4304 Py_ssize_t *consumed) 4305{ 4306 const char *starts = s; 4307 Py_ssize_t startinpos; 4308 Py_ssize_t endinpos; 4309 Py_ssize_t outpos; 4310 const char *e; 4311 PyObject *unicode; 4312 const char *errmsg = ""; 4313 int inShift = 0; 4314 Py_ssize_t shiftOutStart; 4315 unsigned int base64bits = 0; 4316 unsigned long base64buffer = 0; 4317 Py_UCS4 surrogate = 0; 4318 PyObject *errorHandler = NULL; 4319 PyObject *exc = NULL; 4320 4321 /* Start off assuming it's all ASCII. Widen later as necessary. */ 4322 unicode = PyUnicode_New(size, 127); 4323 if (!unicode) 4324 return NULL; 4325 if (size == 0) { 4326 if (consumed) 4327 *consumed = 0; 4328 return unicode; 4329 } 4330 4331 shiftOutStart = outpos = 0; 4332 e = s + size; 4333 4334 while (s < e) { 4335 Py_UCS4 ch; 4336 restart: 4337 ch = (unsigned char) *s; 4338 4339 if (inShift) { /* in a base-64 section */ 4340 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4341 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4342 base64bits += 6; 4343 s++; 4344 if (base64bits >= 16) { 4345 /* we have enough bits for a UTF-16 value */ 4346 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4347 base64bits -= 16; 4348 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4349 if (surrogate) { 4350 /* expecting a second surrogate */ 4351 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4352 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4353 if (unicode_putchar(&unicode, &outpos, ch2) < 0) 4354 goto onError; 4355 surrogate = 0; 4356 continue; 4357 } 4358 else { 4359 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4360 goto onError; 4361 surrogate = 0; 4362 } 4363 } 4364 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4365 /* first surrogate */ 4366 surrogate = outCh; 4367 } 4368 else { 4369 if (unicode_putchar(&unicode, &outpos, outCh) < 0) 4370 goto onError; 4371 } 4372 } 4373 } 4374 else { /* now leaving a base-64 section */ 4375 inShift = 0; 4376 s++; 4377 if (surrogate) { 4378 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4379 goto onError; 4380 surrogate = 0; 4381 } 4382 if (base64bits > 0) { /* left-over bits */ 4383 if (base64bits >= 6) { 4384 /* We've seen at least one base-64 character */ 4385 errmsg = "partial character in shift sequence"; 4386 goto utf7Error; 4387 } 4388 else { 4389 /* Some bits remain; they should be zero */ 4390 if (base64buffer != 0) { 4391 errmsg = "non-zero padding bits in shift sequence"; 4392 goto utf7Error; 4393 } 4394 } 4395 } 4396 if (ch != '-') { 4397 /* '-' is absorbed; other terminating 4398 characters are preserved */ 4399 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4400 goto onError; 4401 } 4402 } 4403 } 4404 else if ( ch == '+' ) { 4405 startinpos = s-starts; 4406 s++; /* consume '+' */ 4407 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4408 s++; 4409 if (unicode_putchar(&unicode, &outpos, '+') < 0) 4410 goto onError; 4411 } 4412 else { /* begin base64-encoded section */ 4413 inShift = 1; 4414 shiftOutStart = outpos; 4415 base64bits = 0; 4416 } 4417 } 4418 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4419 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4420 goto onError; 4421 s++; 4422 } 4423 else { 4424 startinpos = s-starts; 4425 s++; 4426 errmsg = "unexpected special character"; 4427 goto utf7Error; 4428 } 4429 continue; 4430utf7Error: 4431 endinpos = s-starts; 4432 if (unicode_decode_call_errorhandler( 4433 errors, &errorHandler, 4434 "utf7", errmsg, 4435 &starts, &e, &startinpos, &endinpos, &exc, &s, 4436 &unicode, &outpos)) 4437 goto onError; 4438 } 4439 4440 /* end of string */ 4441 4442 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4443 /* if we're in an inconsistent state, that's an error */ 4444 if (surrogate || 4445 (base64bits >= 6) || 4446 (base64bits > 0 && base64buffer != 0)) { 4447 endinpos = size; 4448 if (unicode_decode_call_errorhandler( 4449 errors, &errorHandler, 4450 "utf7", "unterminated shift sequence", 4451 &starts, &e, &startinpos, &endinpos, &exc, &s, 4452 &unicode, &outpos)) 4453 goto onError; 4454 if (s < e) 4455 goto restart; 4456 } 4457 } 4458 4459 /* return state */ 4460 if (consumed) { 4461 if (inShift) { 4462 outpos = shiftOutStart; /* back off output */ 4463 *consumed = startinpos; 4464 } 4465 else { 4466 *consumed = s-starts; 4467 } 4468 } 4469 4470 if (unicode_resize(&unicode, outpos) < 0) 4471 goto onError; 4472 4473 Py_XDECREF(errorHandler); 4474 Py_XDECREF(exc); 4475 return unicode_result(unicode); 4476 4477 onError: 4478 Py_XDECREF(errorHandler); 4479 Py_XDECREF(exc); 4480 Py_DECREF(unicode); 4481 return NULL; 4482} 4483 4484 4485PyObject * 4486_PyUnicode_EncodeUTF7(PyObject *str, 4487 int base64SetO, 4488 int base64WhiteSpace, 4489 const char *errors) 4490{ 4491 int kind; 4492 void *data; 4493 Py_ssize_t len; 4494 PyObject *v; 4495 int inShift = 0; 4496 Py_ssize_t i; 4497 unsigned int base64bits = 0; 4498 unsigned long base64buffer = 0; 4499 char * out; 4500 char * start; 4501 4502 if (PyUnicode_READY(str) == -1) 4503 return NULL; 4504 kind = PyUnicode_KIND(str); 4505 data = PyUnicode_DATA(str); 4506 len = PyUnicode_GET_LENGTH(str); 4507 4508 if (len == 0) 4509 return PyBytes_FromStringAndSize(NULL, 0); 4510 4511 /* It might be possible to tighten this worst case */ 4512 if (len > PY_SSIZE_T_MAX / 8) 4513 return PyErr_NoMemory(); 4514 v = PyBytes_FromStringAndSize(NULL, len * 8); 4515 if (v == NULL) 4516 return NULL; 4517 4518 start = out = PyBytes_AS_STRING(v); 4519 for (i = 0; i < len; ++i) { 4520 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4521 4522 if (inShift) { 4523 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4524 /* shifting out */ 4525 if (base64bits) { /* output remaining bits */ 4526 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4527 base64buffer = 0; 4528 base64bits = 0; 4529 } 4530 inShift = 0; 4531 /* Characters not in the BASE64 set implicitly unshift the sequence 4532 so no '-' is required, except if the character is itself a '-' */ 4533 if (IS_BASE64(ch) || ch == '-') { 4534 *out++ = '-'; 4535 } 4536 *out++ = (char) ch; 4537 } 4538 else { 4539 goto encode_char; 4540 } 4541 } 4542 else { /* not in a shift sequence */ 4543 if (ch == '+') { 4544 *out++ = '+'; 4545 *out++ = '-'; 4546 } 4547 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4548 *out++ = (char) ch; 4549 } 4550 else { 4551 *out++ = '+'; 4552 inShift = 1; 4553 goto encode_char; 4554 } 4555 } 4556 continue; 4557encode_char: 4558 if (ch >= 0x10000) { 4559 assert(ch <= MAX_UNICODE); 4560 4561 /* code first surrogate */ 4562 base64bits += 16; 4563 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4564 while (base64bits >= 6) { 4565 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4566 base64bits -= 6; 4567 } 4568 /* prepare second surrogate */ 4569 ch = Py_UNICODE_LOW_SURROGATE(ch); 4570 } 4571 base64bits += 16; 4572 base64buffer = (base64buffer << 16) | ch; 4573 while (base64bits >= 6) { 4574 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4575 base64bits -= 6; 4576 } 4577 } 4578 if (base64bits) 4579 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4580 if (inShift) 4581 *out++ = '-'; 4582 if (_PyBytes_Resize(&v, out - start) < 0) 4583 return NULL; 4584 return v; 4585} 4586PyObject * 4587PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4588 Py_ssize_t size, 4589 int base64SetO, 4590 int base64WhiteSpace, 4591 const char *errors) 4592{ 4593 PyObject *result; 4594 PyObject *tmp = PyUnicode_FromUnicode(s, size); 4595 if (tmp == NULL) 4596 return NULL; 4597 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4598 base64WhiteSpace, errors); 4599 Py_DECREF(tmp); 4600 return result; 4601} 4602 4603#undef IS_BASE64 4604#undef FROM_BASE64 4605#undef TO_BASE64 4606#undef DECODE_DIRECT 4607#undef ENCODE_DIRECT 4608 4609/* --- UTF-8 Codec -------------------------------------------------------- */ 4610 4611PyObject * 4612PyUnicode_DecodeUTF8(const char *s, 4613 Py_ssize_t size, 4614 const char *errors) 4615{ 4616 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4617} 4618 4619#include "stringlib/asciilib.h" 4620#include "stringlib/codecs.h" 4621#include "stringlib/undef.h" 4622 4623#include "stringlib/ucs1lib.h" 4624#include "stringlib/codecs.h" 4625#include "stringlib/undef.h" 4626 4627#include "stringlib/ucs2lib.h" 4628#include "stringlib/codecs.h" 4629#include "stringlib/undef.h" 4630 4631#include "stringlib/ucs4lib.h" 4632#include "stringlib/codecs.h" 4633#include "stringlib/undef.h" 4634 4635/* Mask to quickly check whether a C 'long' contains a 4636 non-ASCII, UTF8-encoded char. */ 4637#if (SIZEOF_LONG == 8) 4638# define ASCII_CHAR_MASK 0x8080808080808080UL 4639#elif (SIZEOF_LONG == 4) 4640# define ASCII_CHAR_MASK 0x80808080UL 4641#else 4642# error C 'long' size should be either 4 or 8! 4643#endif 4644 4645static Py_ssize_t 4646ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 4647{ 4648 const char *p = start; 4649 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); 4650 4651#if SIZEOF_LONG <= SIZEOF_VOID_P 4652 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); 4653 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4654 /* Fast path, see in STRINGLIB(utf8_decode) for 4655 an explanation. */ 4656 /* Help register allocation */ 4657 register const char *_p = p; 4658 register Py_UCS1 * q = dest; 4659 while (_p < aligned_end) { 4660 unsigned long value = *(const unsigned long *) _p; 4661 if (value & ASCII_CHAR_MASK) 4662 break; 4663 *((unsigned long *)q) = value; 4664 _p += SIZEOF_LONG; 4665 q += SIZEOF_LONG; 4666 } 4667 p = _p; 4668 while (p < end) { 4669 if ((unsigned char)*p & 0x80) 4670 break; 4671 *q++ = *p++; 4672 } 4673 return p - start; 4674 } 4675#endif 4676 while (p < end) { 4677 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4678 for an explanation. */ 4679 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4680 /* Help register allocation */ 4681 register const char *_p = p; 4682 while (_p < aligned_end) { 4683 unsigned long value = *(unsigned long *) _p; 4684 if (value & ASCII_CHAR_MASK) 4685 break; 4686 _p += SIZEOF_LONG; 4687 } 4688 p = _p; 4689 if (_p == end) 4690 break; 4691 } 4692 if ((unsigned char)*p & 0x80) 4693 break; 4694 ++p; 4695 } 4696 memcpy(dest, start, p - start); 4697 return p - start; 4698} 4699 4700PyObject * 4701PyUnicode_DecodeUTF8Stateful(const char *s, 4702 Py_ssize_t size, 4703 const char *errors, 4704 Py_ssize_t *consumed) 4705{ 4706 PyObject *unicode; 4707 const char *starts = s; 4708 const char *end = s + size; 4709 Py_ssize_t outpos; 4710 4711 Py_ssize_t startinpos; 4712 Py_ssize_t endinpos; 4713 const char *errmsg = ""; 4714 PyObject *errorHandler = NULL; 4715 PyObject *exc = NULL; 4716 4717 if (size == 0) { 4718 if (consumed) 4719 *consumed = 0; 4720 Py_INCREF(unicode_empty); 4721 return unicode_empty; 4722 } 4723 4724 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 4725 if (size == 1 && (unsigned char)s[0] < 128) { 4726 if (consumed) 4727 *consumed = 1; 4728 return get_latin1_char((unsigned char)s[0]); 4729 } 4730 4731 unicode = PyUnicode_New(size, 127); 4732 if (!unicode) 4733 return NULL; 4734 4735 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode)); 4736 s += outpos; 4737 while (s < end) { 4738 Py_UCS4 ch; 4739 int kind = PyUnicode_KIND(unicode); 4740 if (kind == PyUnicode_1BYTE_KIND) { 4741 if (PyUnicode_IS_ASCII(unicode)) 4742 ch = asciilib_utf8_decode(&s, end, 4743 PyUnicode_1BYTE_DATA(unicode), &outpos); 4744 else 4745 ch = ucs1lib_utf8_decode(&s, end, 4746 PyUnicode_1BYTE_DATA(unicode), &outpos); 4747 } else if (kind == PyUnicode_2BYTE_KIND) { 4748 ch = ucs2lib_utf8_decode(&s, end, 4749 PyUnicode_2BYTE_DATA(unicode), &outpos); 4750 } else { 4751 assert(kind == PyUnicode_4BYTE_KIND); 4752 ch = ucs4lib_utf8_decode(&s, end, 4753 PyUnicode_4BYTE_DATA(unicode), &outpos); 4754 } 4755 4756 switch (ch) { 4757 case 0: 4758 if (s == end || consumed) 4759 goto End; 4760 errmsg = "unexpected end of data"; 4761 startinpos = s - starts; 4762 endinpos = end - starts; 4763 break; 4764 case 1: 4765 errmsg = "invalid start byte"; 4766 startinpos = s - starts; 4767 endinpos = startinpos + 1; 4768 break; 4769 case 2: 4770 case 3: 4771 case 4: 4772 errmsg = "invalid continuation byte"; 4773 startinpos = s - starts; 4774 endinpos = startinpos + ch - 1; 4775 break; 4776 default: 4777 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4778 goto onError; 4779 continue; 4780 } 4781 4782 if (unicode_decode_call_errorhandler( 4783 errors, &errorHandler, 4784 "utf-8", errmsg, 4785 &starts, &end, &startinpos, &endinpos, &exc, &s, 4786 &unicode, &outpos)) 4787 goto onError; 4788 } 4789 4790End: 4791 if (unicode_resize(&unicode, outpos) < 0) 4792 goto onError; 4793 4794 if (consumed) 4795 *consumed = s - starts; 4796 4797 Py_XDECREF(errorHandler); 4798 Py_XDECREF(exc); 4799 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4800 return unicode; 4801 4802onError: 4803 Py_XDECREF(errorHandler); 4804 Py_XDECREF(exc); 4805 Py_XDECREF(unicode); 4806 return NULL; 4807} 4808 4809#ifdef __APPLE__ 4810 4811/* Simplified UTF-8 decoder using surrogateescape error handler, 4812 used to decode the command line arguments on Mac OS X. */ 4813 4814wchar_t* 4815_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4816{ 4817 const char *e; 4818 wchar_t *unicode; 4819 Py_ssize_t outpos; 4820 4821 /* Note: size will always be longer than the resulting Unicode 4822 character count */ 4823 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4824 PyErr_NoMemory(); 4825 return NULL; 4826 } 4827 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4828 if (!unicode) 4829 return NULL; 4830 4831 /* Unpack UTF-8 encoded data */ 4832 e = s + size; 4833 outpos = 0; 4834 while (s < e) { 4835 Py_UCS4 ch; 4836#if SIZEOF_WCHAR_T == 4 4837 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4838#else 4839 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 4840#endif 4841 if (ch > 0xFF) { 4842#if SIZEOF_WCHAR_T == 4 4843 assert(0); 4844#else 4845 assert(Py_UNICODE_IS_SURROGATE(ch)); 4846 /* compute and append the two surrogates: */ 4847 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 4848 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 4849#endif 4850 } 4851 else { 4852 if (!ch && s == e) 4853 break; 4854 /* surrogateescape */ 4855 unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 4856 } 4857 } 4858 unicode[outpos] = L'\0'; 4859 return unicode; 4860} 4861 4862#endif /* __APPLE__ */ 4863 4864/* Primary internal function which creates utf8 encoded bytes objects. 4865 4866 Allocation strategy: if the string is short, convert into a stack buffer 4867 and allocate exactly as much space needed at the end. Else allocate the 4868 maximum possible needed (4 result bytes per Unicode character), and return 4869 the excess memory at the end. 4870*/ 4871PyObject * 4872_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4873{ 4874 enum PyUnicode_Kind kind; 4875 void *data; 4876 Py_ssize_t size; 4877 4878 if (!PyUnicode_Check(unicode)) { 4879 PyErr_BadArgument(); 4880 return NULL; 4881 } 4882 4883 if (PyUnicode_READY(unicode) == -1) 4884 return NULL; 4885 4886 if (PyUnicode_UTF8(unicode)) 4887 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4888 PyUnicode_UTF8_LENGTH(unicode)); 4889 4890 kind = PyUnicode_KIND(unicode); 4891 data = PyUnicode_DATA(unicode); 4892 size = PyUnicode_GET_LENGTH(unicode); 4893 4894 switch (kind) { 4895 default: 4896 assert(0); 4897 case PyUnicode_1BYTE_KIND: 4898 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4899 assert(!PyUnicode_IS_ASCII(unicode)); 4900 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4901 case PyUnicode_2BYTE_KIND: 4902 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4903 case PyUnicode_4BYTE_KIND: 4904 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4905 } 4906} 4907 4908PyObject * 4909PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4910 Py_ssize_t size, 4911 const char *errors) 4912{ 4913 PyObject *v, *unicode; 4914 4915 unicode = PyUnicode_FromUnicode(s, size); 4916 if (unicode == NULL) 4917 return NULL; 4918 v = _PyUnicode_AsUTF8String(unicode, errors); 4919 Py_DECREF(unicode); 4920 return v; 4921} 4922 4923PyObject * 4924PyUnicode_AsUTF8String(PyObject *unicode) 4925{ 4926 return _PyUnicode_AsUTF8String(unicode, NULL); 4927} 4928 4929/* --- UTF-32 Codec ------------------------------------------------------- */ 4930 4931PyObject * 4932PyUnicode_DecodeUTF32(const char *s, 4933 Py_ssize_t size, 4934 const char *errors, 4935 int *byteorder) 4936{ 4937 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4938} 4939 4940PyObject * 4941PyUnicode_DecodeUTF32Stateful(const char *s, 4942 Py_ssize_t size, 4943 const char *errors, 4944 int *byteorder, 4945 Py_ssize_t *consumed) 4946{ 4947 const char *starts = s; 4948 Py_ssize_t startinpos; 4949 Py_ssize_t endinpos; 4950 Py_ssize_t outpos; 4951 PyObject *unicode; 4952 const unsigned char *q, *e; 4953 int bo = 0; /* assume native ordering by default */ 4954 const char *errmsg = ""; 4955 /* Offsets from q for retrieving bytes in the right order. */ 4956#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4957 int iorder[] = {0, 1, 2, 3}; 4958#else 4959 int iorder[] = {3, 2, 1, 0}; 4960#endif 4961 PyObject *errorHandler = NULL; 4962 PyObject *exc = NULL; 4963 4964 q = (unsigned char *)s; 4965 e = q + size; 4966 4967 if (byteorder) 4968 bo = *byteorder; 4969 4970 /* Check for BOM marks (U+FEFF) in the input and adjust current 4971 byte order setting accordingly. In native mode, the leading BOM 4972 mark is skipped, in all other modes, it is copied to the output 4973 stream as-is (giving a ZWNBSP character). */ 4974 if (bo == 0) { 4975 if (size >= 4) { 4976 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4977 (q[iorder[1]] << 8) | q[iorder[0]]; 4978#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4979 if (bom == 0x0000FEFF) { 4980 q += 4; 4981 bo = -1; 4982 } 4983 else if (bom == 0xFFFE0000) { 4984 q += 4; 4985 bo = 1; 4986 } 4987#else 4988 if (bom == 0x0000FEFF) { 4989 q += 4; 4990 bo = 1; 4991 } 4992 else if (bom == 0xFFFE0000) { 4993 q += 4; 4994 bo = -1; 4995 } 4996#endif 4997 } 4998 } 4999 5000 if (bo == -1) { 5001 /* force LE */ 5002 iorder[0] = 0; 5003 iorder[1] = 1; 5004 iorder[2] = 2; 5005 iorder[3] = 3; 5006 } 5007 else if (bo == 1) { 5008 /* force BE */ 5009 iorder[0] = 3; 5010 iorder[1] = 2; 5011 iorder[2] = 1; 5012 iorder[3] = 0; 5013 } 5014 5015 /* This might be one to much, because of a BOM */ 5016 unicode = PyUnicode_New((size+3)/4, 127); 5017 if (!unicode) 5018 return NULL; 5019 if (size == 0) 5020 return unicode; 5021 outpos = 0; 5022 5023 while (q < e) { 5024 Py_UCS4 ch; 5025 /* remaining bytes at the end? (size should be divisible by 4) */ 5026 if (e-q<4) { 5027 if (consumed) 5028 break; 5029 errmsg = "truncated data"; 5030 startinpos = ((const char *)q)-starts; 5031 endinpos = ((const char *)e)-starts; 5032 goto utf32Error; 5033 /* The remaining input chars are ignored if the callback 5034 chooses to skip the input */ 5035 } 5036 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 5037 (q[iorder[1]] << 8) | q[iorder[0]]; 5038 5039 if (ch >= 0x110000) 5040 { 5041 errmsg = "codepoint not in range(0x110000)"; 5042 startinpos = ((const char *)q)-starts; 5043 endinpos = startinpos+4; 5044 goto utf32Error; 5045 } 5046 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5047 goto onError; 5048 q += 4; 5049 continue; 5050 utf32Error: 5051 if (unicode_decode_call_errorhandler( 5052 errors, &errorHandler, 5053 "utf32", errmsg, 5054 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5055 &unicode, &outpos)) 5056 goto onError; 5057 } 5058 5059 if (byteorder) 5060 *byteorder = bo; 5061 5062 if (consumed) 5063 *consumed = (const char *)q-starts; 5064 5065 /* Adjust length */ 5066 if (unicode_resize(&unicode, outpos) < 0) 5067 goto onError; 5068 5069 Py_XDECREF(errorHandler); 5070 Py_XDECREF(exc); 5071 return unicode_result(unicode); 5072 5073 onError: 5074 Py_DECREF(unicode); 5075 Py_XDECREF(errorHandler); 5076 Py_XDECREF(exc); 5077 return NULL; 5078} 5079 5080PyObject * 5081_PyUnicode_EncodeUTF32(PyObject *str, 5082 const char *errors, 5083 int byteorder) 5084{ 5085 int kind; 5086 void *data; 5087 Py_ssize_t len; 5088 PyObject *v; 5089 unsigned char *p; 5090 Py_ssize_t nsize, i; 5091 /* Offsets from p for storing byte pairs in the right order. */ 5092#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5093 int iorder[] = {0, 1, 2, 3}; 5094#else 5095 int iorder[] = {3, 2, 1, 0}; 5096#endif 5097 5098#define STORECHAR(CH) \ 5099 do { \ 5100 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5101 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5102 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5103 p[iorder[0]] = (CH) & 0xff; \ 5104 p += 4; \ 5105 } while(0) 5106 5107 if (!PyUnicode_Check(str)) { 5108 PyErr_BadArgument(); 5109 return NULL; 5110 } 5111 if (PyUnicode_READY(str) == -1) 5112 return NULL; 5113 kind = PyUnicode_KIND(str); 5114 data = PyUnicode_DATA(str); 5115 len = PyUnicode_GET_LENGTH(str); 5116 5117 nsize = len + (byteorder == 0); 5118 if (nsize > PY_SSIZE_T_MAX / 4) 5119 return PyErr_NoMemory(); 5120 v = PyBytes_FromStringAndSize(NULL, nsize * 4); 5121 if (v == NULL) 5122 return NULL; 5123 5124 p = (unsigned char *)PyBytes_AS_STRING(v); 5125 if (byteorder == 0) 5126 STORECHAR(0xFEFF); 5127 if (len == 0) 5128 goto done; 5129 5130 if (byteorder == -1) { 5131 /* force LE */ 5132 iorder[0] = 0; 5133 iorder[1] = 1; 5134 iorder[2] = 2; 5135 iorder[3] = 3; 5136 } 5137 else if (byteorder == 1) { 5138 /* force BE */ 5139 iorder[0] = 3; 5140 iorder[1] = 2; 5141 iorder[2] = 1; 5142 iorder[3] = 0; 5143 } 5144 5145 for (i = 0; i < len; i++) 5146 STORECHAR(PyUnicode_READ(kind, data, i)); 5147 5148 done: 5149 return v; 5150#undef STORECHAR 5151} 5152 5153PyObject * 5154PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5155 Py_ssize_t size, 5156 const char *errors, 5157 int byteorder) 5158{ 5159 PyObject *result; 5160 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5161 if (tmp == NULL) 5162 return NULL; 5163 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5164 Py_DECREF(tmp); 5165 return result; 5166} 5167 5168PyObject * 5169PyUnicode_AsUTF32String(PyObject *unicode) 5170{ 5171 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5172} 5173 5174/* --- UTF-16 Codec ------------------------------------------------------- */ 5175 5176PyObject * 5177PyUnicode_DecodeUTF16(const char *s, 5178 Py_ssize_t size, 5179 const char *errors, 5180 int *byteorder) 5181{ 5182 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5183} 5184 5185PyObject * 5186PyUnicode_DecodeUTF16Stateful(const char *s, 5187 Py_ssize_t size, 5188 const char *errors, 5189 int *byteorder, 5190 Py_ssize_t *consumed) 5191{ 5192 const char *starts = s; 5193 Py_ssize_t startinpos; 5194 Py_ssize_t endinpos; 5195 Py_ssize_t outpos; 5196 PyObject *unicode; 5197 const unsigned char *q, *e; 5198 int bo = 0; /* assume native ordering by default */ 5199 int native_ordering; 5200 const char *errmsg = ""; 5201 PyObject *errorHandler = NULL; 5202 PyObject *exc = NULL; 5203 5204 q = (unsigned char *)s; 5205 e = q + size; 5206 5207 if (byteorder) 5208 bo = *byteorder; 5209 5210 /* Check for BOM marks (U+FEFF) in the input and adjust current 5211 byte order setting accordingly. In native mode, the leading BOM 5212 mark is skipped, in all other modes, it is copied to the output 5213 stream as-is (giving a ZWNBSP character). */ 5214 if (bo == 0 && size >= 2) { 5215 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5216 if (bom == 0xFEFF) { 5217 q += 2; 5218 bo = -1; 5219 } 5220 else if (bom == 0xFFFE) { 5221 q += 2; 5222 bo = 1; 5223 } 5224 if (byteorder) 5225 *byteorder = bo; 5226 } 5227 5228 if (q == e) { 5229 if (consumed) 5230 *consumed = size; 5231 Py_INCREF(unicode_empty); 5232 return unicode_empty; 5233 } 5234 5235#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5236 native_ordering = bo <= 0; 5237#else 5238 native_ordering = bo >= 0; 5239#endif 5240 5241 /* Note: size will always be longer than the resulting Unicode 5242 character count */ 5243 unicode = PyUnicode_New((e - q + 1) / 2, 127); 5244 if (!unicode) 5245 return NULL; 5246 5247 outpos = 0; 5248 while (1) { 5249 Py_UCS4 ch = 0; 5250 if (e - q >= 2) { 5251 int kind = PyUnicode_KIND(unicode); 5252 if (kind == PyUnicode_1BYTE_KIND) { 5253 if (PyUnicode_IS_ASCII(unicode)) 5254 ch = asciilib_utf16_decode(&q, e, 5255 PyUnicode_1BYTE_DATA(unicode), &outpos, 5256 native_ordering); 5257 else 5258 ch = ucs1lib_utf16_decode(&q, e, 5259 PyUnicode_1BYTE_DATA(unicode), &outpos, 5260 native_ordering); 5261 } else if (kind == PyUnicode_2BYTE_KIND) { 5262 ch = ucs2lib_utf16_decode(&q, e, 5263 PyUnicode_2BYTE_DATA(unicode), &outpos, 5264 native_ordering); 5265 } else { 5266 assert(kind == PyUnicode_4BYTE_KIND); 5267 ch = ucs4lib_utf16_decode(&q, e, 5268 PyUnicode_4BYTE_DATA(unicode), &outpos, 5269 native_ordering); 5270 } 5271 } 5272 5273 switch (ch) 5274 { 5275 case 0: 5276 /* remaining byte at the end? (size should be even) */ 5277 if (q == e || consumed) 5278 goto End; 5279 errmsg = "truncated data"; 5280 startinpos = ((const char *)q) - starts; 5281 endinpos = ((const char *)e) - starts; 5282 break; 5283 /* The remaining input chars are ignored if the callback 5284 chooses to skip the input */ 5285 case 1: 5286 errmsg = "unexpected end of data"; 5287 startinpos = ((const char *)q) - 2 - starts; 5288 endinpos = ((const char *)e) - starts; 5289 break; 5290 case 2: 5291 errmsg = "illegal encoding"; 5292 startinpos = ((const char *)q) - 2 - starts; 5293 endinpos = startinpos + 2; 5294 break; 5295 case 3: 5296 errmsg = "illegal UTF-16 surrogate"; 5297 startinpos = ((const char *)q) - 4 - starts; 5298 endinpos = startinpos + 2; 5299 break; 5300 default: 5301 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5302 goto onError; 5303 continue; 5304 } 5305 5306 if (unicode_decode_call_errorhandler( 5307 errors, 5308 &errorHandler, 5309 "utf16", errmsg, 5310 &starts, 5311 (const char **)&e, 5312 &startinpos, 5313 &endinpos, 5314 &exc, 5315 (const char **)&q, 5316 &unicode, 5317 &outpos)) 5318 goto onError; 5319 } 5320 5321End: 5322 if (consumed) 5323 *consumed = (const char *)q-starts; 5324 5325 /* Adjust length */ 5326 if (unicode_resize(&unicode, outpos) < 0) 5327 goto onError; 5328 5329 Py_XDECREF(errorHandler); 5330 Py_XDECREF(exc); 5331 return unicode_result(unicode); 5332 5333 onError: 5334 Py_DECREF(unicode); 5335 Py_XDECREF(errorHandler); 5336 Py_XDECREF(exc); 5337 return NULL; 5338} 5339 5340PyObject * 5341_PyUnicode_EncodeUTF16(PyObject *str, 5342 const char *errors, 5343 int byteorder) 5344{ 5345 enum PyUnicode_Kind kind; 5346 const void *data; 5347 Py_ssize_t len; 5348 PyObject *v; 5349 unsigned short *out; 5350 Py_ssize_t bytesize; 5351 Py_ssize_t pairs; 5352#ifdef WORDS_BIGENDIAN 5353 int native_ordering = byteorder >= 0; 5354#else 5355 int native_ordering = byteorder <= 0; 5356#endif 5357 5358 if (!PyUnicode_Check(str)) { 5359 PyErr_BadArgument(); 5360 return NULL; 5361 } 5362 if (PyUnicode_READY(str) == -1) 5363 return NULL; 5364 kind = PyUnicode_KIND(str); 5365 data = PyUnicode_DATA(str); 5366 len = PyUnicode_GET_LENGTH(str); 5367 5368 pairs = 0; 5369 if (kind == PyUnicode_4BYTE_KIND) { 5370 const Py_UCS4 *in = (const Py_UCS4 *)data; 5371 const Py_UCS4 *end = in + len; 5372 while (in < end) 5373 if (*in++ >= 0x10000) 5374 pairs++; 5375 } 5376 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5377 return PyErr_NoMemory(); 5378 bytesize = (len + pairs + (byteorder == 0)) * 2; 5379 v = PyBytes_FromStringAndSize(NULL, bytesize); 5380 if (v == NULL) 5381 return NULL; 5382 5383 /* output buffer is 2-bytes aligned */ 5384 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5385 out = (unsigned short *)PyBytes_AS_STRING(v); 5386 if (byteorder == 0) 5387 *out++ = 0xFEFF; 5388 if (len == 0) 5389 goto done; 5390 5391 switch (kind) { 5392 case PyUnicode_1BYTE_KIND: { 5393 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); 5394 break; 5395 } 5396 case PyUnicode_2BYTE_KIND: { 5397 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); 5398 break; 5399 } 5400 case PyUnicode_4BYTE_KIND: { 5401 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); 5402 break; 5403 } 5404 default: 5405 assert(0); 5406 } 5407 5408 done: 5409 return v; 5410} 5411 5412PyObject * 5413PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5414 Py_ssize_t size, 5415 const char *errors, 5416 int byteorder) 5417{ 5418 PyObject *result; 5419 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5420 if (tmp == NULL) 5421 return NULL; 5422 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5423 Py_DECREF(tmp); 5424 return result; 5425} 5426 5427PyObject * 5428PyUnicode_AsUTF16String(PyObject *unicode) 5429{ 5430 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5431} 5432 5433/* --- Unicode Escape Codec ----------------------------------------------- */ 5434 5435/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5436 if all the escapes in the string make it still a valid ASCII string. 5437 Returns -1 if any escapes were found which cause the string to 5438 pop out of ASCII range. Otherwise returns the length of the 5439 required buffer to hold the string. 5440 */ 5441static Py_ssize_t 5442length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5443{ 5444 const unsigned char *p = (const unsigned char *)s; 5445 const unsigned char *end = p + size; 5446 Py_ssize_t length = 0; 5447 5448 if (size < 0) 5449 return -1; 5450 5451 for (; p < end; ++p) { 5452 if (*p > 127) { 5453 /* Non-ASCII */ 5454 return -1; 5455 } 5456 else if (*p != '\\') { 5457 /* Normal character */ 5458 ++length; 5459 } 5460 else { 5461 /* Backslash-escape, check next char */ 5462 ++p; 5463 /* Escape sequence reaches till end of string or 5464 non-ASCII follow-up. */ 5465 if (p >= end || *p > 127) 5466 return -1; 5467 switch (*p) { 5468 case '\n': 5469 /* backslash + \n result in zero characters */ 5470 break; 5471 case '\\': case '\'': case '\"': 5472 case 'b': case 'f': case 't': 5473 case 'n': case 'r': case 'v': case 'a': 5474 ++length; 5475 break; 5476 case '0': case '1': case '2': case '3': 5477 case '4': case '5': case '6': case '7': 5478 case 'x': case 'u': case 'U': case 'N': 5479 /* these do not guarantee ASCII characters */ 5480 return -1; 5481 default: 5482 /* count the backslash + the other character */ 5483 length += 2; 5484 } 5485 } 5486 } 5487 return length; 5488} 5489 5490static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5491 5492PyObject * 5493PyUnicode_DecodeUnicodeEscape(const char *s, 5494 Py_ssize_t size, 5495 const char *errors) 5496{ 5497 const char *starts = s; 5498 Py_ssize_t startinpos; 5499 Py_ssize_t endinpos; 5500 int j; 5501 PyObject *v; 5502 const char *end; 5503 char* message; 5504 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5505 PyObject *errorHandler = NULL; 5506 PyObject *exc = NULL; 5507 Py_ssize_t len; 5508 Py_ssize_t i; 5509 5510 len = length_of_escaped_ascii_string(s, size); 5511 5512 /* After length_of_escaped_ascii_string() there are two alternatives, 5513 either the string is pure ASCII with named escapes like \n, etc. 5514 and we determined it's exact size (common case) 5515 or it contains \x, \u, ... escape sequences. then we create a 5516 legacy wchar string and resize it at the end of this function. */ 5517 if (len >= 0) { 5518 v = PyUnicode_New(len, 127); 5519 if (!v) 5520 goto onError; 5521 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5522 } 5523 else { 5524 /* Escaped strings will always be longer than the resulting 5525 Unicode string, so we start with size here and then reduce the 5526 length after conversion to the true value. 5527 (but if the error callback returns a long replacement string 5528 we'll have to allocate more space) */ 5529 v = PyUnicode_New(size, 127); 5530 if (!v) 5531 goto onError; 5532 len = size; 5533 } 5534 5535 if (size == 0) 5536 return v; 5537 i = 0; 5538 end = s + size; 5539 5540 while (s < end) { 5541 unsigned char c; 5542 Py_UCS4 x; 5543 int digits; 5544 5545 /* The only case in which i == ascii_length is a backslash 5546 followed by a newline. */ 5547 assert(i <= len); 5548 5549 /* Non-escape characters are interpreted as Unicode ordinals */ 5550 if (*s != '\\') { 5551 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0) 5552 goto onError; 5553 continue; 5554 } 5555 5556 startinpos = s-starts; 5557 /* \ - Escapes */ 5558 s++; 5559 c = *s++; 5560 if (s > end) 5561 c = '\0'; /* Invalid after \ */ 5562 5563 /* The only case in which i == ascii_length is a backslash 5564 followed by a newline. */ 5565 assert(i < len || (i == len && c == '\n')); 5566 5567 switch (c) { 5568 5569 /* \x escapes */ 5570#define WRITECHAR(ch) \ 5571 do { \ 5572 if (unicode_putchar(&v, &i, ch) < 0) \ 5573 goto onError; \ 5574 }while(0) 5575 5576 case '\n': break; 5577 case '\\': WRITECHAR('\\'); break; 5578 case '\'': WRITECHAR('\''); break; 5579 case '\"': WRITECHAR('\"'); break; 5580 case 'b': WRITECHAR('\b'); break; 5581 /* FF */ 5582 case 'f': WRITECHAR('\014'); break; 5583 case 't': WRITECHAR('\t'); break; 5584 case 'n': WRITECHAR('\n'); break; 5585 case 'r': WRITECHAR('\r'); break; 5586 /* VT */ 5587 case 'v': WRITECHAR('\013'); break; 5588 /* BEL, not classic C */ 5589 case 'a': WRITECHAR('\007'); break; 5590 5591 /* \OOO (octal) escapes */ 5592 case '0': case '1': case '2': case '3': 5593 case '4': case '5': case '6': case '7': 5594 x = s[-1] - '0'; 5595 if (s < end && '0' <= *s && *s <= '7') { 5596 x = (x<<3) + *s++ - '0'; 5597 if (s < end && '0' <= *s && *s <= '7') 5598 x = (x<<3) + *s++ - '0'; 5599 } 5600 WRITECHAR(x); 5601 break; 5602 5603 /* hex escapes */ 5604 /* \xXX */ 5605 case 'x': 5606 digits = 2; 5607 message = "truncated \\xXX escape"; 5608 goto hexescape; 5609 5610 /* \uXXXX */ 5611 case 'u': 5612 digits = 4; 5613 message = "truncated \\uXXXX escape"; 5614 goto hexescape; 5615 5616 /* \UXXXXXXXX */ 5617 case 'U': 5618 digits = 8; 5619 message = "truncated \\UXXXXXXXX escape"; 5620 hexescape: 5621 chr = 0; 5622 if (s+digits>end) { 5623 endinpos = size; 5624 if (unicode_decode_call_errorhandler( 5625 errors, &errorHandler, 5626 "unicodeescape", "end of string in escape sequence", 5627 &starts, &end, &startinpos, &endinpos, &exc, &s, 5628 &v, &i)) 5629 goto onError; 5630 goto nextByte; 5631 } 5632 for (j = 0; j < digits; ++j) { 5633 c = (unsigned char) s[j]; 5634 if (!Py_ISXDIGIT(c)) { 5635 endinpos = (s+j+1)-starts; 5636 if (unicode_decode_call_errorhandler( 5637 errors, &errorHandler, 5638 "unicodeescape", message, 5639 &starts, &end, &startinpos, &endinpos, &exc, &s, 5640 &v, &i)) 5641 goto onError; 5642 len = PyUnicode_GET_LENGTH(v); 5643 goto nextByte; 5644 } 5645 chr = (chr<<4) & ~0xF; 5646 if (c >= '0' && c <= '9') 5647 chr += c - '0'; 5648 else if (c >= 'a' && c <= 'f') 5649 chr += 10 + c - 'a'; 5650 else 5651 chr += 10 + c - 'A'; 5652 } 5653 s += j; 5654 if (chr == 0xffffffff && PyErr_Occurred()) 5655 /* _decoding_error will have already written into the 5656 target buffer. */ 5657 break; 5658 store: 5659 /* when we get here, chr is a 32-bit unicode character */ 5660 if (chr <= MAX_UNICODE) { 5661 WRITECHAR(chr); 5662 } else { 5663 endinpos = s-starts; 5664 if (unicode_decode_call_errorhandler( 5665 errors, &errorHandler, 5666 "unicodeescape", "illegal Unicode character", 5667 &starts, &end, &startinpos, &endinpos, &exc, &s, 5668 &v, &i)) 5669 goto onError; 5670 } 5671 break; 5672 5673 /* \N{name} */ 5674 case 'N': 5675 message = "malformed \\N character escape"; 5676 if (ucnhash_CAPI == NULL) { 5677 /* load the unicode data module */ 5678 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5679 PyUnicodeData_CAPSULE_NAME, 1); 5680 if (ucnhash_CAPI == NULL) 5681 goto ucnhashError; 5682 } 5683 if (*s == '{') { 5684 const char *start = s+1; 5685 /* look for the closing brace */ 5686 while (*s != '}' && s < end) 5687 s++; 5688 if (s > start && s < end && *s == '}') { 5689 /* found a name. look it up in the unicode database */ 5690 message = "unknown Unicode character name"; 5691 s++; 5692 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5693 &chr, 0)) 5694 goto store; 5695 } 5696 } 5697 endinpos = s-starts; 5698 if (unicode_decode_call_errorhandler( 5699 errors, &errorHandler, 5700 "unicodeescape", message, 5701 &starts, &end, &startinpos, &endinpos, &exc, &s, 5702 &v, &i)) 5703 goto onError; 5704 break; 5705 5706 default: 5707 if (s > end) { 5708 message = "\\ at end of string"; 5709 s--; 5710 endinpos = s-starts; 5711 if (unicode_decode_call_errorhandler( 5712 errors, &errorHandler, 5713 "unicodeescape", message, 5714 &starts, &end, &startinpos, &endinpos, &exc, &s, 5715 &v, &i)) 5716 goto onError; 5717 } 5718 else { 5719 WRITECHAR('\\'); 5720 WRITECHAR(s[-1]); 5721 } 5722 break; 5723 } 5724 nextByte: 5725 ; 5726 } 5727#undef WRITECHAR 5728 5729 if (unicode_resize(&v, i) < 0) 5730 goto onError; 5731 Py_XDECREF(errorHandler); 5732 Py_XDECREF(exc); 5733 return unicode_result(v); 5734 5735 ucnhashError: 5736 PyErr_SetString( 5737 PyExc_UnicodeError, 5738 "\\N escapes not supported (can't load unicodedata module)" 5739 ); 5740 Py_XDECREF(v); 5741 Py_XDECREF(errorHandler); 5742 Py_XDECREF(exc); 5743 return NULL; 5744 5745 onError: 5746 Py_XDECREF(v); 5747 Py_XDECREF(errorHandler); 5748 Py_XDECREF(exc); 5749 return NULL; 5750} 5751 5752/* Return a Unicode-Escape string version of the Unicode object. 5753 5754 If quotes is true, the string is enclosed in u"" or u'' quotes as 5755 appropriate. 5756 5757*/ 5758 5759PyObject * 5760PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5761{ 5762 Py_ssize_t i, len; 5763 PyObject *repr; 5764 char *p; 5765 int kind; 5766 void *data; 5767 Py_ssize_t expandsize = 0; 5768 5769 /* Initial allocation is based on the longest-possible character 5770 escape. 5771 5772 For UCS1 strings it's '\xxx', 4 bytes per source character. 5773 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 5774 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 5775 */ 5776 5777 if (!PyUnicode_Check(unicode)) { 5778 PyErr_BadArgument(); 5779 return NULL; 5780 } 5781 if (PyUnicode_READY(unicode) == -1) 5782 return NULL; 5783 len = PyUnicode_GET_LENGTH(unicode); 5784 kind = PyUnicode_KIND(unicode); 5785 data = PyUnicode_DATA(unicode); 5786 switch (kind) { 5787 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5788 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5789 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5790 } 5791 5792 if (len == 0) 5793 return PyBytes_FromStringAndSize(NULL, 0); 5794 5795 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5796 return PyErr_NoMemory(); 5797 5798 repr = PyBytes_FromStringAndSize(NULL, 5799 2 5800 + expandsize*len 5801 + 1); 5802 if (repr == NULL) 5803 return NULL; 5804 5805 p = PyBytes_AS_STRING(repr); 5806 5807 for (i = 0; i < len; i++) { 5808 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5809 5810 /* Escape backslashes */ 5811 if (ch == '\\') { 5812 *p++ = '\\'; 5813 *p++ = (char) ch; 5814 continue; 5815 } 5816 5817 /* Map 21-bit characters to '\U00xxxxxx' */ 5818 else if (ch >= 0x10000) { 5819 assert(ch <= MAX_UNICODE); 5820 *p++ = '\\'; 5821 *p++ = 'U'; 5822 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5823 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5824 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5825 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5826 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5827 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5828 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5829 *p++ = Py_hexdigits[ch & 0x0000000F]; 5830 continue; 5831 } 5832 5833 /* Map 16-bit characters to '\uxxxx' */ 5834 if (ch >= 256) { 5835 *p++ = '\\'; 5836 *p++ = 'u'; 5837 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5838 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5839 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5840 *p++ = Py_hexdigits[ch & 0x000F]; 5841 } 5842 5843 /* Map special whitespace to '\t', \n', '\r' */ 5844 else if (ch == '\t') { 5845 *p++ = '\\'; 5846 *p++ = 't'; 5847 } 5848 else if (ch == '\n') { 5849 *p++ = '\\'; 5850 *p++ = 'n'; 5851 } 5852 else if (ch == '\r') { 5853 *p++ = '\\'; 5854 *p++ = 'r'; 5855 } 5856 5857 /* Map non-printable US ASCII to '\xhh' */ 5858 else if (ch < ' ' || ch >= 0x7F) { 5859 *p++ = '\\'; 5860 *p++ = 'x'; 5861 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5862 *p++ = Py_hexdigits[ch & 0x000F]; 5863 } 5864 5865 /* Copy everything else as-is */ 5866 else 5867 *p++ = (char) ch; 5868 } 5869 5870 assert(p - PyBytes_AS_STRING(repr) > 0); 5871 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5872 return NULL; 5873 return repr; 5874} 5875 5876PyObject * 5877PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5878 Py_ssize_t size) 5879{ 5880 PyObject *result; 5881 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5882 if (tmp == NULL) 5883 return NULL; 5884 result = PyUnicode_AsUnicodeEscapeString(tmp); 5885 Py_DECREF(tmp); 5886 return result; 5887} 5888 5889/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5890 5891PyObject * 5892PyUnicode_DecodeRawUnicodeEscape(const char *s, 5893 Py_ssize_t size, 5894 const char *errors) 5895{ 5896 const char *starts = s; 5897 Py_ssize_t startinpos; 5898 Py_ssize_t endinpos; 5899 Py_ssize_t outpos; 5900 PyObject *v; 5901 const char *end; 5902 const char *bs; 5903 PyObject *errorHandler = NULL; 5904 PyObject *exc = NULL; 5905 5906 /* Escaped strings will always be longer than the resulting 5907 Unicode string, so we start with size here and then reduce the 5908 length after conversion to the true value. (But decoding error 5909 handler might have to resize the string) */ 5910 v = PyUnicode_New(size, 127); 5911 if (v == NULL) 5912 goto onError; 5913 if (size == 0) 5914 return v; 5915 outpos = 0; 5916 end = s + size; 5917 while (s < end) { 5918 unsigned char c; 5919 Py_UCS4 x; 5920 int i; 5921 int count; 5922 5923 /* Non-escape characters are interpreted as Unicode ordinals */ 5924 if (*s != '\\') { 5925 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 5926 goto onError; 5927 continue; 5928 } 5929 startinpos = s-starts; 5930 5931 /* \u-escapes are only interpreted iff the number of leading 5932 backslashes if odd */ 5933 bs = s; 5934 for (;s < end;) { 5935 if (*s != '\\') 5936 break; 5937 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 5938 goto onError; 5939 } 5940 if (((s - bs) & 1) == 0 || 5941 s >= end || 5942 (*s != 'u' && *s != 'U')) { 5943 continue; 5944 } 5945 outpos--; 5946 count = *s=='u' ? 4 : 8; 5947 s++; 5948 5949 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5950 for (x = 0, i = 0; i < count; ++i, ++s) { 5951 c = (unsigned char)*s; 5952 if (!Py_ISXDIGIT(c)) { 5953 endinpos = s-starts; 5954 if (unicode_decode_call_errorhandler( 5955 errors, &errorHandler, 5956 "rawunicodeescape", "truncated \\uXXXX", 5957 &starts, &end, &startinpos, &endinpos, &exc, &s, 5958 &v, &outpos)) 5959 goto onError; 5960 goto nextByte; 5961 } 5962 x = (x<<4) & ~0xF; 5963 if (c >= '0' && c <= '9') 5964 x += c - '0'; 5965 else if (c >= 'a' && c <= 'f') 5966 x += 10 + c - 'a'; 5967 else 5968 x += 10 + c - 'A'; 5969 } 5970 if (x <= MAX_UNICODE) { 5971 if (unicode_putchar(&v, &outpos, x) < 0) 5972 goto onError; 5973 } else { 5974 endinpos = s-starts; 5975 if (unicode_decode_call_errorhandler( 5976 errors, &errorHandler, 5977 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5978 &starts, &end, &startinpos, &endinpos, &exc, &s, 5979 &v, &outpos)) 5980 goto onError; 5981 } 5982 nextByte: 5983 ; 5984 } 5985 if (unicode_resize(&v, outpos) < 0) 5986 goto onError; 5987 Py_XDECREF(errorHandler); 5988 Py_XDECREF(exc); 5989 return unicode_result(v); 5990 5991 onError: 5992 Py_XDECREF(v); 5993 Py_XDECREF(errorHandler); 5994 Py_XDECREF(exc); 5995 return NULL; 5996} 5997 5998 5999PyObject * 6000PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6001{ 6002 PyObject *repr; 6003 char *p; 6004 char *q; 6005 Py_ssize_t expandsize, pos; 6006 int kind; 6007 void *data; 6008 Py_ssize_t len; 6009 6010 if (!PyUnicode_Check(unicode)) { 6011 PyErr_BadArgument(); 6012 return NULL; 6013 } 6014 if (PyUnicode_READY(unicode) == -1) 6015 return NULL; 6016 kind = PyUnicode_KIND(unicode); 6017 data = PyUnicode_DATA(unicode); 6018 len = PyUnicode_GET_LENGTH(unicode); 6019 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6020 bytes, and 1 byte characters 4. */ 6021 expandsize = kind * 2 + 2; 6022 6023 if (len > PY_SSIZE_T_MAX / expandsize) 6024 return PyErr_NoMemory(); 6025 6026 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6027 if (repr == NULL) 6028 return NULL; 6029 if (len == 0) 6030 return repr; 6031 6032 p = q = PyBytes_AS_STRING(repr); 6033 for (pos = 0; pos < len; pos++) { 6034 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6035 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6036 if (ch >= 0x10000) { 6037 assert(ch <= MAX_UNICODE); 6038 *p++ = '\\'; 6039 *p++ = 'U'; 6040 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6041 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6042 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6043 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6044 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6045 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6046 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6047 *p++ = Py_hexdigits[ch & 15]; 6048 } 6049 /* Map 16-bit characters to '\uxxxx' */ 6050 else if (ch >= 256) { 6051 *p++ = '\\'; 6052 *p++ = 'u'; 6053 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6054 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6055 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6056 *p++ = Py_hexdigits[ch & 15]; 6057 } 6058 /* Copy everything else as-is */ 6059 else 6060 *p++ = (char) ch; 6061 } 6062 6063 assert(p > q); 6064 if (_PyBytes_Resize(&repr, p - q) < 0) 6065 return NULL; 6066 return repr; 6067} 6068 6069PyObject * 6070PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6071 Py_ssize_t size) 6072{ 6073 PyObject *result; 6074 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6075 if (tmp == NULL) 6076 return NULL; 6077 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6078 Py_DECREF(tmp); 6079 return result; 6080} 6081 6082/* --- Unicode Internal Codec ------------------------------------------- */ 6083 6084PyObject * 6085_PyUnicode_DecodeUnicodeInternal(const char *s, 6086 Py_ssize_t size, 6087 const char *errors) 6088{ 6089 const char *starts = s; 6090 Py_ssize_t startinpos; 6091 Py_ssize_t endinpos; 6092 Py_ssize_t outpos; 6093 PyObject *v; 6094 const char *end; 6095 const char *reason; 6096 PyObject *errorHandler = NULL; 6097 PyObject *exc = NULL; 6098 6099 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6100 "unicode_internal codec has been deprecated", 6101 1)) 6102 return NULL; 6103 6104 /* XXX overflow detection missing */ 6105 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127); 6106 if (v == NULL) 6107 goto onError; 6108 if (PyUnicode_GET_LENGTH(v) == 0) 6109 return v; 6110 outpos = 0; 6111 end = s + size; 6112 6113 while (s < end) { 6114 Py_UNICODE uch; 6115 Py_UCS4 ch; 6116 /* We copy the raw representation one byte at a time because the 6117 pointer may be unaligned (see test_codeccallbacks). */ 6118 ((char *) &uch)[0] = s[0]; 6119 ((char *) &uch)[1] = s[1]; 6120#ifdef Py_UNICODE_WIDE 6121 ((char *) &uch)[2] = s[2]; 6122 ((char *) &uch)[3] = s[3]; 6123#endif 6124 ch = uch; 6125 6126 /* We have to sanity check the raw data, otherwise doom looms for 6127 some malformed UCS-4 data. */ 6128 if ( 6129#ifdef Py_UNICODE_WIDE 6130 ch > 0x10ffff || 6131#endif 6132 end-s < Py_UNICODE_SIZE 6133 ) 6134 { 6135 startinpos = s - starts; 6136 if (end-s < Py_UNICODE_SIZE) { 6137 endinpos = end-starts; 6138 reason = "truncated input"; 6139 } 6140 else { 6141 endinpos = s - starts + Py_UNICODE_SIZE; 6142 reason = "illegal code point (> 0x10FFFF)"; 6143 } 6144 if (unicode_decode_call_errorhandler( 6145 errors, &errorHandler, 6146 "unicode_internal", reason, 6147 &starts, &end, &startinpos, &endinpos, &exc, &s, 6148 &v, &outpos)) 6149 goto onError; 6150 continue; 6151 } 6152 6153 s += Py_UNICODE_SIZE; 6154#ifndef Py_UNICODE_WIDE 6155 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end) 6156 { 6157 Py_UNICODE uch2; 6158 ((char *) &uch2)[0] = s[0]; 6159 ((char *) &uch2)[1] = s[1]; 6160 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6161 { 6162 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6163 s += Py_UNICODE_SIZE; 6164 } 6165 } 6166#endif 6167 6168 if (unicode_putchar(&v, &outpos, ch) < 0) 6169 goto onError; 6170 } 6171 6172 if (unicode_resize(&v, outpos) < 0) 6173 goto onError; 6174 Py_XDECREF(errorHandler); 6175 Py_XDECREF(exc); 6176 return unicode_result(v); 6177 6178 onError: 6179 Py_XDECREF(v); 6180 Py_XDECREF(errorHandler); 6181 Py_XDECREF(exc); 6182 return NULL; 6183} 6184 6185/* --- Latin-1 Codec ------------------------------------------------------ */ 6186 6187PyObject * 6188PyUnicode_DecodeLatin1(const char *s, 6189 Py_ssize_t size, 6190 const char *errors) 6191{ 6192 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 6193 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6194} 6195 6196/* create or adjust a UnicodeEncodeError */ 6197static void 6198make_encode_exception(PyObject **exceptionObject, 6199 const char *encoding, 6200 PyObject *unicode, 6201 Py_ssize_t startpos, Py_ssize_t endpos, 6202 const char *reason) 6203{ 6204 if (*exceptionObject == NULL) { 6205 *exceptionObject = PyObject_CallFunction( 6206 PyExc_UnicodeEncodeError, "sOnns", 6207 encoding, unicode, startpos, endpos, reason); 6208 } 6209 else { 6210 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6211 goto onError; 6212 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6213 goto onError; 6214 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6215 goto onError; 6216 return; 6217 onError: 6218 Py_DECREF(*exceptionObject); 6219 *exceptionObject = NULL; 6220 } 6221} 6222 6223/* raises a UnicodeEncodeError */ 6224static void 6225raise_encode_exception(PyObject **exceptionObject, 6226 const char *encoding, 6227 PyObject *unicode, 6228 Py_ssize_t startpos, Py_ssize_t endpos, 6229 const char *reason) 6230{ 6231 make_encode_exception(exceptionObject, 6232 encoding, unicode, startpos, endpos, reason); 6233 if (*exceptionObject != NULL) 6234 PyCodec_StrictErrors(*exceptionObject); 6235} 6236 6237/* error handling callback helper: 6238 build arguments, call the callback and check the arguments, 6239 put the result into newpos and return the replacement string, which 6240 has to be freed by the caller */ 6241static PyObject * 6242unicode_encode_call_errorhandler(const char *errors, 6243 PyObject **errorHandler, 6244 const char *encoding, const char *reason, 6245 PyObject *unicode, PyObject **exceptionObject, 6246 Py_ssize_t startpos, Py_ssize_t endpos, 6247 Py_ssize_t *newpos) 6248{ 6249 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6250 Py_ssize_t len; 6251 PyObject *restuple; 6252 PyObject *resunicode; 6253 6254 if (*errorHandler == NULL) { 6255 *errorHandler = PyCodec_LookupError(errors); 6256 if (*errorHandler == NULL) 6257 return NULL; 6258 } 6259 6260 if (PyUnicode_READY(unicode) == -1) 6261 return NULL; 6262 len = PyUnicode_GET_LENGTH(unicode); 6263 6264 make_encode_exception(exceptionObject, 6265 encoding, unicode, startpos, endpos, reason); 6266 if (*exceptionObject == NULL) 6267 return NULL; 6268 6269 restuple = PyObject_CallFunctionObjArgs( 6270 *errorHandler, *exceptionObject, NULL); 6271 if (restuple == NULL) 6272 return NULL; 6273 if (!PyTuple_Check(restuple)) { 6274 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6275 Py_DECREF(restuple); 6276 return NULL; 6277 } 6278 if (!PyArg_ParseTuple(restuple, argparse, 6279 &resunicode, newpos)) { 6280 Py_DECREF(restuple); 6281 return NULL; 6282 } 6283 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6284 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6285 Py_DECREF(restuple); 6286 return NULL; 6287 } 6288 if (*newpos<0) 6289 *newpos = len + *newpos; 6290 if (*newpos<0 || *newpos>len) { 6291 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6292 Py_DECREF(restuple); 6293 return NULL; 6294 } 6295 Py_INCREF(resunicode); 6296 Py_DECREF(restuple); 6297 return resunicode; 6298} 6299 6300static PyObject * 6301unicode_encode_ucs1(PyObject *unicode, 6302 const char *errors, 6303 unsigned int limit) 6304{ 6305 /* input state */ 6306 Py_ssize_t pos=0, size; 6307 int kind; 6308 void *data; 6309 /* output object */ 6310 PyObject *res; 6311 /* pointer into the output */ 6312 char *str; 6313 /* current output position */ 6314 Py_ssize_t ressize; 6315 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6316 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 6317 PyObject *errorHandler = NULL; 6318 PyObject *exc = NULL; 6319 /* the following variable is used for caching string comparisons 6320 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6321 int known_errorHandler = -1; 6322 6323 if (PyUnicode_READY(unicode) == -1) 6324 return NULL; 6325 size = PyUnicode_GET_LENGTH(unicode); 6326 kind = PyUnicode_KIND(unicode); 6327 data = PyUnicode_DATA(unicode); 6328 /* allocate enough for a simple encoding without 6329 replacements, if we need more, we'll resize */ 6330 if (size == 0) 6331 return PyBytes_FromStringAndSize(NULL, 0); 6332 res = PyBytes_FromStringAndSize(NULL, size); 6333 if (res == NULL) 6334 return NULL; 6335 str = PyBytes_AS_STRING(res); 6336 ressize = size; 6337 6338 while (pos < size) { 6339 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6340 6341 /* can we encode this? */ 6342 if (c<limit) { 6343 /* no overflow check, because we know that the space is enough */ 6344 *str++ = (char)c; 6345 ++pos; 6346 } 6347 else { 6348 Py_ssize_t requiredsize; 6349 PyObject *repunicode; 6350 Py_ssize_t repsize, newpos, respos, i; 6351 /* startpos for collecting unencodable chars */ 6352 Py_ssize_t collstart = pos; 6353 Py_ssize_t collend = pos; 6354 /* find all unecodable characters */ 6355 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) 6356 ++collend; 6357 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 6358 if (known_errorHandler==-1) { 6359 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6360 known_errorHandler = 1; 6361 else if (!strcmp(errors, "replace")) 6362 known_errorHandler = 2; 6363 else if (!strcmp(errors, "ignore")) 6364 known_errorHandler = 3; 6365 else if (!strcmp(errors, "xmlcharrefreplace")) 6366 known_errorHandler = 4; 6367 else 6368 known_errorHandler = 0; 6369 } 6370 switch (known_errorHandler) { 6371 case 1: /* strict */ 6372 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6373 goto onError; 6374 case 2: /* replace */ 6375 while (collstart++<collend) 6376 *str++ = '?'; /* fall through */ 6377 case 3: /* ignore */ 6378 pos = collend; 6379 break; 6380 case 4: /* xmlcharrefreplace */ 6381 respos = str - PyBytes_AS_STRING(res); 6382 /* determine replacement size */ 6383 for (i = collstart, repsize = 0; i < collend; ++i) { 6384 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6385 if (ch < 10) 6386 repsize += 2+1+1; 6387 else if (ch < 100) 6388 repsize += 2+2+1; 6389 else if (ch < 1000) 6390 repsize += 2+3+1; 6391 else if (ch < 10000) 6392 repsize += 2+4+1; 6393 else if (ch < 100000) 6394 repsize += 2+5+1; 6395 else if (ch < 1000000) 6396 repsize += 2+6+1; 6397 else { 6398 assert(ch <= MAX_UNICODE); 6399 repsize += 2+7+1; 6400 } 6401 } 6402 requiredsize = respos+repsize+(size-collend); 6403 if (requiredsize > ressize) { 6404 if (requiredsize<2*ressize) 6405 requiredsize = 2*ressize; 6406 if (_PyBytes_Resize(&res, requiredsize)) 6407 goto onError; 6408 str = PyBytes_AS_STRING(res) + respos; 6409 ressize = requiredsize; 6410 } 6411 /* generate replacement */ 6412 for (i = collstart; i < collend; ++i) { 6413 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6414 } 6415 pos = collend; 6416 break; 6417 default: 6418 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6419 encoding, reason, unicode, &exc, 6420 collstart, collend, &newpos); 6421 if (repunicode == NULL || (PyUnicode_Check(repunicode) && 6422 PyUnicode_READY(repunicode) == -1)) 6423 goto onError; 6424 if (PyBytes_Check(repunicode)) { 6425 /* Directly copy bytes result to output. */ 6426 repsize = PyBytes_Size(repunicode); 6427 if (repsize > 1) { 6428 /* Make room for all additional bytes. */ 6429 respos = str - PyBytes_AS_STRING(res); 6430 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6431 Py_DECREF(repunicode); 6432 goto onError; 6433 } 6434 str = PyBytes_AS_STRING(res) + respos; 6435 ressize += repsize-1; 6436 } 6437 memcpy(str, PyBytes_AsString(repunicode), repsize); 6438 str += repsize; 6439 pos = newpos; 6440 Py_DECREF(repunicode); 6441 break; 6442 } 6443 /* need more space? (at least enough for what we 6444 have+the replacement+the rest of the string, so 6445 we won't have to check space for encodable characters) */ 6446 respos = str - PyBytes_AS_STRING(res); 6447 repsize = PyUnicode_GET_LENGTH(repunicode); 6448 requiredsize = respos+repsize+(size-collend); 6449 if (requiredsize > ressize) { 6450 if (requiredsize<2*ressize) 6451 requiredsize = 2*ressize; 6452 if (_PyBytes_Resize(&res, requiredsize)) { 6453 Py_DECREF(repunicode); 6454 goto onError; 6455 } 6456 str = PyBytes_AS_STRING(res) + respos; 6457 ressize = requiredsize; 6458 } 6459 /* check if there is anything unencodable in the replacement 6460 and copy it to the output */ 6461 for (i = 0; repsize-->0; ++i, ++str) { 6462 c = PyUnicode_READ_CHAR(repunicode, i); 6463 if (c >= limit) { 6464 raise_encode_exception(&exc, encoding, unicode, 6465 pos, pos+1, reason); 6466 Py_DECREF(repunicode); 6467 goto onError; 6468 } 6469 *str = (char)c; 6470 } 6471 pos = newpos; 6472 Py_DECREF(repunicode); 6473 } 6474 } 6475 } 6476 /* Resize if we allocated to much */ 6477 size = str - PyBytes_AS_STRING(res); 6478 if (size < ressize) { /* If this falls res will be NULL */ 6479 assert(size >= 0); 6480 if (_PyBytes_Resize(&res, size) < 0) 6481 goto onError; 6482 } 6483 6484 Py_XDECREF(errorHandler); 6485 Py_XDECREF(exc); 6486 return res; 6487 6488 onError: 6489 Py_XDECREF(res); 6490 Py_XDECREF(errorHandler); 6491 Py_XDECREF(exc); 6492 return NULL; 6493} 6494 6495/* Deprecated */ 6496PyObject * 6497PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6498 Py_ssize_t size, 6499 const char *errors) 6500{ 6501 PyObject *result; 6502 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6503 if (unicode == NULL) 6504 return NULL; 6505 result = unicode_encode_ucs1(unicode, errors, 256); 6506 Py_DECREF(unicode); 6507 return result; 6508} 6509 6510PyObject * 6511_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6512{ 6513 if (!PyUnicode_Check(unicode)) { 6514 PyErr_BadArgument(); 6515 return NULL; 6516 } 6517 if (PyUnicode_READY(unicode) == -1) 6518 return NULL; 6519 /* Fast path: if it is a one-byte string, construct 6520 bytes object directly. */ 6521 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6522 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6523 PyUnicode_GET_LENGTH(unicode)); 6524 /* Non-Latin-1 characters present. Defer to above function to 6525 raise the exception. */ 6526 return unicode_encode_ucs1(unicode, errors, 256); 6527} 6528 6529PyObject* 6530PyUnicode_AsLatin1String(PyObject *unicode) 6531{ 6532 return _PyUnicode_AsLatin1String(unicode, NULL); 6533} 6534 6535/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6536 6537PyObject * 6538PyUnicode_DecodeASCII(const char *s, 6539 Py_ssize_t size, 6540 const char *errors) 6541{ 6542 const char *starts = s; 6543 PyObject *unicode; 6544 int kind; 6545 void *data; 6546 Py_ssize_t startinpos; 6547 Py_ssize_t endinpos; 6548 Py_ssize_t outpos; 6549 const char *e; 6550 PyObject *errorHandler = NULL; 6551 PyObject *exc = NULL; 6552 6553 if (size == 0) { 6554 Py_INCREF(unicode_empty); 6555 return unicode_empty; 6556 } 6557 6558 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6559 if (size == 1 && (unsigned char)s[0] < 128) 6560 return get_latin1_char((unsigned char)s[0]); 6561 6562 unicode = PyUnicode_New(size, 127); 6563 if (unicode == NULL) 6564 goto onError; 6565 6566 e = s + size; 6567 data = PyUnicode_1BYTE_DATA(unicode); 6568 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6569 if (outpos == size) 6570 return unicode; 6571 6572 s += outpos; 6573 kind = PyUnicode_1BYTE_KIND; 6574 while (s < e) { 6575 register unsigned char c = (unsigned char)*s; 6576 if (c < 128) { 6577 PyUnicode_WRITE(kind, data, outpos++, c); 6578 ++s; 6579 } 6580 else { 6581 startinpos = s-starts; 6582 endinpos = startinpos + 1; 6583 if (unicode_decode_call_errorhandler( 6584 errors, &errorHandler, 6585 "ascii", "ordinal not in range(128)", 6586 &starts, &e, &startinpos, &endinpos, &exc, &s, 6587 &unicode, &outpos)) 6588 goto onError; 6589 kind = PyUnicode_KIND(unicode); 6590 data = PyUnicode_DATA(unicode); 6591 } 6592 } 6593 if (unicode_resize(&unicode, outpos) < 0) 6594 goto onError; 6595 Py_XDECREF(errorHandler); 6596 Py_XDECREF(exc); 6597 assert(_PyUnicode_CheckConsistency(unicode, 1)); 6598 return unicode; 6599 6600 onError: 6601 Py_XDECREF(unicode); 6602 Py_XDECREF(errorHandler); 6603 Py_XDECREF(exc); 6604 return NULL; 6605} 6606 6607/* Deprecated */ 6608PyObject * 6609PyUnicode_EncodeASCII(const Py_UNICODE *p, 6610 Py_ssize_t size, 6611 const char *errors) 6612{ 6613 PyObject *result; 6614 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6615 if (unicode == NULL) 6616 return NULL; 6617 result = unicode_encode_ucs1(unicode, errors, 128); 6618 Py_DECREF(unicode); 6619 return result; 6620} 6621 6622PyObject * 6623_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6624{ 6625 if (!PyUnicode_Check(unicode)) { 6626 PyErr_BadArgument(); 6627 return NULL; 6628 } 6629 if (PyUnicode_READY(unicode) == -1) 6630 return NULL; 6631 /* Fast path: if it is an ASCII-only string, construct bytes object 6632 directly. Else defer to above function to raise the exception. */ 6633 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6634 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6635 PyUnicode_GET_LENGTH(unicode)); 6636 return unicode_encode_ucs1(unicode, errors, 128); 6637} 6638 6639PyObject * 6640PyUnicode_AsASCIIString(PyObject *unicode) 6641{ 6642 return _PyUnicode_AsASCIIString(unicode, NULL); 6643} 6644 6645#ifdef HAVE_MBCS 6646 6647/* --- MBCS codecs for Windows -------------------------------------------- */ 6648 6649#if SIZEOF_INT < SIZEOF_SIZE_T 6650#define NEED_RETRY 6651#endif 6652 6653#ifndef WC_ERR_INVALID_CHARS 6654# define WC_ERR_INVALID_CHARS 0x0080 6655#endif 6656 6657static char* 6658code_page_name(UINT code_page, PyObject **obj) 6659{ 6660 *obj = NULL; 6661 if (code_page == CP_ACP) 6662 return "mbcs"; 6663 if (code_page == CP_UTF7) 6664 return "CP_UTF7"; 6665 if (code_page == CP_UTF8) 6666 return "CP_UTF8"; 6667 6668 *obj = PyBytes_FromFormat("cp%u", code_page); 6669 if (*obj == NULL) 6670 return NULL; 6671 return PyBytes_AS_STRING(*obj); 6672} 6673 6674static int 6675is_dbcs_lead_byte(UINT code_page, const char *s, int offset) 6676{ 6677 const char *curr = s + offset; 6678 const char *prev; 6679 6680 if (!IsDBCSLeadByteEx(code_page, *curr)) 6681 return 0; 6682 6683 prev = CharPrevExA(code_page, s, curr, 0); 6684 if (prev == curr) 6685 return 1; 6686 /* FIXME: This code is limited to "true" double-byte encodings, 6687 as it assumes an incomplete character consists of a single 6688 byte. */ 6689 if (curr - prev == 2) 6690 return 1; 6691 if (!IsDBCSLeadByteEx(code_page, *prev)) 6692 return 1; 6693 return 0; 6694} 6695 6696static DWORD 6697decode_code_page_flags(UINT code_page) 6698{ 6699 if (code_page == CP_UTF7) { 6700 /* The CP_UTF7 decoder only supports flags=0 */ 6701 return 0; 6702 } 6703 else 6704 return MB_ERR_INVALID_CHARS; 6705} 6706 6707/* 6708 * Decode a byte string from a Windows code page into unicode object in strict 6709 * mode. 6710 * 6711 * Returns consumed size if succeed, returns -2 on decode error, or raise a 6712 * WindowsError and returns -1 on other error. 6713 */ 6714static int 6715decode_code_page_strict(UINT code_page, 6716 PyObject **v, 6717 const char *in, 6718 int insize) 6719{ 6720 const DWORD flags = decode_code_page_flags(code_page); 6721 wchar_t *out; 6722 DWORD outsize; 6723 6724 /* First get the size of the result */ 6725 assert(insize > 0); 6726 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6727 if (outsize <= 0) 6728 goto error; 6729 6730 if (*v == NULL) { 6731 /* Create unicode object */ 6732 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6733 *v = (PyObject*)_PyUnicode_New(outsize); 6734 if (*v == NULL) 6735 return -1; 6736 out = PyUnicode_AS_UNICODE(*v); 6737 } 6738 else { 6739 /* Extend unicode object */ 6740 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6741 if (unicode_resize(v, n + outsize) < 0) 6742 return -1; 6743 out = PyUnicode_AS_UNICODE(*v) + n; 6744 } 6745 6746 /* Do the conversion */ 6747 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6748 if (outsize <= 0) 6749 goto error; 6750 return insize; 6751 6752error: 6753 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6754 return -2; 6755 PyErr_SetFromWindowsErr(0); 6756 return -1; 6757} 6758 6759/* 6760 * Decode a byte string from a code page into unicode object with an error 6761 * handler. 6762 * 6763 * Returns consumed size if succeed, or raise a WindowsError or 6764 * UnicodeDecodeError exception and returns -1 on error. 6765 */ 6766static int 6767decode_code_page_errors(UINT code_page, 6768 PyObject **v, 6769 const char *in, const int size, 6770 const char *errors) 6771{ 6772 const char *startin = in; 6773 const char *endin = in + size; 6774 const DWORD flags = decode_code_page_flags(code_page); 6775 /* Ideally, we should get reason from FormatMessage. This is the Windows 6776 2000 English version of the message. */ 6777 const char *reason = "No mapping for the Unicode character exists " 6778 "in the target code page."; 6779 /* each step cannot decode more than 1 character, but a character can be 6780 represented as a surrogate pair */ 6781 wchar_t buffer[2], *startout, *out; 6782 int insize, outsize; 6783 PyObject *errorHandler = NULL; 6784 PyObject *exc = NULL; 6785 PyObject *encoding_obj = NULL; 6786 char *encoding; 6787 DWORD err; 6788 int ret = -1; 6789 6790 assert(size > 0); 6791 6792 encoding = code_page_name(code_page, &encoding_obj); 6793 if (encoding == NULL) 6794 return -1; 6795 6796 if (errors == NULL || strcmp(errors, "strict") == 0) { 6797 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6798 UnicodeDecodeError. */ 6799 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6800 if (exc != NULL) { 6801 PyCodec_StrictErrors(exc); 6802 Py_CLEAR(exc); 6803 } 6804 goto error; 6805 } 6806 6807 if (*v == NULL) { 6808 /* Create unicode object */ 6809 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6810 PyErr_NoMemory(); 6811 goto error; 6812 } 6813 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6814 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6815 if (*v == NULL) 6816 goto error; 6817 startout = PyUnicode_AS_UNICODE(*v); 6818 } 6819 else { 6820 /* Extend unicode object */ 6821 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6822 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6823 PyErr_NoMemory(); 6824 goto error; 6825 } 6826 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6827 goto error; 6828 startout = PyUnicode_AS_UNICODE(*v) + n; 6829 } 6830 6831 /* Decode the byte string character per character */ 6832 out = startout; 6833 while (in < endin) 6834 { 6835 /* Decode a character */ 6836 insize = 1; 6837 do 6838 { 6839 outsize = MultiByteToWideChar(code_page, flags, 6840 in, insize, 6841 buffer, Py_ARRAY_LENGTH(buffer)); 6842 if (outsize > 0) 6843 break; 6844 err = GetLastError(); 6845 if (err != ERROR_NO_UNICODE_TRANSLATION 6846 && err != ERROR_INSUFFICIENT_BUFFER) 6847 { 6848 PyErr_SetFromWindowsErr(0); 6849 goto error; 6850 } 6851 insize++; 6852 } 6853 /* 4=maximum length of a UTF-8 sequence */ 6854 while (insize <= 4 && (in + insize) <= endin); 6855 6856 if (outsize <= 0) { 6857 Py_ssize_t startinpos, endinpos, outpos; 6858 6859 startinpos = in - startin; 6860 endinpos = startinpos + 1; 6861 outpos = out - PyUnicode_AS_UNICODE(*v); 6862 if (unicode_decode_call_errorhandler( 6863 errors, &errorHandler, 6864 encoding, reason, 6865 &startin, &endin, &startinpos, &endinpos, &exc, &in, 6866 v, &outpos)) 6867 { 6868 goto error; 6869 } 6870 out = PyUnicode_AS_UNICODE(*v) + outpos; 6871 } 6872 else { 6873 in += insize; 6874 memcpy(out, buffer, outsize * sizeof(wchar_t)); 6875 out += outsize; 6876 } 6877 } 6878 6879 /* write a NUL character at the end */ 6880 *out = 0; 6881 6882 /* Extend unicode object */ 6883 outsize = out - startout; 6884 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 6885 if (unicode_resize(v, outsize) < 0) 6886 goto error; 6887 ret = size; 6888 6889error: 6890 Py_XDECREF(encoding_obj); 6891 Py_XDECREF(errorHandler); 6892 Py_XDECREF(exc); 6893 return ret; 6894} 6895 6896static PyObject * 6897decode_code_page_stateful(int code_page, 6898 const char *s, Py_ssize_t size, 6899 const char *errors, Py_ssize_t *consumed) 6900{ 6901 PyObject *v = NULL; 6902 int chunk_size, final, converted, done; 6903 6904 if (code_page < 0) { 6905 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 6906 return NULL; 6907 } 6908 6909 if (consumed) 6910 *consumed = 0; 6911 6912 do 6913 { 6914#ifdef NEED_RETRY 6915 if (size > INT_MAX) { 6916 chunk_size = INT_MAX; 6917 final = 0; 6918 done = 0; 6919 } 6920 else 6921#endif 6922 { 6923 chunk_size = (int)size; 6924 final = (consumed == NULL); 6925 done = 1; 6926 } 6927 6928 /* Skip trailing lead-byte unless 'final' is set */ 6929 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) 6930 --chunk_size; 6931 6932 if (chunk_size == 0 && done) { 6933 if (v != NULL) 6934 break; 6935 Py_INCREF(unicode_empty); 6936 return unicode_empty; 6937 } 6938 6939 6940 converted = decode_code_page_strict(code_page, &v, 6941 s, chunk_size); 6942 if (converted == -2) 6943 converted = decode_code_page_errors(code_page, &v, 6944 s, chunk_size, 6945 errors); 6946 assert(converted != 0); 6947 6948 if (converted < 0) { 6949 Py_XDECREF(v); 6950 return NULL; 6951 } 6952 6953 if (consumed) 6954 *consumed += converted; 6955 6956 s += converted; 6957 size -= converted; 6958 } while (!done); 6959 6960 return unicode_result(v); 6961} 6962 6963PyObject * 6964PyUnicode_DecodeCodePageStateful(int code_page, 6965 const char *s, 6966 Py_ssize_t size, 6967 const char *errors, 6968 Py_ssize_t *consumed) 6969{ 6970 return decode_code_page_stateful(code_page, s, size, errors, consumed); 6971} 6972 6973PyObject * 6974PyUnicode_DecodeMBCSStateful(const char *s, 6975 Py_ssize_t size, 6976 const char *errors, 6977 Py_ssize_t *consumed) 6978{ 6979 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 6980} 6981 6982PyObject * 6983PyUnicode_DecodeMBCS(const char *s, 6984 Py_ssize_t size, 6985 const char *errors) 6986{ 6987 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 6988} 6989 6990static DWORD 6991encode_code_page_flags(UINT code_page, const char *errors) 6992{ 6993 if (code_page == CP_UTF8) { 6994 if (winver.dwMajorVersion >= 6) 6995 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 6996 and later */ 6997 return WC_ERR_INVALID_CHARS; 6998 else 6999 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 7000 return 0; 7001 } 7002 else if (code_page == CP_UTF7) { 7003 /* CP_UTF7 only supports flags=0 */ 7004 return 0; 7005 } 7006 else { 7007 if (errors != NULL && strcmp(errors, "replace") == 0) 7008 return 0; 7009 else 7010 return WC_NO_BEST_FIT_CHARS; 7011 } 7012} 7013 7014/* 7015 * Encode a Unicode string to a Windows code page into a byte string in strict 7016 * mode. 7017 * 7018 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7019 * a WindowsError and returns -1 on other error. 7020 */ 7021static int 7022encode_code_page_strict(UINT code_page, PyObject **outbytes, 7023 PyObject *unicode, Py_ssize_t offset, int len, 7024 const char* errors) 7025{ 7026 BOOL usedDefaultChar = FALSE; 7027 BOOL *pusedDefaultChar = &usedDefaultChar; 7028 int outsize; 7029 PyObject *exc = NULL; 7030 wchar_t *p; 7031 Py_ssize_t size; 7032 const DWORD flags = encode_code_page_flags(code_page, NULL); 7033 char *out; 7034 /* Create a substring so that we can get the UTF-16 representation 7035 of just the slice under consideration. */ 7036 PyObject *substring; 7037 7038 assert(len > 0); 7039 7040 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7041 pusedDefaultChar = &usedDefaultChar; 7042 else 7043 pusedDefaultChar = NULL; 7044 7045 substring = PyUnicode_Substring(unicode, offset, offset+len); 7046 if (substring == NULL) 7047 return -1; 7048 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7049 if (p == NULL) { 7050 Py_DECREF(substring); 7051 return -1; 7052 } 7053 7054 /* First get the size of the result */ 7055 outsize = WideCharToMultiByte(code_page, flags, 7056 p, size, 7057 NULL, 0, 7058 NULL, pusedDefaultChar); 7059 if (outsize <= 0) 7060 goto error; 7061 /* If we used a default char, then we failed! */ 7062 if (pusedDefaultChar && *pusedDefaultChar) { 7063 Py_DECREF(substring); 7064 return -2; 7065 } 7066 7067 if (*outbytes == NULL) { 7068 /* Create string object */ 7069 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7070 if (*outbytes == NULL) { 7071 Py_DECREF(substring); 7072 return -1; 7073 } 7074 out = PyBytes_AS_STRING(*outbytes); 7075 } 7076 else { 7077 /* Extend string object */ 7078 const Py_ssize_t n = PyBytes_Size(*outbytes); 7079 if (outsize > PY_SSIZE_T_MAX - n) { 7080 PyErr_NoMemory(); 7081 Py_DECREF(substring); 7082 return -1; 7083 } 7084 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7085 Py_DECREF(substring); 7086 return -1; 7087 } 7088 out = PyBytes_AS_STRING(*outbytes) + n; 7089 } 7090 7091 /* Do the conversion */ 7092 outsize = WideCharToMultiByte(code_page, flags, 7093 p, size, 7094 out, outsize, 7095 NULL, pusedDefaultChar); 7096 Py_CLEAR(substring); 7097 if (outsize <= 0) 7098 goto error; 7099 if (pusedDefaultChar && *pusedDefaultChar) 7100 return -2; 7101 return 0; 7102 7103error: 7104 Py_XDECREF(substring); 7105 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7106 return -2; 7107 PyErr_SetFromWindowsErr(0); 7108 return -1; 7109} 7110 7111/* 7112 * Encode a Unicode string to a Windows code page into a byte string using a 7113 * error handler. 7114 * 7115 * Returns consumed characters if succeed, or raise a WindowsError and returns 7116 * -1 on other error. 7117 */ 7118static int 7119encode_code_page_errors(UINT code_page, PyObject **outbytes, 7120 PyObject *unicode, Py_ssize_t unicode_offset, 7121 Py_ssize_t insize, const char* errors) 7122{ 7123 const DWORD flags = encode_code_page_flags(code_page, errors); 7124 Py_ssize_t pos = unicode_offset; 7125 Py_ssize_t endin = unicode_offset + insize; 7126 /* Ideally, we should get reason from FormatMessage. This is the Windows 7127 2000 English version of the message. */ 7128 const char *reason = "invalid character"; 7129 /* 4=maximum length of a UTF-8 sequence */ 7130 char buffer[4]; 7131 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7132 Py_ssize_t outsize; 7133 char *out; 7134 PyObject *errorHandler = NULL; 7135 PyObject *exc = NULL; 7136 PyObject *encoding_obj = NULL; 7137 char *encoding; 7138 Py_ssize_t newpos, newoutsize; 7139 PyObject *rep; 7140 int ret = -1; 7141 7142 assert(insize > 0); 7143 7144 encoding = code_page_name(code_page, &encoding_obj); 7145 if (encoding == NULL) 7146 return -1; 7147 7148 if (errors == NULL || strcmp(errors, "strict") == 0) { 7149 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7150 then we raise a UnicodeEncodeError. */ 7151 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7152 if (exc != NULL) { 7153 PyCodec_StrictErrors(exc); 7154 Py_DECREF(exc); 7155 } 7156 Py_XDECREF(encoding_obj); 7157 return -1; 7158 } 7159 7160 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7161 pusedDefaultChar = &usedDefaultChar; 7162 else 7163 pusedDefaultChar = NULL; 7164 7165 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7166 PyErr_NoMemory(); 7167 goto error; 7168 } 7169 outsize = insize * Py_ARRAY_LENGTH(buffer); 7170 7171 if (*outbytes == NULL) { 7172 /* Create string object */ 7173 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7174 if (*outbytes == NULL) 7175 goto error; 7176 out = PyBytes_AS_STRING(*outbytes); 7177 } 7178 else { 7179 /* Extend string object */ 7180 Py_ssize_t n = PyBytes_Size(*outbytes); 7181 if (n > PY_SSIZE_T_MAX - outsize) { 7182 PyErr_NoMemory(); 7183 goto error; 7184 } 7185 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7186 goto error; 7187 out = PyBytes_AS_STRING(*outbytes) + n; 7188 } 7189 7190 /* Encode the string character per character */ 7191 while (pos < endin) 7192 { 7193 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7194 wchar_t chars[2]; 7195 int charsize; 7196 if (ch < 0x10000) { 7197 chars[0] = (wchar_t)ch; 7198 charsize = 1; 7199 } 7200 else { 7201 ch -= 0x10000; 7202 chars[0] = 0xd800 + (ch >> 10); 7203 chars[1] = 0xdc00 + (ch & 0x3ff); 7204 charsize = 2; 7205 } 7206 7207 outsize = WideCharToMultiByte(code_page, flags, 7208 chars, charsize, 7209 buffer, Py_ARRAY_LENGTH(buffer), 7210 NULL, pusedDefaultChar); 7211 if (outsize > 0) { 7212 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7213 { 7214 pos++; 7215 memcpy(out, buffer, outsize); 7216 out += outsize; 7217 continue; 7218 } 7219 } 7220 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7221 PyErr_SetFromWindowsErr(0); 7222 goto error; 7223 } 7224 7225 rep = unicode_encode_call_errorhandler( 7226 errors, &errorHandler, encoding, reason, 7227 unicode, &exc, 7228 pos, pos + 1, &newpos); 7229 if (rep == NULL) 7230 goto error; 7231 pos = newpos; 7232 7233 if (PyBytes_Check(rep)) { 7234 outsize = PyBytes_GET_SIZE(rep); 7235 if (outsize != 1) { 7236 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7237 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7238 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7239 Py_DECREF(rep); 7240 goto error; 7241 } 7242 out = PyBytes_AS_STRING(*outbytes) + offset; 7243 } 7244 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7245 out += outsize; 7246 } 7247 else { 7248 Py_ssize_t i; 7249 enum PyUnicode_Kind kind; 7250 void *data; 7251 7252 if (PyUnicode_READY(rep) == -1) { 7253 Py_DECREF(rep); 7254 goto error; 7255 } 7256 7257 outsize = PyUnicode_GET_LENGTH(rep); 7258 if (outsize != 1) { 7259 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7260 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7261 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7262 Py_DECREF(rep); 7263 goto error; 7264 } 7265 out = PyBytes_AS_STRING(*outbytes) + offset; 7266 } 7267 kind = PyUnicode_KIND(rep); 7268 data = PyUnicode_DATA(rep); 7269 for (i=0; i < outsize; i++) { 7270 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7271 if (ch > 127) { 7272 raise_encode_exception(&exc, 7273 encoding, unicode, 7274 pos, pos + 1, 7275 "unable to encode error handler result to ASCII"); 7276 Py_DECREF(rep); 7277 goto error; 7278 } 7279 *out = (unsigned char)ch; 7280 out++; 7281 } 7282 } 7283 Py_DECREF(rep); 7284 } 7285 /* write a NUL byte */ 7286 *out = 0; 7287 outsize = out - PyBytes_AS_STRING(*outbytes); 7288 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7289 if (_PyBytes_Resize(outbytes, outsize) < 0) 7290 goto error; 7291 ret = 0; 7292 7293error: 7294 Py_XDECREF(encoding_obj); 7295 Py_XDECREF(errorHandler); 7296 Py_XDECREF(exc); 7297 return ret; 7298} 7299 7300static PyObject * 7301encode_code_page(int code_page, 7302 PyObject *unicode, 7303 const char *errors) 7304{ 7305 Py_ssize_t len; 7306 PyObject *outbytes = NULL; 7307 Py_ssize_t offset; 7308 int chunk_len, ret, done; 7309 7310 if (PyUnicode_READY(unicode) == -1) 7311 return NULL; 7312 len = PyUnicode_GET_LENGTH(unicode); 7313 7314 if (code_page < 0) { 7315 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7316 return NULL; 7317 } 7318 7319 if (len == 0) 7320 return PyBytes_FromStringAndSize(NULL, 0); 7321 7322 offset = 0; 7323 do 7324 { 7325#ifdef NEED_RETRY 7326 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7327 chunks. */ 7328 if (len > INT_MAX/2) { 7329 chunk_len = INT_MAX/2; 7330 done = 0; 7331 } 7332 else 7333#endif 7334 { 7335 chunk_len = (int)len; 7336 done = 1; 7337 } 7338 7339 ret = encode_code_page_strict(code_page, &outbytes, 7340 unicode, offset, chunk_len, 7341 errors); 7342 if (ret == -2) 7343 ret = encode_code_page_errors(code_page, &outbytes, 7344 unicode, offset, 7345 chunk_len, errors); 7346 if (ret < 0) { 7347 Py_XDECREF(outbytes); 7348 return NULL; 7349 } 7350 7351 offset += chunk_len; 7352 len -= chunk_len; 7353 } while (!done); 7354 7355 return outbytes; 7356} 7357 7358PyObject * 7359PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7360 Py_ssize_t size, 7361 const char *errors) 7362{ 7363 PyObject *unicode, *res; 7364 unicode = PyUnicode_FromUnicode(p, size); 7365 if (unicode == NULL) 7366 return NULL; 7367 res = encode_code_page(CP_ACP, unicode, errors); 7368 Py_DECREF(unicode); 7369 return res; 7370} 7371 7372PyObject * 7373PyUnicode_EncodeCodePage(int code_page, 7374 PyObject *unicode, 7375 const char *errors) 7376{ 7377 return encode_code_page(code_page, unicode, errors); 7378} 7379 7380PyObject * 7381PyUnicode_AsMBCSString(PyObject *unicode) 7382{ 7383 if (!PyUnicode_Check(unicode)) { 7384 PyErr_BadArgument(); 7385 return NULL; 7386 } 7387 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7388} 7389 7390#undef NEED_RETRY 7391 7392#endif /* HAVE_MBCS */ 7393 7394/* --- Character Mapping Codec -------------------------------------------- */ 7395 7396PyObject * 7397PyUnicode_DecodeCharmap(const char *s, 7398 Py_ssize_t size, 7399 PyObject *mapping, 7400 const char *errors) 7401{ 7402 const char *starts = s; 7403 Py_ssize_t startinpos; 7404 Py_ssize_t endinpos; 7405 Py_ssize_t outpos; 7406 const char *e; 7407 PyObject *v; 7408 Py_ssize_t extrachars = 0; 7409 PyObject *errorHandler = NULL; 7410 PyObject *exc = NULL; 7411 7412 /* Default to Latin-1 */ 7413 if (mapping == NULL) 7414 return PyUnicode_DecodeLatin1(s, size, errors); 7415 7416 v = PyUnicode_New(size, 127); 7417 if (v == NULL) 7418 goto onError; 7419 if (size == 0) 7420 return v; 7421 outpos = 0; 7422 e = s + size; 7423 if (PyUnicode_CheckExact(mapping)) { 7424 Py_ssize_t maplen; 7425 enum PyUnicode_Kind mapkind; 7426 void *mapdata; 7427 Py_UCS4 x; 7428 7429 if (PyUnicode_READY(mapping) == -1) 7430 return NULL; 7431 7432 maplen = PyUnicode_GET_LENGTH(mapping); 7433 mapdata = PyUnicode_DATA(mapping); 7434 mapkind = PyUnicode_KIND(mapping); 7435 while (s < e) { 7436 unsigned char ch; 7437 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7438 enum PyUnicode_Kind outkind = PyUnicode_KIND(v); 7439 if (outkind == PyUnicode_1BYTE_KIND) { 7440 void *outdata = PyUnicode_DATA(v); 7441 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v); 7442 while (s < e) { 7443 unsigned char ch = *s; 7444 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); 7445 if (x > maxchar) 7446 goto Error; 7447 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x); 7448 ++s; 7449 } 7450 break; 7451 } 7452 else if (outkind == PyUnicode_2BYTE_KIND) { 7453 void *outdata = PyUnicode_DATA(v); 7454 while (s < e) { 7455 unsigned char ch = *s; 7456 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); 7457 if (x == 0xFFFE) 7458 goto Error; 7459 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x); 7460 ++s; 7461 } 7462 break; 7463 } 7464 } 7465 ch = *s; 7466 7467 if (ch < maplen) 7468 x = PyUnicode_READ(mapkind, mapdata, ch); 7469 else 7470 x = 0xfffe; /* invalid value */ 7471Error: 7472 if (x == 0xfffe) 7473 { 7474 /* undefined mapping */ 7475 startinpos = s-starts; 7476 endinpos = startinpos+1; 7477 if (unicode_decode_call_errorhandler( 7478 errors, &errorHandler, 7479 "charmap", "character maps to <undefined>", 7480 &starts, &e, &startinpos, &endinpos, &exc, &s, 7481 &v, &outpos)) { 7482 goto onError; 7483 } 7484 continue; 7485 } 7486 7487 if (unicode_putchar(&v, &outpos, x) < 0) 7488 goto onError; 7489 ++s; 7490 } 7491 } 7492 else { 7493 while (s < e) { 7494 unsigned char ch = *s; 7495 PyObject *w, *x; 7496 7497 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7498 w = PyLong_FromLong((long)ch); 7499 if (w == NULL) 7500 goto onError; 7501 x = PyObject_GetItem(mapping, w); 7502 Py_DECREF(w); 7503 if (x == NULL) { 7504 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7505 /* No mapping found means: mapping is undefined. */ 7506 PyErr_Clear(); 7507 x = Py_None; 7508 Py_INCREF(x); 7509 } else 7510 goto onError; 7511 } 7512 7513 /* Apply mapping */ 7514 if (PyLong_Check(x)) { 7515 long value = PyLong_AS_LONG(x); 7516 if (value < 0 || value > MAX_UNICODE) { 7517 PyErr_Format(PyExc_TypeError, 7518 "character mapping must be in range(0x%lx)", 7519 (unsigned long)MAX_UNICODE + 1); 7520 Py_DECREF(x); 7521 goto onError; 7522 } 7523 if (unicode_putchar(&v, &outpos, value) < 0) 7524 goto onError; 7525 } 7526 else if (x == Py_None) { 7527 /* undefined mapping */ 7528 startinpos = s-starts; 7529 endinpos = startinpos+1; 7530 if (unicode_decode_call_errorhandler( 7531 errors, &errorHandler, 7532 "charmap", "character maps to <undefined>", 7533 &starts, &e, &startinpos, &endinpos, &exc, &s, 7534 &v, &outpos)) { 7535 Py_DECREF(x); 7536 goto onError; 7537 } 7538 Py_DECREF(x); 7539 continue; 7540 } 7541 else if (PyUnicode_Check(x)) { 7542 Py_ssize_t targetsize; 7543 7544 if (PyUnicode_READY(x) == -1) 7545 goto onError; 7546 targetsize = PyUnicode_GET_LENGTH(x); 7547 7548 if (targetsize == 1) { 7549 /* 1-1 mapping */ 7550 if (unicode_putchar(&v, &outpos, 7551 PyUnicode_READ_CHAR(x, 0)) < 0) 7552 goto onError; 7553 } 7554 else if (targetsize > 1) { 7555 /* 1-n mapping */ 7556 if (targetsize > extrachars) { 7557 /* resize first */ 7558 Py_ssize_t needed = (targetsize - extrachars) + \ 7559 (targetsize << 2); 7560 extrachars += needed; 7561 /* XXX overflow detection missing */ 7562 if (unicode_resize(&v, 7563 PyUnicode_GET_LENGTH(v) + needed) < 0) 7564 { 7565 Py_DECREF(x); 7566 goto onError; 7567 } 7568 } 7569 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0) 7570 goto onError; 7571 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize); 7572 outpos += targetsize; 7573 extrachars -= targetsize; 7574 } 7575 /* 1-0 mapping: skip the character */ 7576 } 7577 else { 7578 /* wrong return value */ 7579 PyErr_SetString(PyExc_TypeError, 7580 "character mapping must return integer, None or str"); 7581 Py_DECREF(x); 7582 goto onError; 7583 } 7584 Py_DECREF(x); 7585 ++s; 7586 } 7587 } 7588 if (unicode_resize(&v, outpos) < 0) 7589 goto onError; 7590 Py_XDECREF(errorHandler); 7591 Py_XDECREF(exc); 7592 return unicode_result(v); 7593 7594 onError: 7595 Py_XDECREF(errorHandler); 7596 Py_XDECREF(exc); 7597 Py_XDECREF(v); 7598 return NULL; 7599} 7600 7601/* Charmap encoding: the lookup table */ 7602 7603struct encoding_map { 7604 PyObject_HEAD 7605 unsigned char level1[32]; 7606 int count2, count3; 7607 unsigned char level23[1]; 7608}; 7609 7610static PyObject* 7611encoding_map_size(PyObject *obj, PyObject* args) 7612{ 7613 struct encoding_map *map = (struct encoding_map*)obj; 7614 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7615 128*map->count3); 7616} 7617 7618static PyMethodDef encoding_map_methods[] = { 7619 {"size", encoding_map_size, METH_NOARGS, 7620 PyDoc_STR("Return the size (in bytes) of this object") }, 7621 { 0 } 7622}; 7623 7624static void 7625encoding_map_dealloc(PyObject* o) 7626{ 7627 PyObject_FREE(o); 7628} 7629 7630static PyTypeObject EncodingMapType = { 7631 PyVarObject_HEAD_INIT(NULL, 0) 7632 "EncodingMap", /*tp_name*/ 7633 sizeof(struct encoding_map), /*tp_basicsize*/ 7634 0, /*tp_itemsize*/ 7635 /* methods */ 7636 encoding_map_dealloc, /*tp_dealloc*/ 7637 0, /*tp_print*/ 7638 0, /*tp_getattr*/ 7639 0, /*tp_setattr*/ 7640 0, /*tp_reserved*/ 7641 0, /*tp_repr*/ 7642 0, /*tp_as_number*/ 7643 0, /*tp_as_sequence*/ 7644 0, /*tp_as_mapping*/ 7645 0, /*tp_hash*/ 7646 0, /*tp_call*/ 7647 0, /*tp_str*/ 7648 0, /*tp_getattro*/ 7649 0, /*tp_setattro*/ 7650 0, /*tp_as_buffer*/ 7651 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7652 0, /*tp_doc*/ 7653 0, /*tp_traverse*/ 7654 0, /*tp_clear*/ 7655 0, /*tp_richcompare*/ 7656 0, /*tp_weaklistoffset*/ 7657 0, /*tp_iter*/ 7658 0, /*tp_iternext*/ 7659 encoding_map_methods, /*tp_methods*/ 7660 0, /*tp_members*/ 7661 0, /*tp_getset*/ 7662 0, /*tp_base*/ 7663 0, /*tp_dict*/ 7664 0, /*tp_descr_get*/ 7665 0, /*tp_descr_set*/ 7666 0, /*tp_dictoffset*/ 7667 0, /*tp_init*/ 7668 0, /*tp_alloc*/ 7669 0, /*tp_new*/ 7670 0, /*tp_free*/ 7671 0, /*tp_is_gc*/ 7672}; 7673 7674PyObject* 7675PyUnicode_BuildEncodingMap(PyObject* string) 7676{ 7677 PyObject *result; 7678 struct encoding_map *mresult; 7679 int i; 7680 int need_dict = 0; 7681 unsigned char level1[32]; 7682 unsigned char level2[512]; 7683 unsigned char *mlevel1, *mlevel2, *mlevel3; 7684 int count2 = 0, count3 = 0; 7685 int kind; 7686 void *data; 7687 Py_ssize_t length; 7688 Py_UCS4 ch; 7689 7690 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 7691 PyErr_BadArgument(); 7692 return NULL; 7693 } 7694 kind = PyUnicode_KIND(string); 7695 data = PyUnicode_DATA(string); 7696 length = PyUnicode_GET_LENGTH(string); 7697 length = Py_MIN(length, 256); 7698 memset(level1, 0xFF, sizeof level1); 7699 memset(level2, 0xFF, sizeof level2); 7700 7701 /* If there isn't a one-to-one mapping of NULL to \0, 7702 or if there are non-BMP characters, we need to use 7703 a mapping dictionary. */ 7704 if (PyUnicode_READ(kind, data, 0) != 0) 7705 need_dict = 1; 7706 for (i = 1; i < length; i++) { 7707 int l1, l2; 7708 ch = PyUnicode_READ(kind, data, i); 7709 if (ch == 0 || ch > 0xFFFF) { 7710 need_dict = 1; 7711 break; 7712 } 7713 if (ch == 0xFFFE) 7714 /* unmapped character */ 7715 continue; 7716 l1 = ch >> 11; 7717 l2 = ch >> 7; 7718 if (level1[l1] == 0xFF) 7719 level1[l1] = count2++; 7720 if (level2[l2] == 0xFF) 7721 level2[l2] = count3++; 7722 } 7723 7724 if (count2 >= 0xFF || count3 >= 0xFF) 7725 need_dict = 1; 7726 7727 if (need_dict) { 7728 PyObject *result = PyDict_New(); 7729 PyObject *key, *value; 7730 if (!result) 7731 return NULL; 7732 for (i = 0; i < length; i++) { 7733 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7734 value = PyLong_FromLong(i); 7735 if (!key || !value) 7736 goto failed1; 7737 if (PyDict_SetItem(result, key, value) == -1) 7738 goto failed1; 7739 Py_DECREF(key); 7740 Py_DECREF(value); 7741 } 7742 return result; 7743 failed1: 7744 Py_XDECREF(key); 7745 Py_XDECREF(value); 7746 Py_DECREF(result); 7747 return NULL; 7748 } 7749 7750 /* Create a three-level trie */ 7751 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7752 16*count2 + 128*count3 - 1); 7753 if (!result) 7754 return PyErr_NoMemory(); 7755 PyObject_Init(result, &EncodingMapType); 7756 mresult = (struct encoding_map*)result; 7757 mresult->count2 = count2; 7758 mresult->count3 = count3; 7759 mlevel1 = mresult->level1; 7760 mlevel2 = mresult->level23; 7761 mlevel3 = mresult->level23 + 16*count2; 7762 memcpy(mlevel1, level1, 32); 7763 memset(mlevel2, 0xFF, 16*count2); 7764 memset(mlevel3, 0, 128*count3); 7765 count3 = 0; 7766 for (i = 1; i < length; i++) { 7767 int o1, o2, o3, i2, i3; 7768 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7769 if (ch == 0xFFFE) 7770 /* unmapped character */ 7771 continue; 7772 o1 = ch>>11; 7773 o2 = (ch>>7) & 0xF; 7774 i2 = 16*mlevel1[o1] + o2; 7775 if (mlevel2[i2] == 0xFF) 7776 mlevel2[i2] = count3++; 7777 o3 = ch & 0x7F; 7778 i3 = 128*mlevel2[i2] + o3; 7779 mlevel3[i3] = i; 7780 } 7781 return result; 7782} 7783 7784static int 7785encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7786{ 7787 struct encoding_map *map = (struct encoding_map*)mapping; 7788 int l1 = c>>11; 7789 int l2 = (c>>7) & 0xF; 7790 int l3 = c & 0x7F; 7791 int i; 7792 7793 if (c > 0xFFFF) 7794 return -1; 7795 if (c == 0) 7796 return 0; 7797 /* level 1*/ 7798 i = map->level1[l1]; 7799 if (i == 0xFF) { 7800 return -1; 7801 } 7802 /* level 2*/ 7803 i = map->level23[16*i+l2]; 7804 if (i == 0xFF) { 7805 return -1; 7806 } 7807 /* level 3 */ 7808 i = map->level23[16*map->count2 + 128*i + l3]; 7809 if (i == 0) { 7810 return -1; 7811 } 7812 return i; 7813} 7814 7815/* Lookup the character ch in the mapping. If the character 7816 can't be found, Py_None is returned (or NULL, if another 7817 error occurred). */ 7818static PyObject * 7819charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 7820{ 7821 PyObject *w = PyLong_FromLong((long)c); 7822 PyObject *x; 7823 7824 if (w == NULL) 7825 return NULL; 7826 x = PyObject_GetItem(mapping, w); 7827 Py_DECREF(w); 7828 if (x == NULL) { 7829 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7830 /* No mapping found means: mapping is undefined. */ 7831 PyErr_Clear(); 7832 x = Py_None; 7833 Py_INCREF(x); 7834 return x; 7835 } else 7836 return NULL; 7837 } 7838 else if (x == Py_None) 7839 return x; 7840 else if (PyLong_Check(x)) { 7841 long value = PyLong_AS_LONG(x); 7842 if (value < 0 || value > 255) { 7843 PyErr_SetString(PyExc_TypeError, 7844 "character mapping must be in range(256)"); 7845 Py_DECREF(x); 7846 return NULL; 7847 } 7848 return x; 7849 } 7850 else if (PyBytes_Check(x)) 7851 return x; 7852 else { 7853 /* wrong return value */ 7854 PyErr_Format(PyExc_TypeError, 7855 "character mapping must return integer, bytes or None, not %.400s", 7856 x->ob_type->tp_name); 7857 Py_DECREF(x); 7858 return NULL; 7859 } 7860} 7861 7862static int 7863charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7864{ 7865 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7866 /* exponentially overallocate to minimize reallocations */ 7867 if (requiredsize < 2*outsize) 7868 requiredsize = 2*outsize; 7869 if (_PyBytes_Resize(outobj, requiredsize)) 7870 return -1; 7871 return 0; 7872} 7873 7874typedef enum charmapencode_result { 7875 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7876} charmapencode_result; 7877/* lookup the character, put the result in the output string and adjust 7878 various state variables. Resize the output bytes object if not enough 7879 space is available. Return a new reference to the object that 7880 was put in the output buffer, or Py_None, if the mapping was undefined 7881 (in which case no character was written) or NULL, if a 7882 reallocation error occurred. The caller must decref the result */ 7883static charmapencode_result 7884charmapencode_output(Py_UCS4 c, PyObject *mapping, 7885 PyObject **outobj, Py_ssize_t *outpos) 7886{ 7887 PyObject *rep; 7888 char *outstart; 7889 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7890 7891 if (Py_TYPE(mapping) == &EncodingMapType) { 7892 int res = encoding_map_lookup(c, mapping); 7893 Py_ssize_t requiredsize = *outpos+1; 7894 if (res == -1) 7895 return enc_FAILED; 7896 if (outsize<requiredsize) 7897 if (charmapencode_resize(outobj, outpos, requiredsize)) 7898 return enc_EXCEPTION; 7899 outstart = PyBytes_AS_STRING(*outobj); 7900 outstart[(*outpos)++] = (char)res; 7901 return enc_SUCCESS; 7902 } 7903 7904 rep = charmapencode_lookup(c, mapping); 7905 if (rep==NULL) 7906 return enc_EXCEPTION; 7907 else if (rep==Py_None) { 7908 Py_DECREF(rep); 7909 return enc_FAILED; 7910 } else { 7911 if (PyLong_Check(rep)) { 7912 Py_ssize_t requiredsize = *outpos+1; 7913 if (outsize<requiredsize) 7914 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7915 Py_DECREF(rep); 7916 return enc_EXCEPTION; 7917 } 7918 outstart = PyBytes_AS_STRING(*outobj); 7919 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7920 } 7921 else { 7922 const char *repchars = PyBytes_AS_STRING(rep); 7923 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7924 Py_ssize_t requiredsize = *outpos+repsize; 7925 if (outsize<requiredsize) 7926 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7927 Py_DECREF(rep); 7928 return enc_EXCEPTION; 7929 } 7930 outstart = PyBytes_AS_STRING(*outobj); 7931 memcpy(outstart + *outpos, repchars, repsize); 7932 *outpos += repsize; 7933 } 7934 } 7935 Py_DECREF(rep); 7936 return enc_SUCCESS; 7937} 7938 7939/* handle an error in PyUnicode_EncodeCharmap 7940 Return 0 on success, -1 on error */ 7941static int 7942charmap_encoding_error( 7943 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 7944 PyObject **exceptionObject, 7945 int *known_errorHandler, PyObject **errorHandler, const char *errors, 7946 PyObject **res, Py_ssize_t *respos) 7947{ 7948 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7949 Py_ssize_t size, repsize; 7950 Py_ssize_t newpos; 7951 enum PyUnicode_Kind kind; 7952 void *data; 7953 Py_ssize_t index; 7954 /* startpos for collecting unencodable chars */ 7955 Py_ssize_t collstartpos = *inpos; 7956 Py_ssize_t collendpos = *inpos+1; 7957 Py_ssize_t collpos; 7958 char *encoding = "charmap"; 7959 char *reason = "character maps to <undefined>"; 7960 charmapencode_result x; 7961 Py_UCS4 ch; 7962 int val; 7963 7964 if (PyUnicode_READY(unicode) == -1) 7965 return -1; 7966 size = PyUnicode_GET_LENGTH(unicode); 7967 /* find all unencodable characters */ 7968 while (collendpos < size) { 7969 PyObject *rep; 7970 if (Py_TYPE(mapping) == &EncodingMapType) { 7971 ch = PyUnicode_READ_CHAR(unicode, collendpos); 7972 val = encoding_map_lookup(ch, mapping); 7973 if (val != -1) 7974 break; 7975 ++collendpos; 7976 continue; 7977 } 7978 7979 ch = PyUnicode_READ_CHAR(unicode, collendpos); 7980 rep = charmapencode_lookup(ch, mapping); 7981 if (rep==NULL) 7982 return -1; 7983 else if (rep!=Py_None) { 7984 Py_DECREF(rep); 7985 break; 7986 } 7987 Py_DECREF(rep); 7988 ++collendpos; 7989 } 7990 /* cache callback name lookup 7991 * (if not done yet, i.e. it's the first error) */ 7992 if (*known_errorHandler==-1) { 7993 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7994 *known_errorHandler = 1; 7995 else if (!strcmp(errors, "replace")) 7996 *known_errorHandler = 2; 7997 else if (!strcmp(errors, "ignore")) 7998 *known_errorHandler = 3; 7999 else if (!strcmp(errors, "xmlcharrefreplace")) 8000 *known_errorHandler = 4; 8001 else 8002 *known_errorHandler = 0; 8003 } 8004 switch (*known_errorHandler) { 8005 case 1: /* strict */ 8006 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8007 return -1; 8008 case 2: /* replace */ 8009 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8010 x = charmapencode_output('?', mapping, res, respos); 8011 if (x==enc_EXCEPTION) { 8012 return -1; 8013 } 8014 else if (x==enc_FAILED) { 8015 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8016 return -1; 8017 } 8018 } 8019 /* fall through */ 8020 case 3: /* ignore */ 8021 *inpos = collendpos; 8022 break; 8023 case 4: /* xmlcharrefreplace */ 8024 /* generate replacement (temporarily (mis)uses p) */ 8025 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8026 char buffer[2+29+1+1]; 8027 char *cp; 8028 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8029 for (cp = buffer; *cp; ++cp) { 8030 x = charmapencode_output(*cp, mapping, res, respos); 8031 if (x==enc_EXCEPTION) 8032 return -1; 8033 else if (x==enc_FAILED) { 8034 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8035 return -1; 8036 } 8037 } 8038 } 8039 *inpos = collendpos; 8040 break; 8041 default: 8042 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 8043 encoding, reason, unicode, exceptionObject, 8044 collstartpos, collendpos, &newpos); 8045 if (repunicode == NULL) 8046 return -1; 8047 if (PyBytes_Check(repunicode)) { 8048 /* Directly copy bytes result to output. */ 8049 Py_ssize_t outsize = PyBytes_Size(*res); 8050 Py_ssize_t requiredsize; 8051 repsize = PyBytes_Size(repunicode); 8052 requiredsize = *respos + repsize; 8053 if (requiredsize > outsize) 8054 /* Make room for all additional bytes. */ 8055 if (charmapencode_resize(res, respos, requiredsize)) { 8056 Py_DECREF(repunicode); 8057 return -1; 8058 } 8059 memcpy(PyBytes_AsString(*res) + *respos, 8060 PyBytes_AsString(repunicode), repsize); 8061 *respos += repsize; 8062 *inpos = newpos; 8063 Py_DECREF(repunicode); 8064 break; 8065 } 8066 /* generate replacement */ 8067 if (PyUnicode_READY(repunicode) == -1) { 8068 Py_DECREF(repunicode); 8069 return -1; 8070 } 8071 repsize = PyUnicode_GET_LENGTH(repunicode); 8072 data = PyUnicode_DATA(repunicode); 8073 kind = PyUnicode_KIND(repunicode); 8074 for (index = 0; index < repsize; index++) { 8075 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8076 x = charmapencode_output(repch, mapping, res, respos); 8077 if (x==enc_EXCEPTION) { 8078 Py_DECREF(repunicode); 8079 return -1; 8080 } 8081 else if (x==enc_FAILED) { 8082 Py_DECREF(repunicode); 8083 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8084 return -1; 8085 } 8086 } 8087 *inpos = newpos; 8088 Py_DECREF(repunicode); 8089 } 8090 return 0; 8091} 8092 8093PyObject * 8094_PyUnicode_EncodeCharmap(PyObject *unicode, 8095 PyObject *mapping, 8096 const char *errors) 8097{ 8098 /* output object */ 8099 PyObject *res = NULL; 8100 /* current input position */ 8101 Py_ssize_t inpos = 0; 8102 Py_ssize_t size; 8103 /* current output position */ 8104 Py_ssize_t respos = 0; 8105 PyObject *errorHandler = NULL; 8106 PyObject *exc = NULL; 8107 /* the following variable is used for caching string comparisons 8108 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8109 * 3=ignore, 4=xmlcharrefreplace */ 8110 int known_errorHandler = -1; 8111 8112 if (PyUnicode_READY(unicode) == -1) 8113 return NULL; 8114 size = PyUnicode_GET_LENGTH(unicode); 8115 8116 /* Default to Latin-1 */ 8117 if (mapping == NULL) 8118 return unicode_encode_ucs1(unicode, errors, 256); 8119 8120 /* allocate enough for a simple encoding without 8121 replacements, if we need more, we'll resize */ 8122 res = PyBytes_FromStringAndSize(NULL, size); 8123 if (res == NULL) 8124 goto onError; 8125 if (size == 0) 8126 return res; 8127 8128 while (inpos<size) { 8129 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos); 8130 /* try to encode it */ 8131 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8132 if (x==enc_EXCEPTION) /* error */ 8133 goto onError; 8134 if (x==enc_FAILED) { /* unencodable character */ 8135 if (charmap_encoding_error(unicode, &inpos, mapping, 8136 &exc, 8137 &known_errorHandler, &errorHandler, errors, 8138 &res, &respos)) { 8139 goto onError; 8140 } 8141 } 8142 else 8143 /* done with this character => adjust input position */ 8144 ++inpos; 8145 } 8146 8147 /* Resize if we allocated to much */ 8148 if (respos<PyBytes_GET_SIZE(res)) 8149 if (_PyBytes_Resize(&res, respos) < 0) 8150 goto onError; 8151 8152 Py_XDECREF(exc); 8153 Py_XDECREF(errorHandler); 8154 return res; 8155 8156 onError: 8157 Py_XDECREF(res); 8158 Py_XDECREF(exc); 8159 Py_XDECREF(errorHandler); 8160 return NULL; 8161} 8162 8163/* Deprecated */ 8164PyObject * 8165PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8166 Py_ssize_t size, 8167 PyObject *mapping, 8168 const char *errors) 8169{ 8170 PyObject *result; 8171 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8172 if (unicode == NULL) 8173 return NULL; 8174 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8175 Py_DECREF(unicode); 8176 return result; 8177} 8178 8179PyObject * 8180PyUnicode_AsCharmapString(PyObject *unicode, 8181 PyObject *mapping) 8182{ 8183 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8184 PyErr_BadArgument(); 8185 return NULL; 8186 } 8187 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8188} 8189 8190/* create or adjust a UnicodeTranslateError */ 8191static void 8192make_translate_exception(PyObject **exceptionObject, 8193 PyObject *unicode, 8194 Py_ssize_t startpos, Py_ssize_t endpos, 8195 const char *reason) 8196{ 8197 if (*exceptionObject == NULL) { 8198 *exceptionObject = _PyUnicodeTranslateError_Create( 8199 unicode, startpos, endpos, reason); 8200 } 8201 else { 8202 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8203 goto onError; 8204 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8205 goto onError; 8206 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8207 goto onError; 8208 return; 8209 onError: 8210 Py_DECREF(*exceptionObject); 8211 *exceptionObject = NULL; 8212 } 8213} 8214 8215/* raises a UnicodeTranslateError */ 8216static void 8217raise_translate_exception(PyObject **exceptionObject, 8218 PyObject *unicode, 8219 Py_ssize_t startpos, Py_ssize_t endpos, 8220 const char *reason) 8221{ 8222 make_translate_exception(exceptionObject, 8223 unicode, startpos, endpos, reason); 8224 if (*exceptionObject != NULL) 8225 PyCodec_StrictErrors(*exceptionObject); 8226} 8227 8228/* error handling callback helper: 8229 build arguments, call the callback and check the arguments, 8230 put the result into newpos and return the replacement string, which 8231 has to be freed by the caller */ 8232static PyObject * 8233unicode_translate_call_errorhandler(const char *errors, 8234 PyObject **errorHandler, 8235 const char *reason, 8236 PyObject *unicode, PyObject **exceptionObject, 8237 Py_ssize_t startpos, Py_ssize_t endpos, 8238 Py_ssize_t *newpos) 8239{ 8240 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8241 8242 Py_ssize_t i_newpos; 8243 PyObject *restuple; 8244 PyObject *resunicode; 8245 8246 if (*errorHandler == NULL) { 8247 *errorHandler = PyCodec_LookupError(errors); 8248 if (*errorHandler == NULL) 8249 return NULL; 8250 } 8251 8252 make_translate_exception(exceptionObject, 8253 unicode, startpos, endpos, reason); 8254 if (*exceptionObject == NULL) 8255 return NULL; 8256 8257 restuple = PyObject_CallFunctionObjArgs( 8258 *errorHandler, *exceptionObject, NULL); 8259 if (restuple == NULL) 8260 return NULL; 8261 if (!PyTuple_Check(restuple)) { 8262 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8263 Py_DECREF(restuple); 8264 return NULL; 8265 } 8266 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8267 &resunicode, &i_newpos)) { 8268 Py_DECREF(restuple); 8269 return NULL; 8270 } 8271 if (i_newpos<0) 8272 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8273 else 8274 *newpos = i_newpos; 8275 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8276 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8277 Py_DECREF(restuple); 8278 return NULL; 8279 } 8280 Py_INCREF(resunicode); 8281 Py_DECREF(restuple); 8282 return resunicode; 8283} 8284 8285/* Lookup the character ch in the mapping and put the result in result, 8286 which must be decrefed by the caller. 8287 Return 0 on success, -1 on error */ 8288static int 8289charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8290{ 8291 PyObject *w = PyLong_FromLong((long)c); 8292 PyObject *x; 8293 8294 if (w == NULL) 8295 return -1; 8296 x = PyObject_GetItem(mapping, w); 8297 Py_DECREF(w); 8298 if (x == NULL) { 8299 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8300 /* No mapping found means: use 1:1 mapping. */ 8301 PyErr_Clear(); 8302 *result = NULL; 8303 return 0; 8304 } else 8305 return -1; 8306 } 8307 else if (x == Py_None) { 8308 *result = x; 8309 return 0; 8310 } 8311 else if (PyLong_Check(x)) { 8312 long value = PyLong_AS_LONG(x); 8313 long max = PyUnicode_GetMax(); 8314 if (value < 0 || value > max) { 8315 PyErr_Format(PyExc_TypeError, 8316 "character mapping must be in range(0x%x)", max+1); 8317 Py_DECREF(x); 8318 return -1; 8319 } 8320 *result = x; 8321 return 0; 8322 } 8323 else if (PyUnicode_Check(x)) { 8324 *result = x; 8325 return 0; 8326 } 8327 else { 8328 /* wrong return value */ 8329 PyErr_SetString(PyExc_TypeError, 8330 "character mapping must return integer, None or str"); 8331 Py_DECREF(x); 8332 return -1; 8333 } 8334} 8335/* ensure that *outobj is at least requiredsize characters long, 8336 if not reallocate and adjust various state variables. 8337 Return 0 on success, -1 on error */ 8338static int 8339charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8340 Py_ssize_t requiredsize) 8341{ 8342 Py_ssize_t oldsize = *psize; 8343 Py_UCS4 *new_outobj; 8344 if (requiredsize > oldsize) { 8345 /* exponentially overallocate to minimize reallocations */ 8346 if (requiredsize < 2 * oldsize) 8347 requiredsize = 2 * oldsize; 8348 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8349 if (new_outobj == 0) 8350 return -1; 8351 *outobj = new_outobj; 8352 *psize = requiredsize; 8353 } 8354 return 0; 8355} 8356/* lookup the character, put the result in the output string and adjust 8357 various state variables. Return a new reference to the object that 8358 was put in the output buffer in *result, or Py_None, if the mapping was 8359 undefined (in which case no character was written). 8360 The called must decref result. 8361 Return 0 on success, -1 on error. */ 8362static int 8363charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8364 PyObject *mapping, Py_UCS4 **output, 8365 Py_ssize_t *osize, Py_ssize_t *opos, 8366 PyObject **res) 8367{ 8368 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8369 if (charmaptranslate_lookup(curinp, mapping, res)) 8370 return -1; 8371 if (*res==NULL) { 8372 /* not found => default to 1:1 mapping */ 8373 (*output)[(*opos)++] = curinp; 8374 } 8375 else if (*res==Py_None) 8376 ; 8377 else if (PyLong_Check(*res)) { 8378 /* no overflow check, because we know that the space is enough */ 8379 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8380 } 8381 else if (PyUnicode_Check(*res)) { 8382 Py_ssize_t repsize; 8383 if (PyUnicode_READY(*res) == -1) 8384 return -1; 8385 repsize = PyUnicode_GET_LENGTH(*res); 8386 if (repsize==1) { 8387 /* no overflow check, because we know that the space is enough */ 8388 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8389 } 8390 else if (repsize!=0) { 8391 /* more than one character */ 8392 Py_ssize_t requiredsize = *opos + 8393 (PyUnicode_GET_LENGTH(input) - ipos) + 8394 repsize - 1; 8395 Py_ssize_t i; 8396 if (charmaptranslate_makespace(output, osize, requiredsize)) 8397 return -1; 8398 for(i = 0; i < repsize; i++) 8399 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8400 } 8401 } 8402 else 8403 return -1; 8404 return 0; 8405} 8406 8407PyObject * 8408_PyUnicode_TranslateCharmap(PyObject *input, 8409 PyObject *mapping, 8410 const char *errors) 8411{ 8412 /* input object */ 8413 char *idata; 8414 Py_ssize_t size, i; 8415 int kind; 8416 /* output buffer */ 8417 Py_UCS4 *output = NULL; 8418 Py_ssize_t osize; 8419 PyObject *res; 8420 /* current output position */ 8421 Py_ssize_t opos; 8422 char *reason = "character maps to <undefined>"; 8423 PyObject *errorHandler = NULL; 8424 PyObject *exc = NULL; 8425 /* the following variable is used for caching string comparisons 8426 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8427 * 3=ignore, 4=xmlcharrefreplace */ 8428 int known_errorHandler = -1; 8429 8430 if (mapping == NULL) { 8431 PyErr_BadArgument(); 8432 return NULL; 8433 } 8434 8435 if (PyUnicode_READY(input) == -1) 8436 return NULL; 8437 idata = (char*)PyUnicode_DATA(input); 8438 kind = PyUnicode_KIND(input); 8439 size = PyUnicode_GET_LENGTH(input); 8440 i = 0; 8441 8442 if (size == 0) { 8443 Py_INCREF(input); 8444 return input; 8445 } 8446 8447 /* allocate enough for a simple 1:1 translation without 8448 replacements, if we need more, we'll resize */ 8449 osize = size; 8450 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8451 opos = 0; 8452 if (output == NULL) { 8453 PyErr_NoMemory(); 8454 goto onError; 8455 } 8456 8457 while (i<size) { 8458 /* try to encode it */ 8459 PyObject *x = NULL; 8460 if (charmaptranslate_output(input, i, mapping, 8461 &output, &osize, &opos, &x)) { 8462 Py_XDECREF(x); 8463 goto onError; 8464 } 8465 Py_XDECREF(x); 8466 if (x!=Py_None) /* it worked => adjust input pointer */ 8467 ++i; 8468 else { /* untranslatable character */ 8469 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8470 Py_ssize_t repsize; 8471 Py_ssize_t newpos; 8472 Py_ssize_t uni2; 8473 /* startpos for collecting untranslatable chars */ 8474 Py_ssize_t collstart = i; 8475 Py_ssize_t collend = i+1; 8476 Py_ssize_t coll; 8477 8478 /* find all untranslatable characters */ 8479 while (collend < size) { 8480 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8481 goto onError; 8482 Py_XDECREF(x); 8483 if (x!=Py_None) 8484 break; 8485 ++collend; 8486 } 8487 /* cache callback name lookup 8488 * (if not done yet, i.e. it's the first error) */ 8489 if (known_errorHandler==-1) { 8490 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8491 known_errorHandler = 1; 8492 else if (!strcmp(errors, "replace")) 8493 known_errorHandler = 2; 8494 else if (!strcmp(errors, "ignore")) 8495 known_errorHandler = 3; 8496 else if (!strcmp(errors, "xmlcharrefreplace")) 8497 known_errorHandler = 4; 8498 else 8499 known_errorHandler = 0; 8500 } 8501 switch (known_errorHandler) { 8502 case 1: /* strict */ 8503 raise_translate_exception(&exc, input, collstart, 8504 collend, reason); 8505 goto onError; 8506 case 2: /* replace */ 8507 /* No need to check for space, this is a 1:1 replacement */ 8508 for (coll = collstart; coll<collend; coll++) 8509 output[opos++] = '?'; 8510 /* fall through */ 8511 case 3: /* ignore */ 8512 i = collend; 8513 break; 8514 case 4: /* xmlcharrefreplace */ 8515 /* generate replacement (temporarily (mis)uses i) */ 8516 for (i = collstart; i < collend; ++i) { 8517 char buffer[2+29+1+1]; 8518 char *cp; 8519 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8520 if (charmaptranslate_makespace(&output, &osize, 8521 opos+strlen(buffer)+(size-collend))) 8522 goto onError; 8523 for (cp = buffer; *cp; ++cp) 8524 output[opos++] = *cp; 8525 } 8526 i = collend; 8527 break; 8528 default: 8529 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8530 reason, input, &exc, 8531 collstart, collend, &newpos); 8532 if (repunicode == NULL) 8533 goto onError; 8534 if (PyUnicode_READY(repunicode) == -1) { 8535 Py_DECREF(repunicode); 8536 goto onError; 8537 } 8538 /* generate replacement */ 8539 repsize = PyUnicode_GET_LENGTH(repunicode); 8540 if (charmaptranslate_makespace(&output, &osize, 8541 opos+repsize+(size-collend))) { 8542 Py_DECREF(repunicode); 8543 goto onError; 8544 } 8545 for (uni2 = 0; repsize-->0; ++uni2) 8546 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8547 i = newpos; 8548 Py_DECREF(repunicode); 8549 } 8550 } 8551 } 8552 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8553 if (!res) 8554 goto onError; 8555 PyMem_Free(output); 8556 Py_XDECREF(exc); 8557 Py_XDECREF(errorHandler); 8558 return res; 8559 8560 onError: 8561 PyMem_Free(output); 8562 Py_XDECREF(exc); 8563 Py_XDECREF(errorHandler); 8564 return NULL; 8565} 8566 8567/* Deprecated. Use PyUnicode_Translate instead. */ 8568PyObject * 8569PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8570 Py_ssize_t size, 8571 PyObject *mapping, 8572 const char *errors) 8573{ 8574 PyObject *result; 8575 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8576 if (!unicode) 8577 return NULL; 8578 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8579 Py_DECREF(unicode); 8580 return result; 8581} 8582 8583PyObject * 8584PyUnicode_Translate(PyObject *str, 8585 PyObject *mapping, 8586 const char *errors) 8587{ 8588 PyObject *result; 8589 8590 str = PyUnicode_FromObject(str); 8591 if (str == NULL) 8592 return NULL; 8593 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8594 Py_DECREF(str); 8595 return result; 8596} 8597 8598static Py_UCS4 8599fix_decimal_and_space_to_ascii(PyObject *self) 8600{ 8601 /* No need to call PyUnicode_READY(self) because this function is only 8602 called as a callback from fixup() which does it already. */ 8603 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8604 const int kind = PyUnicode_KIND(self); 8605 void *data = PyUnicode_DATA(self); 8606 Py_UCS4 maxchar = 127, ch, fixed; 8607 int modified = 0; 8608 Py_ssize_t i; 8609 8610 for (i = 0; i < len; ++i) { 8611 ch = PyUnicode_READ(kind, data, i); 8612 fixed = 0; 8613 if (ch > 127) { 8614 if (Py_UNICODE_ISSPACE(ch)) 8615 fixed = ' '; 8616 else { 8617 const int decimal = Py_UNICODE_TODECIMAL(ch); 8618 if (decimal >= 0) 8619 fixed = '0' + decimal; 8620 } 8621 if (fixed != 0) { 8622 modified = 1; 8623 maxchar = MAX_MAXCHAR(maxchar, fixed); 8624 PyUnicode_WRITE(kind, data, i, fixed); 8625 } 8626 else 8627 maxchar = MAX_MAXCHAR(maxchar, ch); 8628 } 8629 } 8630 8631 return (modified) ? maxchar : 0; 8632} 8633 8634PyObject * 8635_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8636{ 8637 if (!PyUnicode_Check(unicode)) { 8638 PyErr_BadInternalCall(); 8639 return NULL; 8640 } 8641 if (PyUnicode_READY(unicode) == -1) 8642 return NULL; 8643 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8644 /* If the string is already ASCII, just return the same string */ 8645 Py_INCREF(unicode); 8646 return unicode; 8647 } 8648 return fixup(unicode, fix_decimal_and_space_to_ascii); 8649} 8650 8651PyObject * 8652PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8653 Py_ssize_t length) 8654{ 8655 PyObject *decimal; 8656 Py_ssize_t i; 8657 Py_UCS4 maxchar; 8658 enum PyUnicode_Kind kind; 8659 void *data; 8660 8661 maxchar = 127; 8662 for (i = 0; i < length; i++) { 8663 Py_UNICODE ch = s[i]; 8664 if (ch > 127) { 8665 int decimal = Py_UNICODE_TODECIMAL(ch); 8666 if (decimal >= 0) 8667 ch = '0' + decimal; 8668 maxchar = MAX_MAXCHAR(maxchar, ch); 8669 } 8670 } 8671 8672 /* Copy to a new string */ 8673 decimal = PyUnicode_New(length, maxchar); 8674 if (decimal == NULL) 8675 return decimal; 8676 kind = PyUnicode_KIND(decimal); 8677 data = PyUnicode_DATA(decimal); 8678 /* Iterate over code points */ 8679 for (i = 0; i < length; i++) { 8680 Py_UNICODE ch = s[i]; 8681 if (ch > 127) { 8682 int decimal = Py_UNICODE_TODECIMAL(ch); 8683 if (decimal >= 0) 8684 ch = '0' + decimal; 8685 } 8686 PyUnicode_WRITE(kind, data, i, ch); 8687 } 8688 return unicode_result(decimal); 8689} 8690/* --- Decimal Encoder ---------------------------------------------------- */ 8691 8692int 8693PyUnicode_EncodeDecimal(Py_UNICODE *s, 8694 Py_ssize_t length, 8695 char *output, 8696 const char *errors) 8697{ 8698 PyObject *unicode; 8699 Py_ssize_t i; 8700 enum PyUnicode_Kind kind; 8701 void *data; 8702 8703 if (output == NULL) { 8704 PyErr_BadArgument(); 8705 return -1; 8706 } 8707 8708 unicode = PyUnicode_FromUnicode(s, length); 8709 if (unicode == NULL) 8710 return -1; 8711 8712 if (PyUnicode_READY(unicode) == -1) { 8713 Py_DECREF(unicode); 8714 return -1; 8715 } 8716 kind = PyUnicode_KIND(unicode); 8717 data = PyUnicode_DATA(unicode); 8718 8719 for (i=0; i < length; ) { 8720 PyObject *exc; 8721 Py_UCS4 ch; 8722 int decimal; 8723 Py_ssize_t startpos; 8724 8725 ch = PyUnicode_READ(kind, data, i); 8726 8727 if (Py_UNICODE_ISSPACE(ch)) { 8728 *output++ = ' '; 8729 i++; 8730 continue; 8731 } 8732 decimal = Py_UNICODE_TODECIMAL(ch); 8733 if (decimal >= 0) { 8734 *output++ = '0' + decimal; 8735 i++; 8736 continue; 8737 } 8738 if (0 < ch && ch < 256) { 8739 *output++ = (char)ch; 8740 i++; 8741 continue; 8742 } 8743 8744 startpos = i; 8745 exc = NULL; 8746 raise_encode_exception(&exc, "decimal", unicode, 8747 startpos, startpos+1, 8748 "invalid decimal Unicode string"); 8749 Py_XDECREF(exc); 8750 Py_DECREF(unicode); 8751 return -1; 8752 } 8753 /* 0-terminate the output string */ 8754 *output++ = '\0'; 8755 Py_DECREF(unicode); 8756 return 0; 8757} 8758 8759/* --- Helpers ------------------------------------------------------------ */ 8760 8761static Py_ssize_t 8762any_find_slice(int direction, PyObject* s1, PyObject* s2, 8763 Py_ssize_t start, 8764 Py_ssize_t end) 8765{ 8766 int kind1, kind2, kind; 8767 void *buf1, *buf2; 8768 Py_ssize_t len1, len2, result; 8769 8770 kind1 = PyUnicode_KIND(s1); 8771 kind2 = PyUnicode_KIND(s2); 8772 kind = kind1 > kind2 ? kind1 : kind2; 8773 buf1 = PyUnicode_DATA(s1); 8774 buf2 = PyUnicode_DATA(s2); 8775 if (kind1 != kind) 8776 buf1 = _PyUnicode_AsKind(s1, kind); 8777 if (!buf1) 8778 return -2; 8779 if (kind2 != kind) 8780 buf2 = _PyUnicode_AsKind(s2, kind); 8781 if (!buf2) { 8782 if (kind1 != kind) PyMem_Free(buf1); 8783 return -2; 8784 } 8785 len1 = PyUnicode_GET_LENGTH(s1); 8786 len2 = PyUnicode_GET_LENGTH(s2); 8787 8788 if (direction > 0) { 8789 switch (kind) { 8790 case PyUnicode_1BYTE_KIND: 8791 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8792 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 8793 else 8794 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 8795 break; 8796 case PyUnicode_2BYTE_KIND: 8797 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 8798 break; 8799 case PyUnicode_4BYTE_KIND: 8800 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 8801 break; 8802 default: 8803 assert(0); result = -2; 8804 } 8805 } 8806 else { 8807 switch (kind) { 8808 case PyUnicode_1BYTE_KIND: 8809 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8810 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 8811 else 8812 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8813 break; 8814 case PyUnicode_2BYTE_KIND: 8815 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8816 break; 8817 case PyUnicode_4BYTE_KIND: 8818 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8819 break; 8820 default: 8821 assert(0); result = -2; 8822 } 8823 } 8824 8825 if (kind1 != kind) 8826 PyMem_Free(buf1); 8827 if (kind2 != kind) 8828 PyMem_Free(buf2); 8829 8830 return result; 8831} 8832 8833Py_ssize_t 8834_PyUnicode_InsertThousandsGrouping( 8835 PyObject *unicode, Py_ssize_t index, 8836 Py_ssize_t n_buffer, 8837 void *digits, Py_ssize_t n_digits, 8838 Py_ssize_t min_width, 8839 const char *grouping, PyObject *thousands_sep, 8840 Py_UCS4 *maxchar) 8841{ 8842 unsigned int kind, thousands_sep_kind; 8843 char *data, *thousands_sep_data; 8844 Py_ssize_t thousands_sep_len; 8845 Py_ssize_t len; 8846 8847 if (unicode != NULL) { 8848 kind = PyUnicode_KIND(unicode); 8849 data = (char *) PyUnicode_DATA(unicode) + index * kind; 8850 } 8851 else { 8852 kind = PyUnicode_1BYTE_KIND; 8853 data = NULL; 8854 } 8855 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 8856 thousands_sep_data = PyUnicode_DATA(thousands_sep); 8857 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 8858 if (unicode != NULL && thousands_sep_kind != kind) { 8859 if (thousands_sep_kind < kind) { 8860 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 8861 if (!thousands_sep_data) 8862 return -1; 8863 } 8864 else { 8865 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 8866 if (!data) 8867 return -1; 8868 } 8869 } 8870 8871 switch (kind) { 8872 case PyUnicode_1BYTE_KIND: 8873 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 8874 len = asciilib_InsertThousandsGrouping( 8875 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 8876 min_width, grouping, 8877 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8878 else 8879 len = ucs1lib_InsertThousandsGrouping( 8880 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8881 min_width, grouping, 8882 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8883 break; 8884 case PyUnicode_2BYTE_KIND: 8885 len = ucs2lib_InsertThousandsGrouping( 8886 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 8887 min_width, grouping, 8888 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 8889 break; 8890 case PyUnicode_4BYTE_KIND: 8891 len = ucs4lib_InsertThousandsGrouping( 8892 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 8893 min_width, grouping, 8894 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 8895 break; 8896 default: 8897 assert(0); 8898 return -1; 8899 } 8900 if (unicode != NULL && thousands_sep_kind != kind) { 8901 if (thousands_sep_kind < kind) 8902 PyMem_Free(thousands_sep_data); 8903 else 8904 PyMem_Free(data); 8905 } 8906 if (unicode == NULL) { 8907 *maxchar = 127; 8908 if (len != n_digits) { 8909 *maxchar = MAX_MAXCHAR(*maxchar, 8910 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 8911 } 8912 } 8913 return len; 8914} 8915 8916 8917/* helper macro to fixup start/end slice values */ 8918#define ADJUST_INDICES(start, end, len) \ 8919 if (end > len) \ 8920 end = len; \ 8921 else if (end < 0) { \ 8922 end += len; \ 8923 if (end < 0) \ 8924 end = 0; \ 8925 } \ 8926 if (start < 0) { \ 8927 start += len; \ 8928 if (start < 0) \ 8929 start = 0; \ 8930 } 8931 8932Py_ssize_t 8933PyUnicode_Count(PyObject *str, 8934 PyObject *substr, 8935 Py_ssize_t start, 8936 Py_ssize_t end) 8937{ 8938 Py_ssize_t result; 8939 PyObject* str_obj; 8940 PyObject* sub_obj; 8941 int kind1, kind2, kind; 8942 void *buf1 = NULL, *buf2 = NULL; 8943 Py_ssize_t len1, len2; 8944 8945 str_obj = PyUnicode_FromObject(str); 8946 if (!str_obj) 8947 return -1; 8948 sub_obj = PyUnicode_FromObject(substr); 8949 if (!sub_obj) { 8950 Py_DECREF(str_obj); 8951 return -1; 8952 } 8953 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 8954 Py_DECREF(sub_obj); 8955 Py_DECREF(str_obj); 8956 return -1; 8957 } 8958 8959 kind1 = PyUnicode_KIND(str_obj); 8960 kind2 = PyUnicode_KIND(sub_obj); 8961 kind = kind1; 8962 buf1 = PyUnicode_DATA(str_obj); 8963 buf2 = PyUnicode_DATA(sub_obj); 8964 if (kind2 != kind) { 8965 if (kind2 > kind) { 8966 Py_DECREF(sub_obj); 8967 Py_DECREF(str_obj); 8968 return 0; 8969 } 8970 buf2 = _PyUnicode_AsKind(sub_obj, kind); 8971 } 8972 if (!buf2) 8973 goto onError; 8974 len1 = PyUnicode_GET_LENGTH(str_obj); 8975 len2 = PyUnicode_GET_LENGTH(sub_obj); 8976 8977 ADJUST_INDICES(start, end, len1); 8978 switch (kind) { 8979 case PyUnicode_1BYTE_KIND: 8980 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 8981 result = asciilib_count( 8982 ((Py_UCS1*)buf1) + start, end - start, 8983 buf2, len2, PY_SSIZE_T_MAX 8984 ); 8985 else 8986 result = ucs1lib_count( 8987 ((Py_UCS1*)buf1) + start, end - start, 8988 buf2, len2, PY_SSIZE_T_MAX 8989 ); 8990 break; 8991 case PyUnicode_2BYTE_KIND: 8992 result = ucs2lib_count( 8993 ((Py_UCS2*)buf1) + start, end - start, 8994 buf2, len2, PY_SSIZE_T_MAX 8995 ); 8996 break; 8997 case PyUnicode_4BYTE_KIND: 8998 result = ucs4lib_count( 8999 ((Py_UCS4*)buf1) + start, end - start, 9000 buf2, len2, PY_SSIZE_T_MAX 9001 ); 9002 break; 9003 default: 9004 assert(0); result = 0; 9005 } 9006 9007 Py_DECREF(sub_obj); 9008 Py_DECREF(str_obj); 9009 9010 if (kind2 != kind) 9011 PyMem_Free(buf2); 9012 9013 return result; 9014 onError: 9015 Py_DECREF(sub_obj); 9016 Py_DECREF(str_obj); 9017 if (kind2 != kind && buf2) 9018 PyMem_Free(buf2); 9019 return -1; 9020} 9021 9022Py_ssize_t 9023PyUnicode_Find(PyObject *str, 9024 PyObject *sub, 9025 Py_ssize_t start, 9026 Py_ssize_t end, 9027 int direction) 9028{ 9029 Py_ssize_t result; 9030 9031 str = PyUnicode_FromObject(str); 9032 if (!str) 9033 return -2; 9034 sub = PyUnicode_FromObject(sub); 9035 if (!sub) { 9036 Py_DECREF(str); 9037 return -2; 9038 } 9039 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 9040 Py_DECREF(sub); 9041 Py_DECREF(str); 9042 return -2; 9043 } 9044 9045 result = any_find_slice(direction, 9046 str, sub, start, end 9047 ); 9048 9049 Py_DECREF(str); 9050 Py_DECREF(sub); 9051 9052 return result; 9053} 9054 9055Py_ssize_t 9056PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9057 Py_ssize_t start, Py_ssize_t end, 9058 int direction) 9059{ 9060 int kind; 9061 Py_ssize_t result; 9062 if (PyUnicode_READY(str) == -1) 9063 return -2; 9064 if (start < 0 || end < 0) { 9065 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9066 return -2; 9067 } 9068 if (end > PyUnicode_GET_LENGTH(str)) 9069 end = PyUnicode_GET_LENGTH(str); 9070 kind = PyUnicode_KIND(str); 9071 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9072 kind, end-start, ch, direction); 9073 if (result == -1) 9074 return -1; 9075 else 9076 return start + result; 9077} 9078 9079static int 9080tailmatch(PyObject *self, 9081 PyObject *substring, 9082 Py_ssize_t start, 9083 Py_ssize_t end, 9084 int direction) 9085{ 9086 int kind_self; 9087 int kind_sub; 9088 void *data_self; 9089 void *data_sub; 9090 Py_ssize_t offset; 9091 Py_ssize_t i; 9092 Py_ssize_t end_sub; 9093 9094 if (PyUnicode_READY(self) == -1 || 9095 PyUnicode_READY(substring) == -1) 9096 return 0; 9097 9098 if (PyUnicode_GET_LENGTH(substring) == 0) 9099 return 1; 9100 9101 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9102 end -= PyUnicode_GET_LENGTH(substring); 9103 if (end < start) 9104 return 0; 9105 9106 kind_self = PyUnicode_KIND(self); 9107 data_self = PyUnicode_DATA(self); 9108 kind_sub = PyUnicode_KIND(substring); 9109 data_sub = PyUnicode_DATA(substring); 9110 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9111 9112 if (direction > 0) 9113 offset = end; 9114 else 9115 offset = start; 9116 9117 if (PyUnicode_READ(kind_self, data_self, offset) == 9118 PyUnicode_READ(kind_sub, data_sub, 0) && 9119 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9120 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9121 /* If both are of the same kind, memcmp is sufficient */ 9122 if (kind_self == kind_sub) { 9123 return ! memcmp((char *)data_self + 9124 (offset * PyUnicode_KIND(substring)), 9125 data_sub, 9126 PyUnicode_GET_LENGTH(substring) * 9127 PyUnicode_KIND(substring)); 9128 } 9129 /* otherwise we have to compare each character by first accesing it */ 9130 else { 9131 /* We do not need to compare 0 and len(substring)-1 because 9132 the if statement above ensured already that they are equal 9133 when we end up here. */ 9134 /* TODO: honor direction and do a forward or backwards search */ 9135 for (i = 1; i < end_sub; ++i) { 9136 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9137 PyUnicode_READ(kind_sub, data_sub, i)) 9138 return 0; 9139 } 9140 return 1; 9141 } 9142 } 9143 9144 return 0; 9145} 9146 9147Py_ssize_t 9148PyUnicode_Tailmatch(PyObject *str, 9149 PyObject *substr, 9150 Py_ssize_t start, 9151 Py_ssize_t end, 9152 int direction) 9153{ 9154 Py_ssize_t result; 9155 9156 str = PyUnicode_FromObject(str); 9157 if (str == NULL) 9158 return -1; 9159 substr = PyUnicode_FromObject(substr); 9160 if (substr == NULL) { 9161 Py_DECREF(str); 9162 return -1; 9163 } 9164 9165 result = tailmatch(str, substr, 9166 start, end, direction); 9167 Py_DECREF(str); 9168 Py_DECREF(substr); 9169 return result; 9170} 9171 9172/* Apply fixfct filter to the Unicode object self and return a 9173 reference to the modified object */ 9174 9175static PyObject * 9176fixup(PyObject *self, 9177 Py_UCS4 (*fixfct)(PyObject *s)) 9178{ 9179 PyObject *u; 9180 Py_UCS4 maxchar_old, maxchar_new = 0; 9181 PyObject *v; 9182 9183 u = _PyUnicode_Copy(self); 9184 if (u == NULL) 9185 return NULL; 9186 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9187 9188 /* fix functions return the new maximum character in a string, 9189 if the kind of the resulting unicode object does not change, 9190 everything is fine. Otherwise we need to change the string kind 9191 and re-run the fix function. */ 9192 maxchar_new = fixfct(u); 9193 9194 if (maxchar_new == 0) { 9195 /* no changes */; 9196 if (PyUnicode_CheckExact(self)) { 9197 Py_DECREF(u); 9198 Py_INCREF(self); 9199 return self; 9200 } 9201 else 9202 return u; 9203 } 9204 9205 maxchar_new = align_maxchar(maxchar_new); 9206 9207 if (maxchar_new == maxchar_old) 9208 return u; 9209 9210 /* In case the maximum character changed, we need to 9211 convert the string to the new category. */ 9212 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9213 if (v == NULL) { 9214 Py_DECREF(u); 9215 return NULL; 9216 } 9217 if (maxchar_new > maxchar_old) { 9218 /* If the maxchar increased so that the kind changed, not all 9219 characters are representable anymore and we need to fix the 9220 string again. This only happens in very few cases. */ 9221 _PyUnicode_FastCopyCharacters(v, 0, 9222 self, 0, PyUnicode_GET_LENGTH(self)); 9223 maxchar_old = fixfct(v); 9224 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9225 } 9226 else { 9227 _PyUnicode_FastCopyCharacters(v, 0, 9228 u, 0, PyUnicode_GET_LENGTH(self)); 9229 } 9230 Py_DECREF(u); 9231 assert(_PyUnicode_CheckConsistency(v, 1)); 9232 return v; 9233} 9234 9235static PyObject * 9236ascii_upper_or_lower(PyObject *self, int lower) 9237{ 9238 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9239 char *resdata, *data = PyUnicode_DATA(self); 9240 PyObject *res; 9241 9242 res = PyUnicode_New(len, 127); 9243 if (res == NULL) 9244 return NULL; 9245 resdata = PyUnicode_DATA(res); 9246 if (lower) 9247 _Py_bytes_lower(resdata, data, len); 9248 else 9249 _Py_bytes_upper(resdata, data, len); 9250 return res; 9251} 9252 9253static Py_UCS4 9254handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9255{ 9256 Py_ssize_t j; 9257 int final_sigma; 9258 Py_UCS4 c; 9259 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9260 9261 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9262 9263 where ! is a negation and \p{xxx} is a character with property xxx. 9264 */ 9265 for (j = i - 1; j >= 0; j--) { 9266 c = PyUnicode_READ(kind, data, j); 9267 if (!_PyUnicode_IsCaseIgnorable(c)) 9268 break; 9269 } 9270 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9271 if (final_sigma) { 9272 for (j = i + 1; j < length; j++) { 9273 c = PyUnicode_READ(kind, data, j); 9274 if (!_PyUnicode_IsCaseIgnorable(c)) 9275 break; 9276 } 9277 final_sigma = j == length || !_PyUnicode_IsCased(c); 9278 } 9279 return (final_sigma) ? 0x3C2 : 0x3C3; 9280} 9281 9282static int 9283lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9284 Py_UCS4 c, Py_UCS4 *mapped) 9285{ 9286 /* Obscure special case. */ 9287 if (c == 0x3A3) { 9288 mapped[0] = handle_capital_sigma(kind, data, length, i); 9289 return 1; 9290 } 9291 return _PyUnicode_ToLowerFull(c, mapped); 9292} 9293 9294static Py_ssize_t 9295do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9296{ 9297 Py_ssize_t i, k = 0; 9298 int n_res, j; 9299 Py_UCS4 c, mapped[3]; 9300 9301 c = PyUnicode_READ(kind, data, 0); 9302 n_res = _PyUnicode_ToUpperFull(c, mapped); 9303 for (j = 0; j < n_res; j++) { 9304 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9305 res[k++] = mapped[j]; 9306 } 9307 for (i = 1; i < length; i++) { 9308 c = PyUnicode_READ(kind, data, i); 9309 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9310 for (j = 0; j < n_res; j++) { 9311 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9312 res[k++] = mapped[j]; 9313 } 9314 } 9315 return k; 9316} 9317 9318static Py_ssize_t 9319do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9320 Py_ssize_t i, k = 0; 9321 9322 for (i = 0; i < length; i++) { 9323 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9324 int n_res, j; 9325 if (Py_UNICODE_ISUPPER(c)) { 9326 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9327 } 9328 else if (Py_UNICODE_ISLOWER(c)) { 9329 n_res = _PyUnicode_ToUpperFull(c, mapped); 9330 } 9331 else { 9332 n_res = 1; 9333 mapped[0] = c; 9334 } 9335 for (j = 0; j < n_res; j++) { 9336 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9337 res[k++] = mapped[j]; 9338 } 9339 } 9340 return k; 9341} 9342 9343static Py_ssize_t 9344do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9345 Py_UCS4 *maxchar, int lower) 9346{ 9347 Py_ssize_t i, k = 0; 9348 9349 for (i = 0; i < length; i++) { 9350 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9351 int n_res, j; 9352 if (lower) 9353 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9354 else 9355 n_res = _PyUnicode_ToUpperFull(c, mapped); 9356 for (j = 0; j < n_res; j++) { 9357 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9358 res[k++] = mapped[j]; 9359 } 9360 } 9361 return k; 9362} 9363 9364static Py_ssize_t 9365do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9366{ 9367 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9368} 9369 9370static Py_ssize_t 9371do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9372{ 9373 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9374} 9375 9376static Py_ssize_t 9377do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9378{ 9379 Py_ssize_t i, k = 0; 9380 9381 for (i = 0; i < length; i++) { 9382 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9383 Py_UCS4 mapped[3]; 9384 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9385 for (j = 0; j < n_res; j++) { 9386 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9387 res[k++] = mapped[j]; 9388 } 9389 } 9390 return k; 9391} 9392 9393static Py_ssize_t 9394do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9395{ 9396 Py_ssize_t i, k = 0; 9397 int previous_is_cased; 9398 9399 previous_is_cased = 0; 9400 for (i = 0; i < length; i++) { 9401 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9402 Py_UCS4 mapped[3]; 9403 int n_res, j; 9404 9405 if (previous_is_cased) 9406 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9407 else 9408 n_res = _PyUnicode_ToTitleFull(c, mapped); 9409 9410 for (j = 0; j < n_res; j++) { 9411 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9412 res[k++] = mapped[j]; 9413 } 9414 9415 previous_is_cased = _PyUnicode_IsCased(c); 9416 } 9417 return k; 9418} 9419 9420static PyObject * 9421case_operation(PyObject *self, 9422 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9423{ 9424 PyObject *res = NULL; 9425 Py_ssize_t length, newlength = 0; 9426 int kind, outkind; 9427 void *data, *outdata; 9428 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9429 9430 assert(PyUnicode_IS_READY(self)); 9431 9432 kind = PyUnicode_KIND(self); 9433 data = PyUnicode_DATA(self); 9434 length = PyUnicode_GET_LENGTH(self); 9435 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9436 if (tmp == NULL) 9437 return PyErr_NoMemory(); 9438 newlength = perform(kind, data, length, tmp, &maxchar); 9439 res = PyUnicode_New(newlength, maxchar); 9440 if (res == NULL) 9441 goto leave; 9442 tmpend = tmp + newlength; 9443 outdata = PyUnicode_DATA(res); 9444 outkind = PyUnicode_KIND(res); 9445 switch (outkind) { 9446 case PyUnicode_1BYTE_KIND: 9447 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9448 break; 9449 case PyUnicode_2BYTE_KIND: 9450 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9451 break; 9452 case PyUnicode_4BYTE_KIND: 9453 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9454 break; 9455 default: 9456 assert(0); 9457 break; 9458 } 9459 leave: 9460 PyMem_FREE(tmp); 9461 return res; 9462} 9463 9464PyObject * 9465PyUnicode_Join(PyObject *separator, PyObject *seq) 9466{ 9467 PyObject *sep = NULL; 9468 Py_ssize_t seplen; 9469 PyObject *res = NULL; /* the result */ 9470 PyObject *fseq; /* PySequence_Fast(seq) */ 9471 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9472 PyObject **items; 9473 PyObject *item; 9474 Py_ssize_t sz, i, res_offset; 9475 Py_UCS4 maxchar; 9476 Py_UCS4 item_maxchar; 9477 int use_memcpy; 9478 unsigned char *res_data = NULL, *sep_data = NULL; 9479 PyObject *last_obj; 9480 unsigned int kind = 0; 9481 9482 fseq = PySequence_Fast(seq, ""); 9483 if (fseq == NULL) { 9484 return NULL; 9485 } 9486 9487 /* NOTE: the following code can't call back into Python code, 9488 * so we are sure that fseq won't be mutated. 9489 */ 9490 9491 seqlen = PySequence_Fast_GET_SIZE(fseq); 9492 /* If empty sequence, return u"". */ 9493 if (seqlen == 0) { 9494 Py_DECREF(fseq); 9495 Py_INCREF(unicode_empty); 9496 res = unicode_empty; 9497 return res; 9498 } 9499 9500 /* If singleton sequence with an exact Unicode, return that. */ 9501 last_obj = NULL; 9502 items = PySequence_Fast_ITEMS(fseq); 9503 if (seqlen == 1) { 9504 if (PyUnicode_CheckExact(items[0])) { 9505 res = items[0]; 9506 Py_INCREF(res); 9507 Py_DECREF(fseq); 9508 return res; 9509 } 9510 seplen = 0; 9511 maxchar = 0; 9512 } 9513 else { 9514 /* Set up sep and seplen */ 9515 if (separator == NULL) { 9516 /* fall back to a blank space separator */ 9517 sep = PyUnicode_FromOrdinal(' '); 9518 if (!sep) 9519 goto onError; 9520 seplen = 1; 9521 maxchar = 32; 9522 } 9523 else { 9524 if (!PyUnicode_Check(separator)) { 9525 PyErr_Format(PyExc_TypeError, 9526 "separator: expected str instance," 9527 " %.80s found", 9528 Py_TYPE(separator)->tp_name); 9529 goto onError; 9530 } 9531 if (PyUnicode_READY(separator)) 9532 goto onError; 9533 sep = separator; 9534 seplen = PyUnicode_GET_LENGTH(separator); 9535 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9536 /* inc refcount to keep this code path symmetric with the 9537 above case of a blank separator */ 9538 Py_INCREF(sep); 9539 } 9540 last_obj = sep; 9541 } 9542 9543 /* There are at least two things to join, or else we have a subclass 9544 * of str in the sequence. 9545 * Do a pre-pass to figure out the total amount of space we'll 9546 * need (sz), and see whether all argument are strings. 9547 */ 9548 sz = 0; 9549#ifdef Py_DEBUG 9550 use_memcpy = 0; 9551#else 9552 use_memcpy = 1; 9553#endif 9554 for (i = 0; i < seqlen; i++) { 9555 const Py_ssize_t old_sz = sz; 9556 item = items[i]; 9557 if (!PyUnicode_Check(item)) { 9558 PyErr_Format(PyExc_TypeError, 9559 "sequence item %zd: expected str instance," 9560 " %.80s found", 9561 i, Py_TYPE(item)->tp_name); 9562 goto onError; 9563 } 9564 if (PyUnicode_READY(item) == -1) 9565 goto onError; 9566 sz += PyUnicode_GET_LENGTH(item); 9567 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9568 maxchar = MAX_MAXCHAR(maxchar, item_maxchar); 9569 if (i != 0) 9570 sz += seplen; 9571 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9572 PyErr_SetString(PyExc_OverflowError, 9573 "join() result is too long for a Python string"); 9574 goto onError; 9575 } 9576 if (use_memcpy && last_obj != NULL) { 9577 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9578 use_memcpy = 0; 9579 } 9580 last_obj = item; 9581 } 9582 9583 res = PyUnicode_New(sz, maxchar); 9584 if (res == NULL) 9585 goto onError; 9586 9587 /* Catenate everything. */ 9588#ifdef Py_DEBUG 9589 use_memcpy = 0; 9590#else 9591 if (use_memcpy) { 9592 res_data = PyUnicode_1BYTE_DATA(res); 9593 kind = PyUnicode_KIND(res); 9594 if (seplen != 0) 9595 sep_data = PyUnicode_1BYTE_DATA(sep); 9596 } 9597#endif 9598 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9599 Py_ssize_t itemlen; 9600 item = items[i]; 9601 /* Copy item, and maybe the separator. */ 9602 if (i && seplen != 0) { 9603 if (use_memcpy) { 9604 Py_MEMCPY(res_data, 9605 sep_data, 9606 kind * seplen); 9607 res_data += kind * seplen; 9608 } 9609 else { 9610 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9611 res_offset += seplen; 9612 } 9613 } 9614 itemlen = PyUnicode_GET_LENGTH(item); 9615 if (itemlen != 0) { 9616 if (use_memcpy) { 9617 Py_MEMCPY(res_data, 9618 PyUnicode_DATA(item), 9619 kind * itemlen); 9620 res_data += kind * itemlen; 9621 } 9622 else { 9623 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9624 res_offset += itemlen; 9625 } 9626 } 9627 } 9628 if (use_memcpy) 9629 assert(res_data == PyUnicode_1BYTE_DATA(res) 9630 + kind * PyUnicode_GET_LENGTH(res)); 9631 else 9632 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9633 9634 Py_DECREF(fseq); 9635 Py_XDECREF(sep); 9636 assert(_PyUnicode_CheckConsistency(res, 1)); 9637 return res; 9638 9639 onError: 9640 Py_DECREF(fseq); 9641 Py_XDECREF(sep); 9642 Py_XDECREF(res); 9643 return NULL; 9644} 9645 9646#define FILL(kind, data, value, start, length) \ 9647 do { \ 9648 Py_ssize_t i_ = 0; \ 9649 assert(kind != PyUnicode_WCHAR_KIND); \ 9650 switch ((kind)) { \ 9651 case PyUnicode_1BYTE_KIND: { \ 9652 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9653 memset(to_, (unsigned char)value, (length)); \ 9654 break; \ 9655 } \ 9656 case PyUnicode_2BYTE_KIND: { \ 9657 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9658 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9659 break; \ 9660 } \ 9661 case PyUnicode_4BYTE_KIND: { \ 9662 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9663 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9664 break; \ 9665 default: assert(0); \ 9666 } \ 9667 } \ 9668 } while (0) 9669 9670void 9671_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9672 Py_UCS4 fill_char) 9673{ 9674 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9675 const void *data = PyUnicode_DATA(unicode); 9676 assert(PyUnicode_IS_READY(unicode)); 9677 assert(unicode_modifiable(unicode)); 9678 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9679 assert(start >= 0); 9680 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9681 FILL(kind, data, fill_char, start, length); 9682} 9683 9684Py_ssize_t 9685PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9686 Py_UCS4 fill_char) 9687{ 9688 Py_ssize_t maxlen; 9689 9690 if (!PyUnicode_Check(unicode)) { 9691 PyErr_BadInternalCall(); 9692 return -1; 9693 } 9694 if (PyUnicode_READY(unicode) == -1) 9695 return -1; 9696 if (unicode_check_modifiable(unicode)) 9697 return -1; 9698 9699 if (start < 0) { 9700 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9701 return -1; 9702 } 9703 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9704 PyErr_SetString(PyExc_ValueError, 9705 "fill character is bigger than " 9706 "the string maximum character"); 9707 return -1; 9708 } 9709 9710 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9711 length = Py_MIN(maxlen, length); 9712 if (length <= 0) 9713 return 0; 9714 9715 _PyUnicode_FastFill(unicode, start, length, fill_char); 9716 return length; 9717} 9718 9719static PyObject * 9720pad(PyObject *self, 9721 Py_ssize_t left, 9722 Py_ssize_t right, 9723 Py_UCS4 fill) 9724{ 9725 PyObject *u; 9726 Py_UCS4 maxchar; 9727 int kind; 9728 void *data; 9729 9730 if (left < 0) 9731 left = 0; 9732 if (right < 0) 9733 right = 0; 9734 9735 if (left == 0 && right == 0) 9736 return unicode_result_unchanged(self); 9737 9738 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9739 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9740 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9741 return NULL; 9742 } 9743 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9744 maxchar = MAX_MAXCHAR(maxchar, fill); 9745 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9746 if (!u) 9747 return NULL; 9748 9749 kind = PyUnicode_KIND(u); 9750 data = PyUnicode_DATA(u); 9751 if (left) 9752 FILL(kind, data, fill, 0, left); 9753 if (right) 9754 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9755 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9756 assert(_PyUnicode_CheckConsistency(u, 1)); 9757 return u; 9758} 9759 9760PyObject * 9761PyUnicode_Splitlines(PyObject *string, int keepends) 9762{ 9763 PyObject *list; 9764 9765 string = PyUnicode_FromObject(string); 9766 if (string == NULL) 9767 return NULL; 9768 if (PyUnicode_READY(string) == -1) { 9769 Py_DECREF(string); 9770 return NULL; 9771 } 9772 9773 switch (PyUnicode_KIND(string)) { 9774 case PyUnicode_1BYTE_KIND: 9775 if (PyUnicode_IS_ASCII(string)) 9776 list = asciilib_splitlines( 9777 string, PyUnicode_1BYTE_DATA(string), 9778 PyUnicode_GET_LENGTH(string), keepends); 9779 else 9780 list = ucs1lib_splitlines( 9781 string, PyUnicode_1BYTE_DATA(string), 9782 PyUnicode_GET_LENGTH(string), keepends); 9783 break; 9784 case PyUnicode_2BYTE_KIND: 9785 list = ucs2lib_splitlines( 9786 string, PyUnicode_2BYTE_DATA(string), 9787 PyUnicode_GET_LENGTH(string), keepends); 9788 break; 9789 case PyUnicode_4BYTE_KIND: 9790 list = ucs4lib_splitlines( 9791 string, PyUnicode_4BYTE_DATA(string), 9792 PyUnicode_GET_LENGTH(string), keepends); 9793 break; 9794 default: 9795 assert(0); 9796 list = 0; 9797 } 9798 Py_DECREF(string); 9799 return list; 9800} 9801 9802static PyObject * 9803split(PyObject *self, 9804 PyObject *substring, 9805 Py_ssize_t maxcount) 9806{ 9807 int kind1, kind2, kind; 9808 void *buf1, *buf2; 9809 Py_ssize_t len1, len2; 9810 PyObject* out; 9811 9812 if (maxcount < 0) 9813 maxcount = PY_SSIZE_T_MAX; 9814 9815 if (PyUnicode_READY(self) == -1) 9816 return NULL; 9817 9818 if (substring == NULL) 9819 switch (PyUnicode_KIND(self)) { 9820 case PyUnicode_1BYTE_KIND: 9821 if (PyUnicode_IS_ASCII(self)) 9822 return asciilib_split_whitespace( 9823 self, PyUnicode_1BYTE_DATA(self), 9824 PyUnicode_GET_LENGTH(self), maxcount 9825 ); 9826 else 9827 return ucs1lib_split_whitespace( 9828 self, PyUnicode_1BYTE_DATA(self), 9829 PyUnicode_GET_LENGTH(self), maxcount 9830 ); 9831 case PyUnicode_2BYTE_KIND: 9832 return ucs2lib_split_whitespace( 9833 self, PyUnicode_2BYTE_DATA(self), 9834 PyUnicode_GET_LENGTH(self), maxcount 9835 ); 9836 case PyUnicode_4BYTE_KIND: 9837 return ucs4lib_split_whitespace( 9838 self, PyUnicode_4BYTE_DATA(self), 9839 PyUnicode_GET_LENGTH(self), maxcount 9840 ); 9841 default: 9842 assert(0); 9843 return NULL; 9844 } 9845 9846 if (PyUnicode_READY(substring) == -1) 9847 return NULL; 9848 9849 kind1 = PyUnicode_KIND(self); 9850 kind2 = PyUnicode_KIND(substring); 9851 kind = kind1 > kind2 ? kind1 : kind2; 9852 buf1 = PyUnicode_DATA(self); 9853 buf2 = PyUnicode_DATA(substring); 9854 if (kind1 != kind) 9855 buf1 = _PyUnicode_AsKind(self, kind); 9856 if (!buf1) 9857 return NULL; 9858 if (kind2 != kind) 9859 buf2 = _PyUnicode_AsKind(substring, kind); 9860 if (!buf2) { 9861 if (kind1 != kind) PyMem_Free(buf1); 9862 return NULL; 9863 } 9864 len1 = PyUnicode_GET_LENGTH(self); 9865 len2 = PyUnicode_GET_LENGTH(substring); 9866 9867 switch (kind) { 9868 case PyUnicode_1BYTE_KIND: 9869 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9870 out = asciilib_split( 9871 self, buf1, len1, buf2, len2, maxcount); 9872 else 9873 out = ucs1lib_split( 9874 self, buf1, len1, buf2, len2, maxcount); 9875 break; 9876 case PyUnicode_2BYTE_KIND: 9877 out = ucs2lib_split( 9878 self, buf1, len1, buf2, len2, maxcount); 9879 break; 9880 case PyUnicode_4BYTE_KIND: 9881 out = ucs4lib_split( 9882 self, buf1, len1, buf2, len2, maxcount); 9883 break; 9884 default: 9885 out = NULL; 9886 } 9887 if (kind1 != kind) 9888 PyMem_Free(buf1); 9889 if (kind2 != kind) 9890 PyMem_Free(buf2); 9891 return out; 9892} 9893 9894static PyObject * 9895rsplit(PyObject *self, 9896 PyObject *substring, 9897 Py_ssize_t maxcount) 9898{ 9899 int kind1, kind2, kind; 9900 void *buf1, *buf2; 9901 Py_ssize_t len1, len2; 9902 PyObject* out; 9903 9904 if (maxcount < 0) 9905 maxcount = PY_SSIZE_T_MAX; 9906 9907 if (PyUnicode_READY(self) == -1) 9908 return NULL; 9909 9910 if (substring == NULL) 9911 switch (PyUnicode_KIND(self)) { 9912 case PyUnicode_1BYTE_KIND: 9913 if (PyUnicode_IS_ASCII(self)) 9914 return asciilib_rsplit_whitespace( 9915 self, PyUnicode_1BYTE_DATA(self), 9916 PyUnicode_GET_LENGTH(self), maxcount 9917 ); 9918 else 9919 return ucs1lib_rsplit_whitespace( 9920 self, PyUnicode_1BYTE_DATA(self), 9921 PyUnicode_GET_LENGTH(self), maxcount 9922 ); 9923 case PyUnicode_2BYTE_KIND: 9924 return ucs2lib_rsplit_whitespace( 9925 self, PyUnicode_2BYTE_DATA(self), 9926 PyUnicode_GET_LENGTH(self), maxcount 9927 ); 9928 case PyUnicode_4BYTE_KIND: 9929 return ucs4lib_rsplit_whitespace( 9930 self, PyUnicode_4BYTE_DATA(self), 9931 PyUnicode_GET_LENGTH(self), maxcount 9932 ); 9933 default: 9934 assert(0); 9935 return NULL; 9936 } 9937 9938 if (PyUnicode_READY(substring) == -1) 9939 return NULL; 9940 9941 kind1 = PyUnicode_KIND(self); 9942 kind2 = PyUnicode_KIND(substring); 9943 kind = kind1 > kind2 ? kind1 : kind2; 9944 buf1 = PyUnicode_DATA(self); 9945 buf2 = PyUnicode_DATA(substring); 9946 if (kind1 != kind) 9947 buf1 = _PyUnicode_AsKind(self, kind); 9948 if (!buf1) 9949 return NULL; 9950 if (kind2 != kind) 9951 buf2 = _PyUnicode_AsKind(substring, kind); 9952 if (!buf2) { 9953 if (kind1 != kind) PyMem_Free(buf1); 9954 return NULL; 9955 } 9956 len1 = PyUnicode_GET_LENGTH(self); 9957 len2 = PyUnicode_GET_LENGTH(substring); 9958 9959 switch (kind) { 9960 case PyUnicode_1BYTE_KIND: 9961 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9962 out = asciilib_rsplit( 9963 self, buf1, len1, buf2, len2, maxcount); 9964 else 9965 out = ucs1lib_rsplit( 9966 self, buf1, len1, buf2, len2, maxcount); 9967 break; 9968 case PyUnicode_2BYTE_KIND: 9969 out = ucs2lib_rsplit( 9970 self, buf1, len1, buf2, len2, maxcount); 9971 break; 9972 case PyUnicode_4BYTE_KIND: 9973 out = ucs4lib_rsplit( 9974 self, buf1, len1, buf2, len2, maxcount); 9975 break; 9976 default: 9977 out = NULL; 9978 } 9979 if (kind1 != kind) 9980 PyMem_Free(buf1); 9981 if (kind2 != kind) 9982 PyMem_Free(buf2); 9983 return out; 9984} 9985 9986static Py_ssize_t 9987anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 9988 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9989{ 9990 switch (kind) { 9991 case PyUnicode_1BYTE_KIND: 9992 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 9993 return asciilib_find(buf1, len1, buf2, len2, offset); 9994 else 9995 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9996 case PyUnicode_2BYTE_KIND: 9997 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9998 case PyUnicode_4BYTE_KIND: 9999 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10000 } 10001 assert(0); 10002 return -1; 10003} 10004 10005static Py_ssize_t 10006anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10007 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10008{ 10009 switch (kind) { 10010 case PyUnicode_1BYTE_KIND: 10011 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10012 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10013 else 10014 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10015 case PyUnicode_2BYTE_KIND: 10016 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10017 case PyUnicode_4BYTE_KIND: 10018 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10019 } 10020 assert(0); 10021 return 0; 10022} 10023 10024static PyObject * 10025replace(PyObject *self, PyObject *str1, 10026 PyObject *str2, Py_ssize_t maxcount) 10027{ 10028 PyObject *u; 10029 char *sbuf = PyUnicode_DATA(self); 10030 char *buf1 = PyUnicode_DATA(str1); 10031 char *buf2 = PyUnicode_DATA(str2); 10032 int srelease = 0, release1 = 0, release2 = 0; 10033 int skind = PyUnicode_KIND(self); 10034 int kind1 = PyUnicode_KIND(str1); 10035 int kind2 = PyUnicode_KIND(str2); 10036 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10037 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10038 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10039 int mayshrink; 10040 Py_UCS4 maxchar, maxchar_str2; 10041 10042 if (maxcount < 0) 10043 maxcount = PY_SSIZE_T_MAX; 10044 else if (maxcount == 0 || slen == 0) 10045 goto nothing; 10046 10047 if (str1 == str2) 10048 goto nothing; 10049 if (skind < kind1) 10050 /* substring too wide to be present */ 10051 goto nothing; 10052 10053 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10054 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10055 /* Replacing str1 with str2 may cause a maxchar reduction in the 10056 result string. */ 10057 mayshrink = (maxchar_str2 < maxchar); 10058 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2); 10059 10060 if (len1 == len2) { 10061 /* same length */ 10062 if (len1 == 0) 10063 goto nothing; 10064 if (len1 == 1) { 10065 /* replace characters */ 10066 Py_UCS4 u1, u2; 10067 int rkind; 10068 Py_ssize_t index, pos; 10069 char *src; 10070 10071 u1 = PyUnicode_READ_CHAR(str1, 0); 10072 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1); 10073 if (pos < 0) 10074 goto nothing; 10075 u2 = PyUnicode_READ_CHAR(str2, 0); 10076 u = PyUnicode_New(slen, maxchar); 10077 if (!u) 10078 goto error; 10079 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10080 rkind = PyUnicode_KIND(u); 10081 10082 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2); 10083 index = 0; 10084 src = sbuf; 10085 while (--maxcount) 10086 { 10087 pos++; 10088 src += pos * PyUnicode_KIND(self); 10089 slen -= pos; 10090 index += pos; 10091 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1); 10092 if (pos < 0) 10093 break; 10094 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2); 10095 } 10096 } 10097 else { 10098 int rkind = skind; 10099 char *res; 10100 Py_ssize_t i; 10101 10102 if (kind1 < rkind) { 10103 /* widen substring */ 10104 buf1 = _PyUnicode_AsKind(str1, rkind); 10105 if (!buf1) goto error; 10106 release1 = 1; 10107 } 10108 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10109 if (i < 0) 10110 goto nothing; 10111 if (rkind > kind2) { 10112 /* widen replacement */ 10113 buf2 = _PyUnicode_AsKind(str2, rkind); 10114 if (!buf2) goto error; 10115 release2 = 1; 10116 } 10117 else if (rkind < kind2) { 10118 /* widen self and buf1 */ 10119 rkind = kind2; 10120 if (release1) PyMem_Free(buf1); 10121 release1 = 0; 10122 sbuf = _PyUnicode_AsKind(self, rkind); 10123 if (!sbuf) goto error; 10124 srelease = 1; 10125 buf1 = _PyUnicode_AsKind(str1, rkind); 10126 if (!buf1) goto error; 10127 release1 = 1; 10128 } 10129 u = PyUnicode_New(slen, maxchar); 10130 if (!u) 10131 goto error; 10132 assert(PyUnicode_KIND(u) == rkind); 10133 res = PyUnicode_DATA(u); 10134 10135 memcpy(res, sbuf, rkind * slen); 10136 /* change everything in-place, starting with this one */ 10137 memcpy(res + rkind * i, 10138 buf2, 10139 rkind * len2); 10140 i += len1; 10141 10142 while ( --maxcount > 0) { 10143 i = anylib_find(rkind, self, 10144 sbuf+rkind*i, slen-i, 10145 str1, buf1, len1, i); 10146 if (i == -1) 10147 break; 10148 memcpy(res + rkind * i, 10149 buf2, 10150 rkind * len2); 10151 i += len1; 10152 } 10153 } 10154 } 10155 else { 10156 Py_ssize_t n, i, j, ires; 10157 Py_ssize_t new_size; 10158 int rkind = skind; 10159 char *res; 10160 10161 if (kind1 < rkind) { 10162 /* widen substring */ 10163 buf1 = _PyUnicode_AsKind(str1, rkind); 10164 if (!buf1) goto error; 10165 release1 = 1; 10166 } 10167 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10168 if (n == 0) 10169 goto nothing; 10170 if (kind2 < rkind) { 10171 /* widen replacement */ 10172 buf2 = _PyUnicode_AsKind(str2, rkind); 10173 if (!buf2) goto error; 10174 release2 = 1; 10175 } 10176 else if (kind2 > rkind) { 10177 /* widen self and buf1 */ 10178 rkind = kind2; 10179 sbuf = _PyUnicode_AsKind(self, rkind); 10180 if (!sbuf) goto error; 10181 srelease = 1; 10182 if (release1) PyMem_Free(buf1); 10183 release1 = 0; 10184 buf1 = _PyUnicode_AsKind(str1, rkind); 10185 if (!buf1) goto error; 10186 release1 = 1; 10187 } 10188 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10189 PyUnicode_GET_LENGTH(str1))); */ 10190 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10191 PyErr_SetString(PyExc_OverflowError, 10192 "replace string is too long"); 10193 goto error; 10194 } 10195 new_size = slen + n * (len2 - len1); 10196 if (new_size == 0) { 10197 Py_INCREF(unicode_empty); 10198 u = unicode_empty; 10199 goto done; 10200 } 10201 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10202 PyErr_SetString(PyExc_OverflowError, 10203 "replace string is too long"); 10204 goto error; 10205 } 10206 u = PyUnicode_New(new_size, maxchar); 10207 if (!u) 10208 goto error; 10209 assert(PyUnicode_KIND(u) == rkind); 10210 res = PyUnicode_DATA(u); 10211 ires = i = 0; 10212 if (len1 > 0) { 10213 while (n-- > 0) { 10214 /* look for next match */ 10215 j = anylib_find(rkind, self, 10216 sbuf + rkind * i, slen-i, 10217 str1, buf1, len1, i); 10218 if (j == -1) 10219 break; 10220 else if (j > i) { 10221 /* copy unchanged part [i:j] */ 10222 memcpy(res + rkind * ires, 10223 sbuf + rkind * i, 10224 rkind * (j-i)); 10225 ires += j - i; 10226 } 10227 /* copy substitution string */ 10228 if (len2 > 0) { 10229 memcpy(res + rkind * ires, 10230 buf2, 10231 rkind * len2); 10232 ires += len2; 10233 } 10234 i = j + len1; 10235 } 10236 if (i < slen) 10237 /* copy tail [i:] */ 10238 memcpy(res + rkind * ires, 10239 sbuf + rkind * i, 10240 rkind * (slen-i)); 10241 } 10242 else { 10243 /* interleave */ 10244 while (n > 0) { 10245 memcpy(res + rkind * ires, 10246 buf2, 10247 rkind * len2); 10248 ires += len2; 10249 if (--n <= 0) 10250 break; 10251 memcpy(res + rkind * ires, 10252 sbuf + rkind * i, 10253 rkind); 10254 ires++; 10255 i++; 10256 } 10257 memcpy(res + rkind * ires, 10258 sbuf + rkind * i, 10259 rkind * (slen-i)); 10260 } 10261 } 10262 10263 if (mayshrink) { 10264 unicode_adjust_maxchar(&u); 10265 if (u == NULL) 10266 goto error; 10267 } 10268 10269 done: 10270 if (srelease) 10271 PyMem_FREE(sbuf); 10272 if (release1) 10273 PyMem_FREE(buf1); 10274 if (release2) 10275 PyMem_FREE(buf2); 10276 assert(_PyUnicode_CheckConsistency(u, 1)); 10277 return u; 10278 10279 nothing: 10280 /* nothing to replace; return original string (when possible) */ 10281 if (srelease) 10282 PyMem_FREE(sbuf); 10283 if (release1) 10284 PyMem_FREE(buf1); 10285 if (release2) 10286 PyMem_FREE(buf2); 10287 return unicode_result_unchanged(self); 10288 10289 error: 10290 if (srelease && sbuf) 10291 PyMem_FREE(sbuf); 10292 if (release1 && buf1) 10293 PyMem_FREE(buf1); 10294 if (release2 && buf2) 10295 PyMem_FREE(buf2); 10296 return NULL; 10297} 10298 10299/* --- Unicode Object Methods --------------------------------------------- */ 10300 10301PyDoc_STRVAR(title__doc__, 10302 "S.title() -> str\n\ 10303\n\ 10304Return a titlecased version of S, i.e. words start with title case\n\ 10305characters, all remaining cased characters have lower case."); 10306 10307static PyObject* 10308unicode_title(PyObject *self) 10309{ 10310 if (PyUnicode_READY(self) == -1) 10311 return NULL; 10312 return case_operation(self, do_title); 10313} 10314 10315PyDoc_STRVAR(capitalize__doc__, 10316 "S.capitalize() -> str\n\ 10317\n\ 10318Return a capitalized version of S, i.e. make the first character\n\ 10319have upper case and the rest lower case."); 10320 10321static PyObject* 10322unicode_capitalize(PyObject *self) 10323{ 10324 if (PyUnicode_READY(self) == -1) 10325 return NULL; 10326 if (PyUnicode_GET_LENGTH(self) == 0) 10327 return unicode_result_unchanged(self); 10328 return case_operation(self, do_capitalize); 10329} 10330 10331PyDoc_STRVAR(casefold__doc__, 10332 "S.casefold() -> str\n\ 10333\n\ 10334Return a version of S suitable for caseless comparisons."); 10335 10336static PyObject * 10337unicode_casefold(PyObject *self) 10338{ 10339 if (PyUnicode_READY(self) == -1) 10340 return NULL; 10341 if (PyUnicode_IS_ASCII(self)) 10342 return ascii_upper_or_lower(self, 1); 10343 return case_operation(self, do_casefold); 10344} 10345 10346 10347/* Argument converter. Coerces to a single unicode character */ 10348 10349static int 10350convert_uc(PyObject *obj, void *addr) 10351{ 10352 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10353 PyObject *uniobj; 10354 10355 uniobj = PyUnicode_FromObject(obj); 10356 if (uniobj == NULL) { 10357 PyErr_SetString(PyExc_TypeError, 10358 "The fill character cannot be converted to Unicode"); 10359 return 0; 10360 } 10361 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10362 PyErr_SetString(PyExc_TypeError, 10363 "The fill character must be exactly one character long"); 10364 Py_DECREF(uniobj); 10365 return 0; 10366 } 10367 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10368 Py_DECREF(uniobj); 10369 return 1; 10370} 10371 10372PyDoc_STRVAR(center__doc__, 10373 "S.center(width[, fillchar]) -> str\n\ 10374\n\ 10375Return S centered in a string of length width. Padding is\n\ 10376done using the specified fill character (default is a space)"); 10377 10378static PyObject * 10379unicode_center(PyObject *self, PyObject *args) 10380{ 10381 Py_ssize_t marg, left; 10382 Py_ssize_t width; 10383 Py_UCS4 fillchar = ' '; 10384 10385 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10386 return NULL; 10387 10388 if (PyUnicode_READY(self) == -1) 10389 return NULL; 10390 10391 if (PyUnicode_GET_LENGTH(self) >= width) 10392 return unicode_result_unchanged(self); 10393 10394 marg = width - PyUnicode_GET_LENGTH(self); 10395 left = marg / 2 + (marg & width & 1); 10396 10397 return pad(self, left, marg - left, fillchar); 10398} 10399 10400/* This function assumes that str1 and str2 are readied by the caller. */ 10401 10402static int 10403unicode_compare(PyObject *str1, PyObject *str2) 10404{ 10405 int kind1, kind2; 10406 void *data1, *data2; 10407 Py_ssize_t len1, len2, i; 10408 10409 kind1 = PyUnicode_KIND(str1); 10410 kind2 = PyUnicode_KIND(str2); 10411 data1 = PyUnicode_DATA(str1); 10412 data2 = PyUnicode_DATA(str2); 10413 len1 = PyUnicode_GET_LENGTH(str1); 10414 len2 = PyUnicode_GET_LENGTH(str2); 10415 10416 for (i = 0; i < len1 && i < len2; ++i) { 10417 Py_UCS4 c1, c2; 10418 c1 = PyUnicode_READ(kind1, data1, i); 10419 c2 = PyUnicode_READ(kind2, data2, i); 10420 10421 if (c1 != c2) 10422 return (c1 < c2) ? -1 : 1; 10423 } 10424 10425 return (len1 < len2) ? -1 : (len1 != len2); 10426} 10427 10428int 10429PyUnicode_Compare(PyObject *left, PyObject *right) 10430{ 10431 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10432 if (PyUnicode_READY(left) == -1 || 10433 PyUnicode_READY(right) == -1) 10434 return -1; 10435 return unicode_compare(left, right); 10436 } 10437 PyErr_Format(PyExc_TypeError, 10438 "Can't compare %.100s and %.100s", 10439 left->ob_type->tp_name, 10440 right->ob_type->tp_name); 10441 return -1; 10442} 10443 10444int 10445PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10446{ 10447 Py_ssize_t i; 10448 int kind; 10449 void *data; 10450 Py_UCS4 chr; 10451 10452 assert(_PyUnicode_CHECK(uni)); 10453 if (PyUnicode_READY(uni) == -1) 10454 return -1; 10455 kind = PyUnicode_KIND(uni); 10456 data = PyUnicode_DATA(uni); 10457 /* Compare Unicode string and source character set string */ 10458 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10459 if (chr != str[i]) 10460 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10461 /* This check keeps Python strings that end in '\0' from comparing equal 10462 to C strings identical up to that point. */ 10463 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10464 return 1; /* uni is longer */ 10465 if (str[i]) 10466 return -1; /* str is longer */ 10467 return 0; 10468} 10469 10470 10471#define TEST_COND(cond) \ 10472 ((cond) ? Py_True : Py_False) 10473 10474PyObject * 10475PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10476{ 10477 int result; 10478 10479 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10480 PyObject *v; 10481 if (PyUnicode_READY(left) == -1 || 10482 PyUnicode_READY(right) == -1) 10483 return NULL; 10484 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 10485 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 10486 if (op == Py_EQ) { 10487 Py_INCREF(Py_False); 10488 return Py_False; 10489 } 10490 if (op == Py_NE) { 10491 Py_INCREF(Py_True); 10492 return Py_True; 10493 } 10494 } 10495 if (left == right) 10496 result = 0; 10497 else 10498 result = unicode_compare(left, right); 10499 10500 /* Convert the return value to a Boolean */ 10501 switch (op) { 10502 case Py_EQ: 10503 v = TEST_COND(result == 0); 10504 break; 10505 case Py_NE: 10506 v = TEST_COND(result != 0); 10507 break; 10508 case Py_LE: 10509 v = TEST_COND(result <= 0); 10510 break; 10511 case Py_GE: 10512 v = TEST_COND(result >= 0); 10513 break; 10514 case Py_LT: 10515 v = TEST_COND(result == -1); 10516 break; 10517 case Py_GT: 10518 v = TEST_COND(result == 1); 10519 break; 10520 default: 10521 PyErr_BadArgument(); 10522 return NULL; 10523 } 10524 Py_INCREF(v); 10525 return v; 10526 } 10527 10528 Py_RETURN_NOTIMPLEMENTED; 10529} 10530 10531int 10532PyUnicode_Contains(PyObject *container, PyObject *element) 10533{ 10534 PyObject *str, *sub; 10535 int kind1, kind2, kind; 10536 void *buf1, *buf2; 10537 Py_ssize_t len1, len2; 10538 int result; 10539 10540 /* Coerce the two arguments */ 10541 sub = PyUnicode_FromObject(element); 10542 if (!sub) { 10543 PyErr_Format(PyExc_TypeError, 10544 "'in <string>' requires string as left operand, not %s", 10545 element->ob_type->tp_name); 10546 return -1; 10547 } 10548 10549 str = PyUnicode_FromObject(container); 10550 if (!str) { 10551 Py_DECREF(sub); 10552 return -1; 10553 } 10554 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 10555 Py_DECREF(sub); 10556 Py_DECREF(str); 10557 } 10558 10559 kind1 = PyUnicode_KIND(str); 10560 kind2 = PyUnicode_KIND(sub); 10561 kind = kind1; 10562 buf1 = PyUnicode_DATA(str); 10563 buf2 = PyUnicode_DATA(sub); 10564 if (kind2 != kind) { 10565 if (kind2 > kind) { 10566 Py_DECREF(sub); 10567 Py_DECREF(str); 10568 return 0; 10569 } 10570 buf2 = _PyUnicode_AsKind(sub, kind); 10571 } 10572 if (!buf2) { 10573 Py_DECREF(sub); 10574 Py_DECREF(str); 10575 return -1; 10576 } 10577 len1 = PyUnicode_GET_LENGTH(str); 10578 len2 = PyUnicode_GET_LENGTH(sub); 10579 10580 switch (kind) { 10581 case PyUnicode_1BYTE_KIND: 10582 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10583 break; 10584 case PyUnicode_2BYTE_KIND: 10585 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10586 break; 10587 case PyUnicode_4BYTE_KIND: 10588 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10589 break; 10590 default: 10591 result = -1; 10592 assert(0); 10593 } 10594 10595 Py_DECREF(str); 10596 Py_DECREF(sub); 10597 10598 if (kind2 != kind) 10599 PyMem_Free(buf2); 10600 10601 return result; 10602} 10603 10604/* Concat to string or Unicode object giving a new Unicode object. */ 10605 10606PyObject * 10607PyUnicode_Concat(PyObject *left, PyObject *right) 10608{ 10609 PyObject *u = NULL, *v = NULL, *w; 10610 Py_UCS4 maxchar, maxchar2; 10611 Py_ssize_t u_len, v_len, new_len; 10612 10613 /* Coerce the two arguments */ 10614 u = PyUnicode_FromObject(left); 10615 if (u == NULL) 10616 goto onError; 10617 v = PyUnicode_FromObject(right); 10618 if (v == NULL) 10619 goto onError; 10620 10621 /* Shortcuts */ 10622 if (v == unicode_empty) { 10623 Py_DECREF(v); 10624 return u; 10625 } 10626 if (u == unicode_empty) { 10627 Py_DECREF(u); 10628 return v; 10629 } 10630 10631 u_len = PyUnicode_GET_LENGTH(u); 10632 v_len = PyUnicode_GET_LENGTH(v); 10633 if (u_len > PY_SSIZE_T_MAX - v_len) { 10634 PyErr_SetString(PyExc_OverflowError, 10635 "strings are too large to concat"); 10636 goto onError; 10637 } 10638 new_len = u_len + v_len; 10639 10640 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10641 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 10642 maxchar = MAX_MAXCHAR(maxchar, maxchar2); 10643 10644 /* Concat the two Unicode strings */ 10645 w = PyUnicode_New(new_len, maxchar); 10646 if (w == NULL) 10647 goto onError; 10648 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 10649 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 10650 Py_DECREF(u); 10651 Py_DECREF(v); 10652 assert(_PyUnicode_CheckConsistency(w, 1)); 10653 return w; 10654 10655 onError: 10656 Py_XDECREF(u); 10657 Py_XDECREF(v); 10658 return NULL; 10659} 10660 10661void 10662PyUnicode_Append(PyObject **p_left, PyObject *right) 10663{ 10664 PyObject *left, *res; 10665 Py_UCS4 maxchar, maxchar2; 10666 Py_ssize_t left_len, right_len, new_len; 10667 10668 if (p_left == NULL) { 10669 if (!PyErr_Occurred()) 10670 PyErr_BadInternalCall(); 10671 return; 10672 } 10673 left = *p_left; 10674 if (right == NULL || !PyUnicode_Check(left)) { 10675 if (!PyErr_Occurred()) 10676 PyErr_BadInternalCall(); 10677 goto error; 10678 } 10679 10680 if (PyUnicode_READY(left) == -1) 10681 goto error; 10682 if (PyUnicode_READY(right) == -1) 10683 goto error; 10684 10685 /* Shortcuts */ 10686 if (left == unicode_empty) { 10687 Py_DECREF(left); 10688 Py_INCREF(right); 10689 *p_left = right; 10690 return; 10691 } 10692 if (right == unicode_empty) 10693 return; 10694 10695 left_len = PyUnicode_GET_LENGTH(left); 10696 right_len = PyUnicode_GET_LENGTH(right); 10697 if (left_len > PY_SSIZE_T_MAX - right_len) { 10698 PyErr_SetString(PyExc_OverflowError, 10699 "strings are too large to concat"); 10700 goto error; 10701 } 10702 new_len = left_len + right_len; 10703 10704 if (unicode_modifiable(left) 10705 && PyUnicode_CheckExact(right) 10706 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 10707 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10708 to change the structure size, but characters are stored just after 10709 the structure, and so it requires to move all characters which is 10710 not so different than duplicating the string. */ 10711 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10712 { 10713 /* append inplace */ 10714 if (unicode_resize(p_left, new_len) != 0) { 10715 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10716 * deallocated so it cannot be put back into 10717 * 'variable'. The MemoryError is raised when there 10718 * is no value in 'variable', which might (very 10719 * remotely) be a cause of incompatibilities. 10720 */ 10721 goto error; 10722 } 10723 /* copy 'right' into the newly allocated area of 'left' */ 10724 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 10725 } 10726 else { 10727 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 10728 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 10729 maxchar = MAX_MAXCHAR(maxchar, maxchar2); 10730 10731 /* Concat the two Unicode strings */ 10732 res = PyUnicode_New(new_len, maxchar); 10733 if (res == NULL) 10734 goto error; 10735 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 10736 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 10737 Py_DECREF(left); 10738 *p_left = res; 10739 } 10740 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 10741 return; 10742 10743error: 10744 Py_CLEAR(*p_left); 10745} 10746 10747void 10748PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10749{ 10750 PyUnicode_Append(pleft, right); 10751 Py_XDECREF(right); 10752} 10753 10754PyDoc_STRVAR(count__doc__, 10755 "S.count(sub[, start[, end]]) -> int\n\ 10756\n\ 10757Return the number of non-overlapping occurrences of substring sub in\n\ 10758string S[start:end]. Optional arguments start and end are\n\ 10759interpreted as in slice notation."); 10760 10761static PyObject * 10762unicode_count(PyObject *self, PyObject *args) 10763{ 10764 PyObject *substring; 10765 Py_ssize_t start = 0; 10766 Py_ssize_t end = PY_SSIZE_T_MAX; 10767 PyObject *result; 10768 int kind1, kind2, kind; 10769 void *buf1, *buf2; 10770 Py_ssize_t len1, len2, iresult; 10771 10772 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10773 &start, &end)) 10774 return NULL; 10775 10776 kind1 = PyUnicode_KIND(self); 10777 kind2 = PyUnicode_KIND(substring); 10778 if (kind2 > kind1) 10779 return PyLong_FromLong(0); 10780 kind = kind1; 10781 buf1 = PyUnicode_DATA(self); 10782 buf2 = PyUnicode_DATA(substring); 10783 if (kind2 != kind) 10784 buf2 = _PyUnicode_AsKind(substring, kind); 10785 if (!buf2) { 10786 Py_DECREF(substring); 10787 return NULL; 10788 } 10789 len1 = PyUnicode_GET_LENGTH(self); 10790 len2 = PyUnicode_GET_LENGTH(substring); 10791 10792 ADJUST_INDICES(start, end, len1); 10793 switch (kind) { 10794 case PyUnicode_1BYTE_KIND: 10795 iresult = ucs1lib_count( 10796 ((Py_UCS1*)buf1) + start, end - start, 10797 buf2, len2, PY_SSIZE_T_MAX 10798 ); 10799 break; 10800 case PyUnicode_2BYTE_KIND: 10801 iresult = ucs2lib_count( 10802 ((Py_UCS2*)buf1) + start, end - start, 10803 buf2, len2, PY_SSIZE_T_MAX 10804 ); 10805 break; 10806 case PyUnicode_4BYTE_KIND: 10807 iresult = ucs4lib_count( 10808 ((Py_UCS4*)buf1) + start, end - start, 10809 buf2, len2, PY_SSIZE_T_MAX 10810 ); 10811 break; 10812 default: 10813 assert(0); iresult = 0; 10814 } 10815 10816 result = PyLong_FromSsize_t(iresult); 10817 10818 if (kind2 != kind) 10819 PyMem_Free(buf2); 10820 10821 Py_DECREF(substring); 10822 10823 return result; 10824} 10825 10826PyDoc_STRVAR(encode__doc__, 10827 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10828\n\ 10829Encode S using the codec registered for encoding. Default encoding\n\ 10830is 'utf-8'. errors may be given to set a different error\n\ 10831handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10832a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10833'xmlcharrefreplace' as well as any other name registered with\n\ 10834codecs.register_error that can handle UnicodeEncodeErrors."); 10835 10836static PyObject * 10837unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 10838{ 10839 static char *kwlist[] = {"encoding", "errors", 0}; 10840 char *encoding = NULL; 10841 char *errors = NULL; 10842 10843 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10844 kwlist, &encoding, &errors)) 10845 return NULL; 10846 return PyUnicode_AsEncodedString(self, encoding, errors); 10847} 10848 10849PyDoc_STRVAR(expandtabs__doc__, 10850 "S.expandtabs([tabsize]) -> str\n\ 10851\n\ 10852Return a copy of S where all tab characters are expanded using spaces.\n\ 10853If tabsize is not given, a tab size of 8 characters is assumed."); 10854 10855static PyObject* 10856unicode_expandtabs(PyObject *self, PyObject *args) 10857{ 10858 Py_ssize_t i, j, line_pos, src_len, incr; 10859 Py_UCS4 ch; 10860 PyObject *u; 10861 void *src_data, *dest_data; 10862 int tabsize = 8; 10863 int kind; 10864 int found; 10865 10866 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10867 return NULL; 10868 10869 if (PyUnicode_READY(self) == -1) 10870 return NULL; 10871 10872 /* First pass: determine size of output string */ 10873 src_len = PyUnicode_GET_LENGTH(self); 10874 i = j = line_pos = 0; 10875 kind = PyUnicode_KIND(self); 10876 src_data = PyUnicode_DATA(self); 10877 found = 0; 10878 for (; i < src_len; i++) { 10879 ch = PyUnicode_READ(kind, src_data, i); 10880 if (ch == '\t') { 10881 found = 1; 10882 if (tabsize > 0) { 10883 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10884 if (j > PY_SSIZE_T_MAX - incr) 10885 goto overflow; 10886 line_pos += incr; 10887 j += incr; 10888 } 10889 } 10890 else { 10891 if (j > PY_SSIZE_T_MAX - 1) 10892 goto overflow; 10893 line_pos++; 10894 j++; 10895 if (ch == '\n' || ch == '\r') 10896 line_pos = 0; 10897 } 10898 } 10899 if (!found) 10900 return unicode_result_unchanged(self); 10901 10902 /* Second pass: create output string and fill it */ 10903 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10904 if (!u) 10905 return NULL; 10906 dest_data = PyUnicode_DATA(u); 10907 10908 i = j = line_pos = 0; 10909 10910 for (; i < src_len; i++) { 10911 ch = PyUnicode_READ(kind, src_data, i); 10912 if (ch == '\t') { 10913 if (tabsize > 0) { 10914 incr = tabsize - (line_pos % tabsize); 10915 line_pos += incr; 10916 FILL(kind, dest_data, ' ', j, incr); 10917 j += incr; 10918 } 10919 } 10920 else { 10921 line_pos++; 10922 PyUnicode_WRITE(kind, dest_data, j, ch); 10923 j++; 10924 if (ch == '\n' || ch == '\r') 10925 line_pos = 0; 10926 } 10927 } 10928 assert (j == PyUnicode_GET_LENGTH(u)); 10929 return unicode_result(u); 10930 10931 overflow: 10932 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10933 return NULL; 10934} 10935 10936PyDoc_STRVAR(find__doc__, 10937 "S.find(sub[, start[, end]]) -> int\n\ 10938\n\ 10939Return the lowest index in S where substring sub is found,\n\ 10940such that sub is contained within S[start:end]. Optional\n\ 10941arguments start and end are interpreted as in slice notation.\n\ 10942\n\ 10943Return -1 on failure."); 10944 10945static PyObject * 10946unicode_find(PyObject *self, PyObject *args) 10947{ 10948 PyObject *substring; 10949 Py_ssize_t start; 10950 Py_ssize_t end; 10951 Py_ssize_t result; 10952 10953 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10954 &start, &end)) 10955 return NULL; 10956 10957 if (PyUnicode_READY(self) == -1) 10958 return NULL; 10959 if (PyUnicode_READY(substring) == -1) 10960 return NULL; 10961 10962 result = any_find_slice(1, self, substring, start, end); 10963 10964 Py_DECREF(substring); 10965 10966 if (result == -2) 10967 return NULL; 10968 10969 return PyLong_FromSsize_t(result); 10970} 10971 10972static PyObject * 10973unicode_getitem(PyObject *self, Py_ssize_t index) 10974{ 10975 void *data; 10976 enum PyUnicode_Kind kind; 10977 Py_UCS4 ch; 10978 PyObject *res; 10979 10980 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 10981 PyErr_BadArgument(); 10982 return NULL; 10983 } 10984 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 10985 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10986 return NULL; 10987 } 10988 kind = PyUnicode_KIND(self); 10989 data = PyUnicode_DATA(self); 10990 ch = PyUnicode_READ(kind, data, index); 10991 if (ch < 256) 10992 return get_latin1_char(ch); 10993 10994 res = PyUnicode_New(1, ch); 10995 if (res == NULL) 10996 return NULL; 10997 kind = PyUnicode_KIND(res); 10998 data = PyUnicode_DATA(res); 10999 PyUnicode_WRITE(kind, data, 0, ch); 11000 assert(_PyUnicode_CheckConsistency(res, 1)); 11001 return res; 11002} 11003 11004/* Believe it or not, this produces the same value for ASCII strings 11005 as bytes_hash(). */ 11006static Py_hash_t 11007unicode_hash(PyObject *self) 11008{ 11009 Py_ssize_t len; 11010 Py_uhash_t x; 11011 11012#ifdef Py_DEBUG 11013 assert(_Py_HashSecret_Initialized); 11014#endif 11015 if (_PyUnicode_HASH(self) != -1) 11016 return _PyUnicode_HASH(self); 11017 if (PyUnicode_READY(self) == -1) 11018 return -1; 11019 len = PyUnicode_GET_LENGTH(self); 11020 /* 11021 We make the hash of the empty string be 0, rather than using 11022 (prefix ^ suffix), since this slightly obfuscates the hash secret 11023 */ 11024 if (len == 0) { 11025 _PyUnicode_HASH(self) = 0; 11026 return 0; 11027 } 11028 11029 /* The hash function as a macro, gets expanded three times below. */ 11030#define HASH(P) \ 11031 x ^= (Py_uhash_t) *P << 7; \ 11032 while (--len >= 0) \ 11033 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \ 11034 11035 x = (Py_uhash_t) _Py_HashSecret.prefix; 11036 switch (PyUnicode_KIND(self)) { 11037 case PyUnicode_1BYTE_KIND: { 11038 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 11039 HASH(c); 11040 break; 11041 } 11042 case PyUnicode_2BYTE_KIND: { 11043 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 11044 HASH(s); 11045 break; 11046 } 11047 default: { 11048 Py_UCS4 *l; 11049 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 11050 "Impossible switch case in unicode_hash"); 11051 l = PyUnicode_4BYTE_DATA(self); 11052 HASH(l); 11053 break; 11054 } 11055 } 11056 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self); 11057 x ^= (Py_uhash_t) _Py_HashSecret.suffix; 11058 11059 if (x == -1) 11060 x = -2; 11061 _PyUnicode_HASH(self) = x; 11062 return x; 11063} 11064#undef HASH 11065 11066PyDoc_STRVAR(index__doc__, 11067 "S.index(sub[, start[, end]]) -> int\n\ 11068\n\ 11069Like S.find() but raise ValueError when the substring is not found."); 11070 11071static PyObject * 11072unicode_index(PyObject *self, PyObject *args) 11073{ 11074 Py_ssize_t result; 11075 PyObject *substring; 11076 Py_ssize_t start; 11077 Py_ssize_t end; 11078 11079 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11080 &start, &end)) 11081 return NULL; 11082 11083 if (PyUnicode_READY(self) == -1) 11084 return NULL; 11085 if (PyUnicode_READY(substring) == -1) 11086 return NULL; 11087 11088 result = any_find_slice(1, self, substring, start, end); 11089 11090 Py_DECREF(substring); 11091 11092 if (result == -2) 11093 return NULL; 11094 11095 if (result < 0) { 11096 PyErr_SetString(PyExc_ValueError, "substring not found"); 11097 return NULL; 11098 } 11099 11100 return PyLong_FromSsize_t(result); 11101} 11102 11103PyDoc_STRVAR(islower__doc__, 11104 "S.islower() -> bool\n\ 11105\n\ 11106Return True if all cased characters in S are lowercase and there is\n\ 11107at least one cased character in S, False otherwise."); 11108 11109static PyObject* 11110unicode_islower(PyObject *self) 11111{ 11112 Py_ssize_t i, length; 11113 int kind; 11114 void *data; 11115 int cased; 11116 11117 if (PyUnicode_READY(self) == -1) 11118 return NULL; 11119 length = PyUnicode_GET_LENGTH(self); 11120 kind = PyUnicode_KIND(self); 11121 data = PyUnicode_DATA(self); 11122 11123 /* Shortcut for single character strings */ 11124 if (length == 1) 11125 return PyBool_FromLong( 11126 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11127 11128 /* Special case for empty strings */ 11129 if (length == 0) 11130 return PyBool_FromLong(0); 11131 11132 cased = 0; 11133 for (i = 0; i < length; i++) { 11134 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11135 11136 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11137 return PyBool_FromLong(0); 11138 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11139 cased = 1; 11140 } 11141 return PyBool_FromLong(cased); 11142} 11143 11144PyDoc_STRVAR(isupper__doc__, 11145 "S.isupper() -> bool\n\ 11146\n\ 11147Return True if all cased characters in S are uppercase and there is\n\ 11148at least one cased character in S, False otherwise."); 11149 11150static PyObject* 11151unicode_isupper(PyObject *self) 11152{ 11153 Py_ssize_t i, length; 11154 int kind; 11155 void *data; 11156 int cased; 11157 11158 if (PyUnicode_READY(self) == -1) 11159 return NULL; 11160 length = PyUnicode_GET_LENGTH(self); 11161 kind = PyUnicode_KIND(self); 11162 data = PyUnicode_DATA(self); 11163 11164 /* Shortcut for single character strings */ 11165 if (length == 1) 11166 return PyBool_FromLong( 11167 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11168 11169 /* Special case for empty strings */ 11170 if (length == 0) 11171 return PyBool_FromLong(0); 11172 11173 cased = 0; 11174 for (i = 0; i < length; i++) { 11175 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11176 11177 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11178 return PyBool_FromLong(0); 11179 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11180 cased = 1; 11181 } 11182 return PyBool_FromLong(cased); 11183} 11184 11185PyDoc_STRVAR(istitle__doc__, 11186 "S.istitle() -> bool\n\ 11187\n\ 11188Return True if S is a titlecased string and there is at least one\n\ 11189character in S, i.e. upper- and titlecase characters may only\n\ 11190follow uncased characters and lowercase characters only cased ones.\n\ 11191Return False otherwise."); 11192 11193static PyObject* 11194unicode_istitle(PyObject *self) 11195{ 11196 Py_ssize_t i, length; 11197 int kind; 11198 void *data; 11199 int cased, previous_is_cased; 11200 11201 if (PyUnicode_READY(self) == -1) 11202 return NULL; 11203 length = PyUnicode_GET_LENGTH(self); 11204 kind = PyUnicode_KIND(self); 11205 data = PyUnicode_DATA(self); 11206 11207 /* Shortcut for single character strings */ 11208 if (length == 1) { 11209 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11210 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11211 (Py_UNICODE_ISUPPER(ch) != 0)); 11212 } 11213 11214 /* Special case for empty strings */ 11215 if (length == 0) 11216 return PyBool_FromLong(0); 11217 11218 cased = 0; 11219 previous_is_cased = 0; 11220 for (i = 0; i < length; i++) { 11221 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11222 11223 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11224 if (previous_is_cased) 11225 return PyBool_FromLong(0); 11226 previous_is_cased = 1; 11227 cased = 1; 11228 } 11229 else if (Py_UNICODE_ISLOWER(ch)) { 11230 if (!previous_is_cased) 11231 return PyBool_FromLong(0); 11232 previous_is_cased = 1; 11233 cased = 1; 11234 } 11235 else 11236 previous_is_cased = 0; 11237 } 11238 return PyBool_FromLong(cased); 11239} 11240 11241PyDoc_STRVAR(isspace__doc__, 11242 "S.isspace() -> bool\n\ 11243\n\ 11244Return True if all characters in S are whitespace\n\ 11245and there is at least one character in S, False otherwise."); 11246 11247static PyObject* 11248unicode_isspace(PyObject *self) 11249{ 11250 Py_ssize_t i, length; 11251 int kind; 11252 void *data; 11253 11254 if (PyUnicode_READY(self) == -1) 11255 return NULL; 11256 length = PyUnicode_GET_LENGTH(self); 11257 kind = PyUnicode_KIND(self); 11258 data = PyUnicode_DATA(self); 11259 11260 /* Shortcut for single character strings */ 11261 if (length == 1) 11262 return PyBool_FromLong( 11263 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11264 11265 /* Special case for empty strings */ 11266 if (length == 0) 11267 return PyBool_FromLong(0); 11268 11269 for (i = 0; i < length; i++) { 11270 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11271 if (!Py_UNICODE_ISSPACE(ch)) 11272 return PyBool_FromLong(0); 11273 } 11274 return PyBool_FromLong(1); 11275} 11276 11277PyDoc_STRVAR(isalpha__doc__, 11278 "S.isalpha() -> bool\n\ 11279\n\ 11280Return True if all characters in S are alphabetic\n\ 11281and there is at least one character in S, False otherwise."); 11282 11283static PyObject* 11284unicode_isalpha(PyObject *self) 11285{ 11286 Py_ssize_t i, length; 11287 int kind; 11288 void *data; 11289 11290 if (PyUnicode_READY(self) == -1) 11291 return NULL; 11292 length = PyUnicode_GET_LENGTH(self); 11293 kind = PyUnicode_KIND(self); 11294 data = PyUnicode_DATA(self); 11295 11296 /* Shortcut for single character strings */ 11297 if (length == 1) 11298 return PyBool_FromLong( 11299 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11300 11301 /* Special case for empty strings */ 11302 if (length == 0) 11303 return PyBool_FromLong(0); 11304 11305 for (i = 0; i < length; i++) { 11306 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11307 return PyBool_FromLong(0); 11308 } 11309 return PyBool_FromLong(1); 11310} 11311 11312PyDoc_STRVAR(isalnum__doc__, 11313 "S.isalnum() -> bool\n\ 11314\n\ 11315Return True if all characters in S are alphanumeric\n\ 11316and there is at least one character in S, False otherwise."); 11317 11318static PyObject* 11319unicode_isalnum(PyObject *self) 11320{ 11321 int kind; 11322 void *data; 11323 Py_ssize_t len, i; 11324 11325 if (PyUnicode_READY(self) == -1) 11326 return NULL; 11327 11328 kind = PyUnicode_KIND(self); 11329 data = PyUnicode_DATA(self); 11330 len = PyUnicode_GET_LENGTH(self); 11331 11332 /* Shortcut for single character strings */ 11333 if (len == 1) { 11334 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11335 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11336 } 11337 11338 /* Special case for empty strings */ 11339 if (len == 0) 11340 return PyBool_FromLong(0); 11341 11342 for (i = 0; i < len; i++) { 11343 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11344 if (!Py_UNICODE_ISALNUM(ch)) 11345 return PyBool_FromLong(0); 11346 } 11347 return PyBool_FromLong(1); 11348} 11349 11350PyDoc_STRVAR(isdecimal__doc__, 11351 "S.isdecimal() -> bool\n\ 11352\n\ 11353Return True if there are only decimal characters in S,\n\ 11354False otherwise."); 11355 11356static PyObject* 11357unicode_isdecimal(PyObject *self) 11358{ 11359 Py_ssize_t i, length; 11360 int kind; 11361 void *data; 11362 11363 if (PyUnicode_READY(self) == -1) 11364 return NULL; 11365 length = PyUnicode_GET_LENGTH(self); 11366 kind = PyUnicode_KIND(self); 11367 data = PyUnicode_DATA(self); 11368 11369 /* Shortcut for single character strings */ 11370 if (length == 1) 11371 return PyBool_FromLong( 11372 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11373 11374 /* Special case for empty strings */ 11375 if (length == 0) 11376 return PyBool_FromLong(0); 11377 11378 for (i = 0; i < length; i++) { 11379 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11380 return PyBool_FromLong(0); 11381 } 11382 return PyBool_FromLong(1); 11383} 11384 11385PyDoc_STRVAR(isdigit__doc__, 11386 "S.isdigit() -> bool\n\ 11387\n\ 11388Return True if all characters in S are digits\n\ 11389and there is at least one character in S, False otherwise."); 11390 11391static PyObject* 11392unicode_isdigit(PyObject *self) 11393{ 11394 Py_ssize_t i, length; 11395 int kind; 11396 void *data; 11397 11398 if (PyUnicode_READY(self) == -1) 11399 return NULL; 11400 length = PyUnicode_GET_LENGTH(self); 11401 kind = PyUnicode_KIND(self); 11402 data = PyUnicode_DATA(self); 11403 11404 /* Shortcut for single character strings */ 11405 if (length == 1) { 11406 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11407 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11408 } 11409 11410 /* Special case for empty strings */ 11411 if (length == 0) 11412 return PyBool_FromLong(0); 11413 11414 for (i = 0; i < length; i++) { 11415 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11416 return PyBool_FromLong(0); 11417 } 11418 return PyBool_FromLong(1); 11419} 11420 11421PyDoc_STRVAR(isnumeric__doc__, 11422 "S.isnumeric() -> bool\n\ 11423\n\ 11424Return True if there are only numeric characters in S,\n\ 11425False otherwise."); 11426 11427static PyObject* 11428unicode_isnumeric(PyObject *self) 11429{ 11430 Py_ssize_t i, length; 11431 int kind; 11432 void *data; 11433 11434 if (PyUnicode_READY(self) == -1) 11435 return NULL; 11436 length = PyUnicode_GET_LENGTH(self); 11437 kind = PyUnicode_KIND(self); 11438 data = PyUnicode_DATA(self); 11439 11440 /* Shortcut for single character strings */ 11441 if (length == 1) 11442 return PyBool_FromLong( 11443 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11444 11445 /* Special case for empty strings */ 11446 if (length == 0) 11447 return PyBool_FromLong(0); 11448 11449 for (i = 0; i < length; i++) { 11450 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11451 return PyBool_FromLong(0); 11452 } 11453 return PyBool_FromLong(1); 11454} 11455 11456int 11457PyUnicode_IsIdentifier(PyObject *self) 11458{ 11459 int kind; 11460 void *data; 11461 Py_ssize_t i; 11462 Py_UCS4 first; 11463 11464 if (PyUnicode_READY(self) == -1) { 11465 Py_FatalError("identifier not ready"); 11466 return 0; 11467 } 11468 11469 /* Special case for empty strings */ 11470 if (PyUnicode_GET_LENGTH(self) == 0) 11471 return 0; 11472 kind = PyUnicode_KIND(self); 11473 data = PyUnicode_DATA(self); 11474 11475 /* PEP 3131 says that the first character must be in 11476 XID_Start and subsequent characters in XID_Continue, 11477 and for the ASCII range, the 2.x rules apply (i.e 11478 start with letters and underscore, continue with 11479 letters, digits, underscore). However, given the current 11480 definition of XID_Start and XID_Continue, it is sufficient 11481 to check just for these, except that _ must be allowed 11482 as starting an identifier. */ 11483 first = PyUnicode_READ(kind, data, 0); 11484 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11485 return 0; 11486 11487 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11488 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11489 return 0; 11490 return 1; 11491} 11492 11493PyDoc_STRVAR(isidentifier__doc__, 11494 "S.isidentifier() -> bool\n\ 11495\n\ 11496Return True if S is a valid identifier according\n\ 11497to the language definition."); 11498 11499static PyObject* 11500unicode_isidentifier(PyObject *self) 11501{ 11502 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11503} 11504 11505PyDoc_STRVAR(isprintable__doc__, 11506 "S.isprintable() -> bool\n\ 11507\n\ 11508Return True if all characters in S are considered\n\ 11509printable in repr() or S is empty, False otherwise."); 11510 11511static PyObject* 11512unicode_isprintable(PyObject *self) 11513{ 11514 Py_ssize_t i, length; 11515 int kind; 11516 void *data; 11517 11518 if (PyUnicode_READY(self) == -1) 11519 return NULL; 11520 length = PyUnicode_GET_LENGTH(self); 11521 kind = PyUnicode_KIND(self); 11522 data = PyUnicode_DATA(self); 11523 11524 /* Shortcut for single character strings */ 11525 if (length == 1) 11526 return PyBool_FromLong( 11527 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11528 11529 for (i = 0; i < length; i++) { 11530 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11531 Py_RETURN_FALSE; 11532 } 11533 } 11534 Py_RETURN_TRUE; 11535} 11536 11537PyDoc_STRVAR(join__doc__, 11538 "S.join(iterable) -> str\n\ 11539\n\ 11540Return a string which is the concatenation of the strings in the\n\ 11541iterable. The separator between elements is S."); 11542 11543static PyObject* 11544unicode_join(PyObject *self, PyObject *data) 11545{ 11546 return PyUnicode_Join(self, data); 11547} 11548 11549static Py_ssize_t 11550unicode_length(PyObject *self) 11551{ 11552 if (PyUnicode_READY(self) == -1) 11553 return -1; 11554 return PyUnicode_GET_LENGTH(self); 11555} 11556 11557PyDoc_STRVAR(ljust__doc__, 11558 "S.ljust(width[, fillchar]) -> str\n\ 11559\n\ 11560Return S left-justified in a Unicode string of length width. Padding is\n\ 11561done using the specified fill character (default is a space)."); 11562 11563static PyObject * 11564unicode_ljust(PyObject *self, PyObject *args) 11565{ 11566 Py_ssize_t width; 11567 Py_UCS4 fillchar = ' '; 11568 11569 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11570 return NULL; 11571 11572 if (PyUnicode_READY(self) == -1) 11573 return NULL; 11574 11575 if (PyUnicode_GET_LENGTH(self) >= width) 11576 return unicode_result_unchanged(self); 11577 11578 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11579} 11580 11581PyDoc_STRVAR(lower__doc__, 11582 "S.lower() -> str\n\ 11583\n\ 11584Return a copy of the string S converted to lowercase."); 11585 11586static PyObject* 11587unicode_lower(PyObject *self) 11588{ 11589 if (PyUnicode_READY(self) == -1) 11590 return NULL; 11591 if (PyUnicode_IS_ASCII(self)) 11592 return ascii_upper_or_lower(self, 1); 11593 return case_operation(self, do_lower); 11594} 11595 11596#define LEFTSTRIP 0 11597#define RIGHTSTRIP 1 11598#define BOTHSTRIP 2 11599 11600/* Arrays indexed by above */ 11601static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11602 11603#define STRIPNAME(i) (stripformat[i]+3) 11604 11605/* externally visible for str.strip(unicode) */ 11606PyObject * 11607_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11608{ 11609 void *data; 11610 int kind; 11611 Py_ssize_t i, j, len; 11612 BLOOM_MASK sepmask; 11613 11614 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11615 return NULL; 11616 11617 kind = PyUnicode_KIND(self); 11618 data = PyUnicode_DATA(self); 11619 len = PyUnicode_GET_LENGTH(self); 11620 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11621 PyUnicode_DATA(sepobj), 11622 PyUnicode_GET_LENGTH(sepobj)); 11623 11624 i = 0; 11625 if (striptype != RIGHTSTRIP) { 11626 while (i < len && 11627 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11628 i++; 11629 } 11630 } 11631 11632 j = len; 11633 if (striptype != LEFTSTRIP) { 11634 do { 11635 j--; 11636 } while (j >= i && 11637 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11638 j++; 11639 } 11640 11641 return PyUnicode_Substring(self, i, j); 11642} 11643 11644PyObject* 11645PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11646{ 11647 unsigned char *data; 11648 int kind; 11649 Py_ssize_t length; 11650 11651 if (PyUnicode_READY(self) == -1) 11652 return NULL; 11653 11654 length = PyUnicode_GET_LENGTH(self); 11655 end = Py_MIN(end, length); 11656 11657 if (start == 0 && end == length) 11658 return unicode_result_unchanged(self); 11659 11660 if (start < 0 || end < 0) { 11661 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11662 return NULL; 11663 } 11664 if (start >= length || end < start) { 11665 Py_INCREF(unicode_empty); 11666 return unicode_empty; 11667 } 11668 11669 length = end - start; 11670 if (PyUnicode_IS_ASCII(self)) { 11671 data = PyUnicode_1BYTE_DATA(self); 11672 return _PyUnicode_FromASCII((char*)(data + start), length); 11673 } 11674 else { 11675 kind = PyUnicode_KIND(self); 11676 data = PyUnicode_1BYTE_DATA(self); 11677 return PyUnicode_FromKindAndData(kind, 11678 data + kind * start, 11679 length); 11680 } 11681} 11682 11683static PyObject * 11684do_strip(PyObject *self, int striptype) 11685{ 11686 int kind; 11687 void *data; 11688 Py_ssize_t len, i, j; 11689 11690 if (PyUnicode_READY(self) == -1) 11691 return NULL; 11692 11693 kind = PyUnicode_KIND(self); 11694 data = PyUnicode_DATA(self); 11695 len = PyUnicode_GET_LENGTH(self); 11696 11697 i = 0; 11698 if (striptype != RIGHTSTRIP) { 11699 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11700 i++; 11701 } 11702 } 11703 11704 j = len; 11705 if (striptype != LEFTSTRIP) { 11706 do { 11707 j--; 11708 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11709 j++; 11710 } 11711 11712 return PyUnicode_Substring(self, i, j); 11713} 11714 11715 11716static PyObject * 11717do_argstrip(PyObject *self, int striptype, PyObject *args) 11718{ 11719 PyObject *sep = NULL; 11720 11721 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11722 return NULL; 11723 11724 if (sep != NULL && sep != Py_None) { 11725 if (PyUnicode_Check(sep)) 11726 return _PyUnicode_XStrip(self, striptype, sep); 11727 else { 11728 PyErr_Format(PyExc_TypeError, 11729 "%s arg must be None or str", 11730 STRIPNAME(striptype)); 11731 return NULL; 11732 } 11733 } 11734 11735 return do_strip(self, striptype); 11736} 11737 11738 11739PyDoc_STRVAR(strip__doc__, 11740 "S.strip([chars]) -> str\n\ 11741\n\ 11742Return a copy of the string S with leading and trailing\n\ 11743whitespace removed.\n\ 11744If chars is given and not None, remove characters in chars instead."); 11745 11746static PyObject * 11747unicode_strip(PyObject *self, PyObject *args) 11748{ 11749 if (PyTuple_GET_SIZE(args) == 0) 11750 return do_strip(self, BOTHSTRIP); /* Common case */ 11751 else 11752 return do_argstrip(self, BOTHSTRIP, args); 11753} 11754 11755 11756PyDoc_STRVAR(lstrip__doc__, 11757 "S.lstrip([chars]) -> str\n\ 11758\n\ 11759Return a copy of the string S with leading whitespace removed.\n\ 11760If chars is given and not None, remove characters in chars instead."); 11761 11762static PyObject * 11763unicode_lstrip(PyObject *self, PyObject *args) 11764{ 11765 if (PyTuple_GET_SIZE(args) == 0) 11766 return do_strip(self, LEFTSTRIP); /* Common case */ 11767 else 11768 return do_argstrip(self, LEFTSTRIP, args); 11769} 11770 11771 11772PyDoc_STRVAR(rstrip__doc__, 11773 "S.rstrip([chars]) -> str\n\ 11774\n\ 11775Return a copy of the string S with trailing whitespace removed.\n\ 11776If chars is given and not None, remove characters in chars instead."); 11777 11778static PyObject * 11779unicode_rstrip(PyObject *self, PyObject *args) 11780{ 11781 if (PyTuple_GET_SIZE(args) == 0) 11782 return do_strip(self, RIGHTSTRIP); /* Common case */ 11783 else 11784 return do_argstrip(self, RIGHTSTRIP, args); 11785} 11786 11787 11788static PyObject* 11789unicode_repeat(PyObject *str, Py_ssize_t len) 11790{ 11791 PyObject *u; 11792 Py_ssize_t nchars, n; 11793 11794 if (len < 1) { 11795 Py_INCREF(unicode_empty); 11796 return unicode_empty; 11797 } 11798 11799 /* no repeat, return original string */ 11800 if (len == 1) 11801 return unicode_result_unchanged(str); 11802 11803 if (PyUnicode_READY(str) == -1) 11804 return NULL; 11805 11806 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11807 PyErr_SetString(PyExc_OverflowError, 11808 "repeated string is too long"); 11809 return NULL; 11810 } 11811 nchars = len * PyUnicode_GET_LENGTH(str); 11812 11813 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11814 if (!u) 11815 return NULL; 11816 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11817 11818 if (PyUnicode_GET_LENGTH(str) == 1) { 11819 const int kind = PyUnicode_KIND(str); 11820 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11821 if (kind == PyUnicode_1BYTE_KIND) { 11822 void *to = PyUnicode_DATA(u); 11823 memset(to, (unsigned char)fill_char, len); 11824 } 11825 else if (kind == PyUnicode_2BYTE_KIND) { 11826 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 11827 for (n = 0; n < len; ++n) 11828 ucs2[n] = fill_char; 11829 } else { 11830 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 11831 assert(kind == PyUnicode_4BYTE_KIND); 11832 for (n = 0; n < len; ++n) 11833 ucs4[n] = fill_char; 11834 } 11835 } 11836 else { 11837 /* number of characters copied this far */ 11838 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11839 const Py_ssize_t char_size = PyUnicode_KIND(str); 11840 char *to = (char *) PyUnicode_DATA(u); 11841 Py_MEMCPY(to, PyUnicode_DATA(str), 11842 PyUnicode_GET_LENGTH(str) * char_size); 11843 while (done < nchars) { 11844 n = (done <= nchars-done) ? done : nchars-done; 11845 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11846 done += n; 11847 } 11848 } 11849 11850 assert(_PyUnicode_CheckConsistency(u, 1)); 11851 return u; 11852} 11853 11854PyObject * 11855PyUnicode_Replace(PyObject *obj, 11856 PyObject *subobj, 11857 PyObject *replobj, 11858 Py_ssize_t maxcount) 11859{ 11860 PyObject *self; 11861 PyObject *str1; 11862 PyObject *str2; 11863 PyObject *result; 11864 11865 self = PyUnicode_FromObject(obj); 11866 if (self == NULL) 11867 return NULL; 11868 str1 = PyUnicode_FromObject(subobj); 11869 if (str1 == NULL) { 11870 Py_DECREF(self); 11871 return NULL; 11872 } 11873 str2 = PyUnicode_FromObject(replobj); 11874 if (str2 == NULL) { 11875 Py_DECREF(self); 11876 Py_DECREF(str1); 11877 return NULL; 11878 } 11879 if (PyUnicode_READY(self) == -1 || 11880 PyUnicode_READY(str1) == -1 || 11881 PyUnicode_READY(str2) == -1) 11882 result = NULL; 11883 else 11884 result = replace(self, str1, str2, maxcount); 11885 Py_DECREF(self); 11886 Py_DECREF(str1); 11887 Py_DECREF(str2); 11888 return result; 11889} 11890 11891PyDoc_STRVAR(replace__doc__, 11892 "S.replace(old, new[, count]) -> str\n\ 11893\n\ 11894Return a copy of S with all occurrences of substring\n\ 11895old replaced by new. If the optional argument count is\n\ 11896given, only the first count occurrences are replaced."); 11897 11898static PyObject* 11899unicode_replace(PyObject *self, PyObject *args) 11900{ 11901 PyObject *str1; 11902 PyObject *str2; 11903 Py_ssize_t maxcount = -1; 11904 PyObject *result; 11905 11906 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11907 return NULL; 11908 if (PyUnicode_READY(self) == -1) 11909 return NULL; 11910 str1 = PyUnicode_FromObject(str1); 11911 if (str1 == NULL) 11912 return NULL; 11913 str2 = PyUnicode_FromObject(str2); 11914 if (str2 == NULL) { 11915 Py_DECREF(str1); 11916 return NULL; 11917 } 11918 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) 11919 result = NULL; 11920 else 11921 result = replace(self, str1, str2, maxcount); 11922 11923 Py_DECREF(str1); 11924 Py_DECREF(str2); 11925 return result; 11926} 11927 11928static PyObject * 11929unicode_repr(PyObject *unicode) 11930{ 11931 PyObject *repr; 11932 Py_ssize_t isize; 11933 Py_ssize_t osize, squote, dquote, i, o; 11934 Py_UCS4 max, quote; 11935 int ikind, okind; 11936 void *idata, *odata; 11937 11938 if (PyUnicode_READY(unicode) == -1) 11939 return NULL; 11940 11941 isize = PyUnicode_GET_LENGTH(unicode); 11942 idata = PyUnicode_DATA(unicode); 11943 11944 /* Compute length of output, quote characters, and 11945 maximum character */ 11946 osize = 2; /* quotes */ 11947 max = 127; 11948 squote = dquote = 0; 11949 ikind = PyUnicode_KIND(unicode); 11950 for (i = 0; i < isize; i++) { 11951 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11952 switch (ch) { 11953 case '\'': squote++; osize++; break; 11954 case '"': dquote++; osize++; break; 11955 case '\\': case '\t': case '\r': case '\n': 11956 osize += 2; break; 11957 default: 11958 /* Fast-path ASCII */ 11959 if (ch < ' ' || ch == 0x7f) 11960 osize += 4; /* \xHH */ 11961 else if (ch < 0x7f) 11962 osize++; 11963 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11964 osize++; 11965 max = ch > max ? ch : max; 11966 } 11967 else if (ch < 0x100) 11968 osize += 4; /* \xHH */ 11969 else if (ch < 0x10000) 11970 osize += 6; /* \uHHHH */ 11971 else 11972 osize += 10; /* \uHHHHHHHH */ 11973 } 11974 } 11975 11976 quote = '\''; 11977 if (squote) { 11978 if (dquote) 11979 /* Both squote and dquote present. Use squote, 11980 and escape them */ 11981 osize += squote; 11982 else 11983 quote = '"'; 11984 } 11985 11986 repr = PyUnicode_New(osize, max); 11987 if (repr == NULL) 11988 return NULL; 11989 okind = PyUnicode_KIND(repr); 11990 odata = PyUnicode_DATA(repr); 11991 11992 PyUnicode_WRITE(okind, odata, 0, quote); 11993 PyUnicode_WRITE(okind, odata, osize-1, quote); 11994 11995 for (i = 0, o = 1; i < isize; i++) { 11996 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11997 11998 /* Escape quotes and backslashes */ 11999 if ((ch == quote) || (ch == '\\')) { 12000 PyUnicode_WRITE(okind, odata, o++, '\\'); 12001 PyUnicode_WRITE(okind, odata, o++, ch); 12002 continue; 12003 } 12004 12005 /* Map special whitespace to '\t', \n', '\r' */ 12006 if (ch == '\t') { 12007 PyUnicode_WRITE(okind, odata, o++, '\\'); 12008 PyUnicode_WRITE(okind, odata, o++, 't'); 12009 } 12010 else if (ch == '\n') { 12011 PyUnicode_WRITE(okind, odata, o++, '\\'); 12012 PyUnicode_WRITE(okind, odata, o++, 'n'); 12013 } 12014 else if (ch == '\r') { 12015 PyUnicode_WRITE(okind, odata, o++, '\\'); 12016 PyUnicode_WRITE(okind, odata, o++, 'r'); 12017 } 12018 12019 /* Map non-printable US ASCII to '\xhh' */ 12020 else if (ch < ' ' || ch == 0x7F) { 12021 PyUnicode_WRITE(okind, odata, o++, '\\'); 12022 PyUnicode_WRITE(okind, odata, o++, 'x'); 12023 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12024 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12025 } 12026 12027 /* Copy ASCII characters as-is */ 12028 else if (ch < 0x7F) { 12029 PyUnicode_WRITE(okind, odata, o++, ch); 12030 } 12031 12032 /* Non-ASCII characters */ 12033 else { 12034 /* Map Unicode whitespace and control characters 12035 (categories Z* and C* except ASCII space) 12036 */ 12037 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12038 PyUnicode_WRITE(okind, odata, o++, '\\'); 12039 /* Map 8-bit characters to '\xhh' */ 12040 if (ch <= 0xff) { 12041 PyUnicode_WRITE(okind, odata, o++, 'x'); 12042 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12043 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12044 } 12045 /* Map 16-bit characters to '\uxxxx' */ 12046 else if (ch <= 0xffff) { 12047 PyUnicode_WRITE(okind, odata, o++, 'u'); 12048 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12049 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12050 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12051 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12052 } 12053 /* Map 21-bit characters to '\U00xxxxxx' */ 12054 else { 12055 PyUnicode_WRITE(okind, odata, o++, 'U'); 12056 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12057 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12058 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12059 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12064 } 12065 } 12066 /* Copy characters as-is */ 12067 else { 12068 PyUnicode_WRITE(okind, odata, o++, ch); 12069 } 12070 } 12071 } 12072 /* Closing quote already added at the beginning */ 12073 assert(_PyUnicode_CheckConsistency(repr, 1)); 12074 return repr; 12075} 12076 12077PyDoc_STRVAR(rfind__doc__, 12078 "S.rfind(sub[, start[, end]]) -> int\n\ 12079\n\ 12080Return the highest index in S where substring sub is found,\n\ 12081such that sub is contained within S[start:end]. Optional\n\ 12082arguments start and end are interpreted as in slice notation.\n\ 12083\n\ 12084Return -1 on failure."); 12085 12086static PyObject * 12087unicode_rfind(PyObject *self, PyObject *args) 12088{ 12089 PyObject *substring; 12090 Py_ssize_t start; 12091 Py_ssize_t end; 12092 Py_ssize_t result; 12093 12094 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12095 &start, &end)) 12096 return NULL; 12097 12098 if (PyUnicode_READY(self) == -1) 12099 return NULL; 12100 if (PyUnicode_READY(substring) == -1) 12101 return NULL; 12102 12103 result = any_find_slice(-1, self, substring, start, end); 12104 12105 Py_DECREF(substring); 12106 12107 if (result == -2) 12108 return NULL; 12109 12110 return PyLong_FromSsize_t(result); 12111} 12112 12113PyDoc_STRVAR(rindex__doc__, 12114 "S.rindex(sub[, start[, end]]) -> int\n\ 12115\n\ 12116Like S.rfind() but raise ValueError when the substring is not found."); 12117 12118static PyObject * 12119unicode_rindex(PyObject *self, PyObject *args) 12120{ 12121 PyObject *substring; 12122 Py_ssize_t start; 12123 Py_ssize_t end; 12124 Py_ssize_t result; 12125 12126 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12127 &start, &end)) 12128 return NULL; 12129 12130 if (PyUnicode_READY(self) == -1) 12131 return NULL; 12132 if (PyUnicode_READY(substring) == -1) 12133 return NULL; 12134 12135 result = any_find_slice(-1, self, substring, start, end); 12136 12137 Py_DECREF(substring); 12138 12139 if (result == -2) 12140 return NULL; 12141 12142 if (result < 0) { 12143 PyErr_SetString(PyExc_ValueError, "substring not found"); 12144 return NULL; 12145 } 12146 12147 return PyLong_FromSsize_t(result); 12148} 12149 12150PyDoc_STRVAR(rjust__doc__, 12151 "S.rjust(width[, fillchar]) -> str\n\ 12152\n\ 12153Return S right-justified in a string of length width. Padding is\n\ 12154done using the specified fill character (default is a space)."); 12155 12156static PyObject * 12157unicode_rjust(PyObject *self, PyObject *args) 12158{ 12159 Py_ssize_t width; 12160 Py_UCS4 fillchar = ' '; 12161 12162 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12163 return NULL; 12164 12165 if (PyUnicode_READY(self) == -1) 12166 return NULL; 12167 12168 if (PyUnicode_GET_LENGTH(self) >= width) 12169 return unicode_result_unchanged(self); 12170 12171 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12172} 12173 12174PyObject * 12175PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12176{ 12177 PyObject *result; 12178 12179 s = PyUnicode_FromObject(s); 12180 if (s == NULL) 12181 return NULL; 12182 if (sep != NULL) { 12183 sep = PyUnicode_FromObject(sep); 12184 if (sep == NULL) { 12185 Py_DECREF(s); 12186 return NULL; 12187 } 12188 } 12189 12190 result = split(s, sep, maxsplit); 12191 12192 Py_DECREF(s); 12193 Py_XDECREF(sep); 12194 return result; 12195} 12196 12197PyDoc_STRVAR(split__doc__, 12198 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12199\n\ 12200Return a list of the words in S, using sep as the\n\ 12201delimiter string. If maxsplit is given, at most maxsplit\n\ 12202splits are done. If sep is not specified or is None, any\n\ 12203whitespace string is a separator and empty strings are\n\ 12204removed from the result."); 12205 12206static PyObject* 12207unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12208{ 12209 static char *kwlist[] = {"sep", "maxsplit", 0}; 12210 PyObject *substring = Py_None; 12211 Py_ssize_t maxcount = -1; 12212 12213 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12214 kwlist, &substring, &maxcount)) 12215 return NULL; 12216 12217 if (substring == Py_None) 12218 return split(self, NULL, maxcount); 12219 else if (PyUnicode_Check(substring)) 12220 return split(self, substring, maxcount); 12221 else 12222 return PyUnicode_Split(self, substring, maxcount); 12223} 12224 12225PyObject * 12226PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12227{ 12228 PyObject* str_obj; 12229 PyObject* sep_obj; 12230 PyObject* out; 12231 int kind1, kind2, kind; 12232 void *buf1 = NULL, *buf2 = NULL; 12233 Py_ssize_t len1, len2; 12234 12235 str_obj = PyUnicode_FromObject(str_in); 12236 if (!str_obj) 12237 return NULL; 12238 sep_obj = PyUnicode_FromObject(sep_in); 12239 if (!sep_obj) { 12240 Py_DECREF(str_obj); 12241 return NULL; 12242 } 12243 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12244 Py_DECREF(sep_obj); 12245 Py_DECREF(str_obj); 12246 return NULL; 12247 } 12248 12249 kind1 = PyUnicode_KIND(str_obj); 12250 kind2 = PyUnicode_KIND(sep_obj); 12251 kind = Py_MAX(kind1, kind2); 12252 buf1 = PyUnicode_DATA(str_obj); 12253 if (kind1 != kind) 12254 buf1 = _PyUnicode_AsKind(str_obj, kind); 12255 if (!buf1) 12256 goto onError; 12257 buf2 = PyUnicode_DATA(sep_obj); 12258 if (kind2 != kind) 12259 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12260 if (!buf2) 12261 goto onError; 12262 len1 = PyUnicode_GET_LENGTH(str_obj); 12263 len2 = PyUnicode_GET_LENGTH(sep_obj); 12264 12265 switch (PyUnicode_KIND(str_obj)) { 12266 case PyUnicode_1BYTE_KIND: 12267 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12268 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12269 else 12270 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12271 break; 12272 case PyUnicode_2BYTE_KIND: 12273 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12274 break; 12275 case PyUnicode_4BYTE_KIND: 12276 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12277 break; 12278 default: 12279 assert(0); 12280 out = 0; 12281 } 12282 12283 Py_DECREF(sep_obj); 12284 Py_DECREF(str_obj); 12285 if (kind1 != kind) 12286 PyMem_Free(buf1); 12287 if (kind2 != kind) 12288 PyMem_Free(buf2); 12289 12290 return out; 12291 onError: 12292 Py_DECREF(sep_obj); 12293 Py_DECREF(str_obj); 12294 if (kind1 != kind && buf1) 12295 PyMem_Free(buf1); 12296 if (kind2 != kind && buf2) 12297 PyMem_Free(buf2); 12298 return NULL; 12299} 12300 12301 12302PyObject * 12303PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12304{ 12305 PyObject* str_obj; 12306 PyObject* sep_obj; 12307 PyObject* out; 12308 int kind1, kind2, kind; 12309 void *buf1 = NULL, *buf2 = NULL; 12310 Py_ssize_t len1, len2; 12311 12312 str_obj = PyUnicode_FromObject(str_in); 12313 if (!str_obj) 12314 return NULL; 12315 sep_obj = PyUnicode_FromObject(sep_in); 12316 if (!sep_obj) { 12317 Py_DECREF(str_obj); 12318 return NULL; 12319 } 12320 12321 kind1 = PyUnicode_KIND(str_in); 12322 kind2 = PyUnicode_KIND(sep_obj); 12323 kind = Py_MAX(kind1, kind2); 12324 buf1 = PyUnicode_DATA(str_in); 12325 if (kind1 != kind) 12326 buf1 = _PyUnicode_AsKind(str_in, kind); 12327 if (!buf1) 12328 goto onError; 12329 buf2 = PyUnicode_DATA(sep_obj); 12330 if (kind2 != kind) 12331 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12332 if (!buf2) 12333 goto onError; 12334 len1 = PyUnicode_GET_LENGTH(str_obj); 12335 len2 = PyUnicode_GET_LENGTH(sep_obj); 12336 12337 switch (PyUnicode_KIND(str_in)) { 12338 case PyUnicode_1BYTE_KIND: 12339 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12340 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12341 else 12342 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12343 break; 12344 case PyUnicode_2BYTE_KIND: 12345 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12346 break; 12347 case PyUnicode_4BYTE_KIND: 12348 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12349 break; 12350 default: 12351 assert(0); 12352 out = 0; 12353 } 12354 12355 Py_DECREF(sep_obj); 12356 Py_DECREF(str_obj); 12357 if (kind1 != kind) 12358 PyMem_Free(buf1); 12359 if (kind2 != kind) 12360 PyMem_Free(buf2); 12361 12362 return out; 12363 onError: 12364 Py_DECREF(sep_obj); 12365 Py_DECREF(str_obj); 12366 if (kind1 != kind && buf1) 12367 PyMem_Free(buf1); 12368 if (kind2 != kind && buf2) 12369 PyMem_Free(buf2); 12370 return NULL; 12371} 12372 12373PyDoc_STRVAR(partition__doc__, 12374 "S.partition(sep) -> (head, sep, tail)\n\ 12375\n\ 12376Search for the separator sep in S, and return the part before it,\n\ 12377the separator itself, and the part after it. If the separator is not\n\ 12378found, return S and two empty strings."); 12379 12380static PyObject* 12381unicode_partition(PyObject *self, PyObject *separator) 12382{ 12383 return PyUnicode_Partition(self, separator); 12384} 12385 12386PyDoc_STRVAR(rpartition__doc__, 12387 "S.rpartition(sep) -> (head, sep, tail)\n\ 12388\n\ 12389Search for the separator sep in S, starting at the end of S, and return\n\ 12390the part before it, the separator itself, and the part after it. If the\n\ 12391separator is not found, return two empty strings and S."); 12392 12393static PyObject* 12394unicode_rpartition(PyObject *self, PyObject *separator) 12395{ 12396 return PyUnicode_RPartition(self, separator); 12397} 12398 12399PyObject * 12400PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12401{ 12402 PyObject *result; 12403 12404 s = PyUnicode_FromObject(s); 12405 if (s == NULL) 12406 return NULL; 12407 if (sep != NULL) { 12408 sep = PyUnicode_FromObject(sep); 12409 if (sep == NULL) { 12410 Py_DECREF(s); 12411 return NULL; 12412 } 12413 } 12414 12415 result = rsplit(s, sep, maxsplit); 12416 12417 Py_DECREF(s); 12418 Py_XDECREF(sep); 12419 return result; 12420} 12421 12422PyDoc_STRVAR(rsplit__doc__, 12423 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12424\n\ 12425Return a list of the words in S, using sep as the\n\ 12426delimiter string, starting at the end of the string and\n\ 12427working to the front. If maxsplit is given, at most maxsplit\n\ 12428splits are done. If sep is not specified, any whitespace string\n\ 12429is a separator."); 12430 12431static PyObject* 12432unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12433{ 12434 static char *kwlist[] = {"sep", "maxsplit", 0}; 12435 PyObject *substring = Py_None; 12436 Py_ssize_t maxcount = -1; 12437 12438 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12439 kwlist, &substring, &maxcount)) 12440 return NULL; 12441 12442 if (substring == Py_None) 12443 return rsplit(self, NULL, maxcount); 12444 else if (PyUnicode_Check(substring)) 12445 return rsplit(self, substring, maxcount); 12446 else 12447 return PyUnicode_RSplit(self, substring, maxcount); 12448} 12449 12450PyDoc_STRVAR(splitlines__doc__, 12451 "S.splitlines([keepends]) -> list of strings\n\ 12452\n\ 12453Return a list of the lines in S, breaking at line boundaries.\n\ 12454Line breaks are not included in the resulting list unless keepends\n\ 12455is given and true."); 12456 12457static PyObject* 12458unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12459{ 12460 static char *kwlist[] = {"keepends", 0}; 12461 int keepends = 0; 12462 12463 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12464 kwlist, &keepends)) 12465 return NULL; 12466 12467 return PyUnicode_Splitlines(self, keepends); 12468} 12469 12470static 12471PyObject *unicode_str(PyObject *self) 12472{ 12473 return unicode_result_unchanged(self); 12474} 12475 12476PyDoc_STRVAR(swapcase__doc__, 12477 "S.swapcase() -> str\n\ 12478\n\ 12479Return a copy of S with uppercase characters converted to lowercase\n\ 12480and vice versa."); 12481 12482static PyObject* 12483unicode_swapcase(PyObject *self) 12484{ 12485 if (PyUnicode_READY(self) == -1) 12486 return NULL; 12487 return case_operation(self, do_swapcase); 12488} 12489 12490PyDoc_STRVAR(maketrans__doc__, 12491 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12492\n\ 12493Return a translation table usable for str.translate().\n\ 12494If there is only one argument, it must be a dictionary mapping Unicode\n\ 12495ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12496Character keys will be then converted to ordinals.\n\ 12497If there are two arguments, they must be strings of equal length, and\n\ 12498in the resulting dictionary, each character in x will be mapped to the\n\ 12499character at the same position in y. If there is a third argument, it\n\ 12500must be a string, whose characters will be mapped to None in the result."); 12501 12502static PyObject* 12503unicode_maketrans(PyObject *null, PyObject *args) 12504{ 12505 PyObject *x, *y = NULL, *z = NULL; 12506 PyObject *new = NULL, *key, *value; 12507 Py_ssize_t i = 0; 12508 int res; 12509 12510 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12511 return NULL; 12512 new = PyDict_New(); 12513 if (!new) 12514 return NULL; 12515 if (y != NULL) { 12516 int x_kind, y_kind, z_kind; 12517 void *x_data, *y_data, *z_data; 12518 12519 /* x must be a string too, of equal length */ 12520 if (!PyUnicode_Check(x)) { 12521 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12522 "be a string if there is a second argument"); 12523 goto err; 12524 } 12525 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12526 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12527 "arguments must have equal length"); 12528 goto err; 12529 } 12530 /* create entries for translating chars in x to those in y */ 12531 x_kind = PyUnicode_KIND(x); 12532 y_kind = PyUnicode_KIND(y); 12533 x_data = PyUnicode_DATA(x); 12534 y_data = PyUnicode_DATA(y); 12535 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12536 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12537 if (!key) 12538 goto err; 12539 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12540 if (!value) { 12541 Py_DECREF(key); 12542 goto err; 12543 } 12544 res = PyDict_SetItem(new, key, value); 12545 Py_DECREF(key); 12546 Py_DECREF(value); 12547 if (res < 0) 12548 goto err; 12549 } 12550 /* create entries for deleting chars in z */ 12551 if (z != NULL) { 12552 z_kind = PyUnicode_KIND(z); 12553 z_data = PyUnicode_DATA(z); 12554 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 12555 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12556 if (!key) 12557 goto err; 12558 res = PyDict_SetItem(new, key, Py_None); 12559 Py_DECREF(key); 12560 if (res < 0) 12561 goto err; 12562 } 12563 } 12564 } else { 12565 int kind; 12566 void *data; 12567 12568 /* x must be a dict */ 12569 if (!PyDict_CheckExact(x)) { 12570 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12571 "to maketrans it must be a dict"); 12572 goto err; 12573 } 12574 /* copy entries into the new dict, converting string keys to int keys */ 12575 while (PyDict_Next(x, &i, &key, &value)) { 12576 if (PyUnicode_Check(key)) { 12577 /* convert string keys to integer keys */ 12578 PyObject *newkey; 12579 if (PyUnicode_GET_LENGTH(key) != 1) { 12580 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12581 "table must be of length 1"); 12582 goto err; 12583 } 12584 kind = PyUnicode_KIND(key); 12585 data = PyUnicode_DATA(key); 12586 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12587 if (!newkey) 12588 goto err; 12589 res = PyDict_SetItem(new, newkey, value); 12590 Py_DECREF(newkey); 12591 if (res < 0) 12592 goto err; 12593 } else if (PyLong_Check(key)) { 12594 /* just keep integer keys */ 12595 if (PyDict_SetItem(new, key, value) < 0) 12596 goto err; 12597 } else { 12598 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12599 "be strings or integers"); 12600 goto err; 12601 } 12602 } 12603 } 12604 return new; 12605 err: 12606 Py_DECREF(new); 12607 return NULL; 12608} 12609 12610PyDoc_STRVAR(translate__doc__, 12611 "S.translate(table) -> str\n\ 12612\n\ 12613Return a copy of the string S, where all characters have been mapped\n\ 12614through the given translation table, which must be a mapping of\n\ 12615Unicode ordinals to Unicode ordinals, strings, or None.\n\ 12616Unmapped characters are left untouched. Characters mapped to None\n\ 12617are deleted."); 12618 12619static PyObject* 12620unicode_translate(PyObject *self, PyObject *table) 12621{ 12622 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12623} 12624 12625PyDoc_STRVAR(upper__doc__, 12626 "S.upper() -> str\n\ 12627\n\ 12628Return a copy of S converted to uppercase."); 12629 12630static PyObject* 12631unicode_upper(PyObject *self) 12632{ 12633 if (PyUnicode_READY(self) == -1) 12634 return NULL; 12635 if (PyUnicode_IS_ASCII(self)) 12636 return ascii_upper_or_lower(self, 0); 12637 return case_operation(self, do_upper); 12638} 12639 12640PyDoc_STRVAR(zfill__doc__, 12641 "S.zfill(width) -> str\n\ 12642\n\ 12643Pad a numeric string S with zeros on the left, to fill a field\n\ 12644of the specified width. The string S is never truncated."); 12645 12646static PyObject * 12647unicode_zfill(PyObject *self, PyObject *args) 12648{ 12649 Py_ssize_t fill; 12650 PyObject *u; 12651 Py_ssize_t width; 12652 int kind; 12653 void *data; 12654 Py_UCS4 chr; 12655 12656 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12657 return NULL; 12658 12659 if (PyUnicode_READY(self) == -1) 12660 return NULL; 12661 12662 if (PyUnicode_GET_LENGTH(self) >= width) 12663 return unicode_result_unchanged(self); 12664 12665 fill = width - PyUnicode_GET_LENGTH(self); 12666 12667 u = pad(self, fill, 0, '0'); 12668 12669 if (u == NULL) 12670 return NULL; 12671 12672 kind = PyUnicode_KIND(u); 12673 data = PyUnicode_DATA(u); 12674 chr = PyUnicode_READ(kind, data, fill); 12675 12676 if (chr == '+' || chr == '-') { 12677 /* move sign to beginning of string */ 12678 PyUnicode_WRITE(kind, data, 0, chr); 12679 PyUnicode_WRITE(kind, data, fill, '0'); 12680 } 12681 12682 assert(_PyUnicode_CheckConsistency(u, 1)); 12683 return u; 12684} 12685 12686#if 0 12687static PyObject * 12688unicode__decimal2ascii(PyObject *self) 12689{ 12690 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 12691} 12692#endif 12693 12694PyDoc_STRVAR(startswith__doc__, 12695 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12696\n\ 12697Return True if S starts with the specified prefix, False otherwise.\n\ 12698With optional start, test S beginning at that position.\n\ 12699With optional end, stop comparing S at that position.\n\ 12700prefix can also be a tuple of strings to try."); 12701 12702static PyObject * 12703unicode_startswith(PyObject *self, 12704 PyObject *args) 12705{ 12706 PyObject *subobj; 12707 PyObject *substring; 12708 Py_ssize_t start = 0; 12709 Py_ssize_t end = PY_SSIZE_T_MAX; 12710 int result; 12711 12712 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12713 return NULL; 12714 if (PyTuple_Check(subobj)) { 12715 Py_ssize_t i; 12716 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12717 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 12718 if (substring == NULL) 12719 return NULL; 12720 result = tailmatch(self, substring, start, end, -1); 12721 Py_DECREF(substring); 12722 if (result) { 12723 Py_RETURN_TRUE; 12724 } 12725 } 12726 /* nothing matched */ 12727 Py_RETURN_FALSE; 12728 } 12729 substring = PyUnicode_FromObject(subobj); 12730 if (substring == NULL) { 12731 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12732 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12733 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12734 return NULL; 12735 } 12736 result = tailmatch(self, substring, start, end, -1); 12737 Py_DECREF(substring); 12738 return PyBool_FromLong(result); 12739} 12740 12741 12742PyDoc_STRVAR(endswith__doc__, 12743 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12744\n\ 12745Return True if S ends with the specified suffix, False otherwise.\n\ 12746With optional start, test S beginning at that position.\n\ 12747With optional end, stop comparing S at that position.\n\ 12748suffix can also be a tuple of strings to try."); 12749 12750static PyObject * 12751unicode_endswith(PyObject *self, 12752 PyObject *args) 12753{ 12754 PyObject *subobj; 12755 PyObject *substring; 12756 Py_ssize_t start = 0; 12757 Py_ssize_t end = PY_SSIZE_T_MAX; 12758 int result; 12759 12760 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12761 return NULL; 12762 if (PyTuple_Check(subobj)) { 12763 Py_ssize_t i; 12764 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12765 substring = PyUnicode_FromObject( 12766 PyTuple_GET_ITEM(subobj, i)); 12767 if (substring == NULL) 12768 return NULL; 12769 result = tailmatch(self, substring, start, end, +1); 12770 Py_DECREF(substring); 12771 if (result) { 12772 Py_RETURN_TRUE; 12773 } 12774 } 12775 Py_RETURN_FALSE; 12776 } 12777 substring = PyUnicode_FromObject(subobj); 12778 if (substring == NULL) { 12779 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12780 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12781 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12782 return NULL; 12783 } 12784 result = tailmatch(self, substring, start, end, +1); 12785 Py_DECREF(substring); 12786 return PyBool_FromLong(result); 12787} 12788 12789Py_LOCAL_INLINE(void) 12790_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 12791{ 12792 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 12793 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 12794 writer->data = PyUnicode_DATA(writer->buffer); 12795 writer->kind = PyUnicode_KIND(writer->buffer); 12796} 12797 12798void 12799_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) 12800{ 12801 memset(writer, 0, sizeof(*writer)); 12802#ifdef Py_DEBUG 12803 writer->kind = 5; /* invalid kind */ 12804#endif 12805 writer->min_length = Py_MAX(min_length, 100); 12806 writer->overallocate = (min_length > 0); 12807} 12808 12809int 12810_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 12811 Py_ssize_t length, Py_UCS4 maxchar) 12812{ 12813 Py_ssize_t newlen; 12814 PyObject *newbuffer; 12815 12816 assert(length > 0); 12817 12818 if (length > PY_SSIZE_T_MAX - writer->pos) { 12819 PyErr_NoMemory(); 12820 return -1; 12821 } 12822 newlen = writer->pos + length; 12823 12824 if (writer->buffer == NULL) { 12825 if (writer->overallocate) { 12826 /* overallocate 25% to limit the number of resize */ 12827 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) 12828 newlen += newlen / 4; 12829 if (newlen < writer->min_length) 12830 newlen = writer->min_length; 12831 } 12832 writer->buffer = PyUnicode_New(newlen, maxchar); 12833 if (writer->buffer == NULL) 12834 return -1; 12835 _PyUnicodeWriter_Update(writer); 12836 return 0; 12837 } 12838 12839 if (newlen > writer->size) { 12840 if (writer->overallocate) { 12841 /* overallocate 25% to limit the number of resize */ 12842 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) 12843 newlen += newlen / 4; 12844 if (newlen < writer->min_length) 12845 newlen = writer->min_length; 12846 } 12847 12848 if (maxchar > writer->maxchar || writer->readonly) { 12849 /* resize + widen */ 12850 newbuffer = PyUnicode_New(newlen, maxchar); 12851 if (newbuffer == NULL) 12852 return -1; 12853 _PyUnicode_FastCopyCharacters(newbuffer, 0, 12854 writer->buffer, 0, writer->pos); 12855 Py_DECREF(writer->buffer); 12856 writer->readonly = 0; 12857 } 12858 else { 12859 newbuffer = resize_compact(writer->buffer, newlen); 12860 if (newbuffer == NULL) 12861 return -1; 12862 } 12863 writer->buffer = newbuffer; 12864 _PyUnicodeWriter_Update(writer); 12865 } 12866 else if (maxchar > writer->maxchar) { 12867 assert(!writer->readonly); 12868 newbuffer = PyUnicode_New(writer->size, maxchar); 12869 if (newbuffer == NULL) 12870 return -1; 12871 _PyUnicode_FastCopyCharacters(newbuffer, 0, 12872 writer->buffer, 0, writer->pos); 12873 Py_DECREF(writer->buffer); 12874 writer->buffer = newbuffer; 12875 _PyUnicodeWriter_Update(writer); 12876 } 12877 return 0; 12878} 12879 12880int 12881_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 12882{ 12883 Py_UCS4 maxchar; 12884 Py_ssize_t len; 12885 12886 if (PyUnicode_READY(str) == -1) 12887 return -1; 12888 len = PyUnicode_GET_LENGTH(str); 12889 if (len == 0) 12890 return 0; 12891 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 12892 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 12893 if (writer->buffer == NULL && !writer->overallocate) { 12894 Py_INCREF(str); 12895 writer->buffer = str; 12896 _PyUnicodeWriter_Update(writer); 12897 writer->readonly = 1; 12898 writer->size = 0; 12899 writer->pos += len; 12900 return 0; 12901 } 12902 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 12903 return -1; 12904 } 12905 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 12906 str, 0, len); 12907 writer->pos += len; 12908 return 0; 12909} 12910 12911PyObject * 12912_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 12913{ 12914 if (writer->pos == 0) { 12915 Py_XDECREF(writer->buffer); 12916 Py_INCREF(unicode_empty); 12917 return unicode_empty; 12918 } 12919 if (writer->readonly) { 12920 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos); 12921 return writer->buffer; 12922 } 12923 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 12924 PyObject *newbuffer; 12925 newbuffer = resize_compact(writer->buffer, writer->pos); 12926 if (newbuffer == NULL) { 12927 Py_DECREF(writer->buffer); 12928 return NULL; 12929 } 12930 writer->buffer = newbuffer; 12931 } 12932 assert(_PyUnicode_CheckConsistency(writer->buffer, 1)); 12933 return writer->buffer; 12934} 12935 12936void 12937_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 12938{ 12939 Py_CLEAR(writer->buffer); 12940} 12941 12942#include "stringlib/unicode_format.h" 12943 12944PyDoc_STRVAR(format__doc__, 12945 "S.format(*args, **kwargs) -> str\n\ 12946\n\ 12947Return a formatted version of S, using substitutions from args and kwargs.\n\ 12948The substitutions are identified by braces ('{' and '}')."); 12949 12950PyDoc_STRVAR(format_map__doc__, 12951 "S.format_map(mapping) -> str\n\ 12952\n\ 12953Return a formatted version of S, using substitutions from mapping.\n\ 12954The substitutions are identified by braces ('{' and '}')."); 12955 12956static PyObject * 12957unicode__format__(PyObject* self, PyObject* args) 12958{ 12959 PyObject *format_spec; 12960 _PyUnicodeWriter writer; 12961 int ret; 12962 12963 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12964 return NULL; 12965 12966 if (PyUnicode_READY(self) == -1) 12967 return NULL; 12968 _PyUnicodeWriter_Init(&writer, 0); 12969 ret = _PyUnicode_FormatAdvancedWriter(&writer, 12970 self, format_spec, 0, 12971 PyUnicode_GET_LENGTH(format_spec)); 12972 if (ret == -1) { 12973 _PyUnicodeWriter_Dealloc(&writer); 12974 return NULL; 12975 } 12976 return _PyUnicodeWriter_Finish(&writer); 12977} 12978 12979PyDoc_STRVAR(p_format__doc__, 12980 "S.__format__(format_spec) -> str\n\ 12981\n\ 12982Return a formatted version of S as described by format_spec."); 12983 12984static PyObject * 12985unicode__sizeof__(PyObject *v) 12986{ 12987 Py_ssize_t size; 12988 12989 /* If it's a compact object, account for base structure + 12990 character data. */ 12991 if (PyUnicode_IS_COMPACT_ASCII(v)) 12992 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12993 else if (PyUnicode_IS_COMPACT(v)) 12994 size = sizeof(PyCompactUnicodeObject) + 12995 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 12996 else { 12997 /* If it is a two-block object, account for base object, and 12998 for character block if present. */ 12999 size = sizeof(PyUnicodeObject); 13000 if (_PyUnicode_DATA_ANY(v)) 13001 size += (PyUnicode_GET_LENGTH(v) + 1) * 13002 PyUnicode_KIND(v); 13003 } 13004 /* If the wstr pointer is present, account for it unless it is shared 13005 with the data pointer. Check if the data is not shared. */ 13006 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 13007 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 13008 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 13009 size += PyUnicode_UTF8_LENGTH(v) + 1; 13010 13011 return PyLong_FromSsize_t(size); 13012} 13013 13014PyDoc_STRVAR(sizeof__doc__, 13015 "S.__sizeof__() -> size of S in memory, in bytes"); 13016 13017static PyObject * 13018unicode_getnewargs(PyObject *v) 13019{ 13020 PyObject *copy = _PyUnicode_Copy(v); 13021 if (!copy) 13022 return NULL; 13023 return Py_BuildValue("(N)", copy); 13024} 13025 13026static PyMethodDef unicode_methods[] = { 13027 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 13028 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 13029 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 13030 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 13031 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 13032 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 13033 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 13034 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 13035 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 13036 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13037 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 13038 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13039 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 13040 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 13041 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 13042 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 13043 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 13044 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 13045 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13046 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 13047 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 13048 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 13049 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 13050 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 13051 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 13052 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 13053 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 13054 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13055 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13056 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 13057 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 13058 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 13059 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 13060 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 13061 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 13062 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 13063 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 13064 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13065 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 13066 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13067 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13068 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 13069 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13070 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13071 {"maketrans", (PyCFunction) unicode_maketrans, 13072 METH_VARARGS | METH_STATIC, maketrans__doc__}, 13073 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13074#if 0 13075 /* These methods are just used for debugging the implementation. */ 13076 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13077#endif 13078 13079 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13080 {NULL, NULL} 13081}; 13082 13083static PyObject * 13084unicode_mod(PyObject *v, PyObject *w) 13085{ 13086 if (!PyUnicode_Check(v)) 13087 Py_RETURN_NOTIMPLEMENTED; 13088 return PyUnicode_Format(v, w); 13089} 13090 13091static PyNumberMethods unicode_as_number = { 13092 0, /*nb_add*/ 13093 0, /*nb_subtract*/ 13094 0, /*nb_multiply*/ 13095 unicode_mod, /*nb_remainder*/ 13096}; 13097 13098static PySequenceMethods unicode_as_sequence = { 13099 (lenfunc) unicode_length, /* sq_length */ 13100 PyUnicode_Concat, /* sq_concat */ 13101 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13102 (ssizeargfunc) unicode_getitem, /* sq_item */ 13103 0, /* sq_slice */ 13104 0, /* sq_ass_item */ 13105 0, /* sq_ass_slice */ 13106 PyUnicode_Contains, /* sq_contains */ 13107}; 13108 13109static PyObject* 13110unicode_subscript(PyObject* self, PyObject* item) 13111{ 13112 if (PyUnicode_READY(self) == -1) 13113 return NULL; 13114 13115 if (PyIndex_Check(item)) { 13116 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13117 if (i == -1 && PyErr_Occurred()) 13118 return NULL; 13119 if (i < 0) 13120 i += PyUnicode_GET_LENGTH(self); 13121 return unicode_getitem(self, i); 13122 } else if (PySlice_Check(item)) { 13123 Py_ssize_t start, stop, step, slicelength, cur, i; 13124 PyObject *result; 13125 void *src_data, *dest_data; 13126 int src_kind, dest_kind; 13127 Py_UCS4 ch, max_char, kind_limit; 13128 13129 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13130 &start, &stop, &step, &slicelength) < 0) { 13131 return NULL; 13132 } 13133 13134 if (slicelength <= 0) { 13135 Py_INCREF(unicode_empty); 13136 return unicode_empty; 13137 } else if (start == 0 && step == 1 && 13138 slicelength == PyUnicode_GET_LENGTH(self)) { 13139 return unicode_result_unchanged(self); 13140 } else if (step == 1) { 13141 return PyUnicode_Substring(self, 13142 start, start + slicelength); 13143 } 13144 /* General case */ 13145 src_kind = PyUnicode_KIND(self); 13146 src_data = PyUnicode_DATA(self); 13147 if (!PyUnicode_IS_ASCII(self)) { 13148 kind_limit = kind_maxchar_limit(src_kind); 13149 max_char = 0; 13150 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13151 ch = PyUnicode_READ(src_kind, src_data, cur); 13152 if (ch > max_char) { 13153 max_char = ch; 13154 if (max_char >= kind_limit) 13155 break; 13156 } 13157 } 13158 } 13159 else 13160 max_char = 127; 13161 result = PyUnicode_New(slicelength, max_char); 13162 if (result == NULL) 13163 return NULL; 13164 dest_kind = PyUnicode_KIND(result); 13165 dest_data = PyUnicode_DATA(result); 13166 13167 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13168 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13169 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13170 } 13171 assert(_PyUnicode_CheckConsistency(result, 1)); 13172 return result; 13173 } else { 13174 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13175 return NULL; 13176 } 13177} 13178 13179static PyMappingMethods unicode_as_mapping = { 13180 (lenfunc)unicode_length, /* mp_length */ 13181 (binaryfunc)unicode_subscript, /* mp_subscript */ 13182 (objobjargproc)0, /* mp_ass_subscript */ 13183}; 13184 13185 13186/* Helpers for PyUnicode_Format() */ 13187 13188static PyObject * 13189getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 13190{ 13191 Py_ssize_t argidx = *p_argidx; 13192 if (argidx < arglen) { 13193 (*p_argidx)++; 13194 if (arglen < 0) 13195 return args; 13196 else 13197 return PyTuple_GetItem(args, argidx); 13198 } 13199 PyErr_SetString(PyExc_TypeError, 13200 "not enough arguments for format string"); 13201 return NULL; 13202} 13203 13204/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13205 13206static int 13207formatfloat(PyObject *v, int flags, int prec, int type, 13208 PyObject **p_output, _PyUnicodeWriter *writer) 13209{ 13210 char *p; 13211 double x; 13212 Py_ssize_t len; 13213 13214 x = PyFloat_AsDouble(v); 13215 if (x == -1.0 && PyErr_Occurred()) 13216 return -1; 13217 13218 if (prec < 0) 13219 prec = 6; 13220 13221 p = PyOS_double_to_string(x, type, prec, 13222 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 13223 if (p == NULL) 13224 return -1; 13225 len = strlen(p); 13226 if (writer) { 13227 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) { 13228 PyMem_Free(p); 13229 return -1; 13230 } 13231 unicode_write_cstr(writer->buffer, writer->pos, p, len); 13232 writer->pos += len; 13233 } 13234 else 13235 *p_output = _PyUnicode_FromASCII(p, len); 13236 PyMem_Free(p); 13237 return 0; 13238} 13239 13240/* formatlong() emulates the format codes d, u, o, x and X, and 13241 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13242 * Python's regular ints. 13243 * Return value: a new PyUnicodeObject*, or NULL if error. 13244 * The output string is of the form 13245 * "-"? ("0x" | "0X")? digit+ 13246 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13247 * set in flags. The case of hex digits will be correct, 13248 * There will be at least prec digits, zero-filled on the left if 13249 * necessary to get that many. 13250 * val object to be converted 13251 * flags bitmask of format flags; only F_ALT is looked at 13252 * prec minimum number of digits; 0-fill on left if needed 13253 * type a character in [duoxX]; u acts the same as d 13254 * 13255 * CAUTION: o, x and X conversions on regular ints can never 13256 * produce a '-' sign, but can for Python's unbounded ints. 13257 */ 13258static PyObject* 13259formatlong(PyObject *val, int flags, int prec, int type) 13260{ 13261 PyObject *result = NULL; 13262 char *buf; 13263 Py_ssize_t i; 13264 int sign; /* 1 if '-', else 0 */ 13265 int len; /* number of characters */ 13266 Py_ssize_t llen; 13267 int numdigits; /* len == numnondigits + numdigits */ 13268 int numnondigits = 0; 13269 13270 /* Avoid exceeding SSIZE_T_MAX */ 13271 if (prec > INT_MAX-3) { 13272 PyErr_SetString(PyExc_OverflowError, 13273 "precision too large"); 13274 return NULL; 13275 } 13276 13277 assert(PyLong_Check(val)); 13278 13279 switch (type) { 13280 case 'd': 13281 case 'u': 13282 /* Special-case boolean: we want 0/1 */ 13283 if (PyBool_Check(val)) 13284 result = PyNumber_ToBase(val, 10); 13285 else 13286 result = Py_TYPE(val)->tp_str(val); 13287 break; 13288 case 'o': 13289 numnondigits = 2; 13290 result = PyNumber_ToBase(val, 8); 13291 break; 13292 case 'x': 13293 case 'X': 13294 numnondigits = 2; 13295 result = PyNumber_ToBase(val, 16); 13296 break; 13297 default: 13298 assert(!"'type' not in [duoxX]"); 13299 } 13300 if (!result) 13301 return NULL; 13302 13303 assert(unicode_modifiable(result)); 13304 assert(PyUnicode_IS_READY(result)); 13305 assert(PyUnicode_IS_ASCII(result)); 13306 13307 /* To modify the string in-place, there can only be one reference. */ 13308 if (Py_REFCNT(result) != 1) { 13309 PyErr_BadInternalCall(); 13310 return NULL; 13311 } 13312 buf = PyUnicode_DATA(result); 13313 llen = PyUnicode_GET_LENGTH(result); 13314 if (llen > INT_MAX) { 13315 PyErr_SetString(PyExc_ValueError, 13316 "string too large in _PyBytes_FormatLong"); 13317 return NULL; 13318 } 13319 len = (int)llen; 13320 sign = buf[0] == '-'; 13321 numnondigits += sign; 13322 numdigits = len - numnondigits; 13323 assert(numdigits > 0); 13324 13325 /* Get rid of base marker unless F_ALT */ 13326 if (((flags & F_ALT) == 0 && 13327 (type == 'o' || type == 'x' || type == 'X'))) { 13328 assert(buf[sign] == '0'); 13329 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 13330 buf[sign+1] == 'o'); 13331 numnondigits -= 2; 13332 buf += 2; 13333 len -= 2; 13334 if (sign) 13335 buf[0] = '-'; 13336 assert(len == numnondigits + numdigits); 13337 assert(numdigits > 0); 13338 } 13339 13340 /* Fill with leading zeroes to meet minimum width. */ 13341 if (prec > numdigits) { 13342 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 13343 numnondigits + prec); 13344 char *b1; 13345 if (!r1) { 13346 Py_DECREF(result); 13347 return NULL; 13348 } 13349 b1 = PyBytes_AS_STRING(r1); 13350 for (i = 0; i < numnondigits; ++i) 13351 *b1++ = *buf++; 13352 for (i = 0; i < prec - numdigits; i++) 13353 *b1++ = '0'; 13354 for (i = 0; i < numdigits; i++) 13355 *b1++ = *buf++; 13356 *b1 = '\0'; 13357 Py_DECREF(result); 13358 result = r1; 13359 buf = PyBytes_AS_STRING(result); 13360 len = numnondigits + prec; 13361 } 13362 13363 /* Fix up case for hex conversions. */ 13364 if (type == 'X') { 13365 /* Need to convert all lower case letters to upper case. 13366 and need to convert 0x to 0X (and -0x to -0X). */ 13367 for (i = 0; i < len; i++) 13368 if (buf[i] >= 'a' && buf[i] <= 'x') 13369 buf[i] -= 'a'-'A'; 13370 } 13371 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) { 13372 PyObject *unicode; 13373 unicode = _PyUnicode_FromASCII(buf, len); 13374 Py_DECREF(result); 13375 result = unicode; 13376 } 13377 return result; 13378} 13379 13380static Py_UCS4 13381formatchar(PyObject *v) 13382{ 13383 /* presume that the buffer is at least 3 characters long */ 13384 if (PyUnicode_Check(v)) { 13385 if (PyUnicode_GET_LENGTH(v) == 1) { 13386 return PyUnicode_READ_CHAR(v, 0); 13387 } 13388 goto onError; 13389 } 13390 else { 13391 /* Integer input truncated to a character */ 13392 long x; 13393 x = PyLong_AsLong(v); 13394 if (x == -1 && PyErr_Occurred()) 13395 goto onError; 13396 13397 if (x < 0 || x > MAX_UNICODE) { 13398 PyErr_SetString(PyExc_OverflowError, 13399 "%c arg not in range(0x110000)"); 13400 return (Py_UCS4) -1; 13401 } 13402 13403 return (Py_UCS4) x; 13404 } 13405 13406 onError: 13407 PyErr_SetString(PyExc_TypeError, 13408 "%c requires int or char"); 13409 return (Py_UCS4) -1; 13410} 13411 13412PyObject * 13413PyUnicode_Format(PyObject *format, PyObject *args) 13414{ 13415 Py_ssize_t fmtcnt, fmtpos, arglen, argidx; 13416 int args_owned = 0; 13417 PyObject *dict = NULL; 13418 PyObject *temp = NULL; 13419 PyObject *second = NULL; 13420 PyObject *uformat; 13421 void *fmt; 13422 enum PyUnicode_Kind kind, fmtkind; 13423 _PyUnicodeWriter writer; 13424 Py_ssize_t sublen; 13425 Py_UCS4 maxchar; 13426 13427 if (format == NULL || args == NULL) { 13428 PyErr_BadInternalCall(); 13429 return NULL; 13430 } 13431 uformat = PyUnicode_FromObject(format); 13432 if (uformat == NULL) 13433 return NULL; 13434 if (PyUnicode_READY(uformat) == -1) { 13435 Py_DECREF(uformat); 13436 return NULL; 13437 } 13438 13439 fmt = PyUnicode_DATA(uformat); 13440 fmtkind = PyUnicode_KIND(uformat); 13441 fmtcnt = PyUnicode_GET_LENGTH(uformat); 13442 fmtpos = 0; 13443 13444 _PyUnicodeWriter_Init(&writer, fmtcnt + 100); 13445 13446 if (PyTuple_Check(args)) { 13447 arglen = PyTuple_Size(args); 13448 argidx = 0; 13449 } 13450 else { 13451 arglen = -1; 13452 argidx = -2; 13453 } 13454 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) 13455 dict = args; 13456 13457 while (--fmtcnt >= 0) { 13458 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13459 Py_ssize_t nonfmtpos; 13460 nonfmtpos = fmtpos++; 13461 while (fmtcnt >= 0 && 13462 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13463 fmtpos++; 13464 fmtcnt--; 13465 } 13466 if (fmtcnt < 0) 13467 fmtpos--; 13468 sublen = fmtpos - nonfmtpos; 13469 maxchar = _PyUnicode_FindMaxChar(uformat, 13470 nonfmtpos, nonfmtpos + sublen); 13471 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1) 13472 goto onError; 13473 13474 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos, 13475 uformat, nonfmtpos, sublen); 13476 writer.pos += sublen; 13477 } 13478 else { 13479 /* Got a format specifier */ 13480 int flags = 0; 13481 Py_ssize_t width = -1; 13482 int prec = -1; 13483 Py_UCS4 c = '\0'; 13484 Py_UCS4 fill; 13485 int sign; 13486 Py_UCS4 signchar; 13487 int isnumok; 13488 PyObject *v = NULL; 13489 void *pbuf = NULL; 13490 Py_ssize_t pindex, len; 13491 Py_UCS4 bufmaxchar; 13492 Py_ssize_t buflen; 13493 13494 fmtpos++; 13495 c = PyUnicode_READ(fmtkind, fmt, fmtpos); 13496 if (c == '(') { 13497 Py_ssize_t keystart; 13498 Py_ssize_t keylen; 13499 PyObject *key; 13500 int pcount = 1; 13501 13502 if (dict == NULL) { 13503 PyErr_SetString(PyExc_TypeError, 13504 "format requires a mapping"); 13505 goto onError; 13506 } 13507 ++fmtpos; 13508 --fmtcnt; 13509 keystart = fmtpos; 13510 /* Skip over balanced parentheses */ 13511 while (pcount > 0 && --fmtcnt >= 0) { 13512 c = PyUnicode_READ(fmtkind, fmt, fmtpos); 13513 if (c == ')') 13514 --pcount; 13515 else if (c == '(') 13516 ++pcount; 13517 fmtpos++; 13518 } 13519 keylen = fmtpos - keystart - 1; 13520 if (fmtcnt < 0 || pcount > 0) { 13521 PyErr_SetString(PyExc_ValueError, 13522 "incomplete format key"); 13523 goto onError; 13524 } 13525 key = PyUnicode_Substring(uformat, 13526 keystart, keystart + keylen); 13527 if (key == NULL) 13528 goto onError; 13529 if (args_owned) { 13530 Py_DECREF(args); 13531 args_owned = 0; 13532 } 13533 args = PyObject_GetItem(dict, key); 13534 Py_DECREF(key); 13535 if (args == NULL) { 13536 goto onError; 13537 } 13538 args_owned = 1; 13539 arglen = -1; 13540 argidx = -2; 13541 } 13542 while (--fmtcnt >= 0) { 13543 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13544 switch (c) { 13545 case '-': flags |= F_LJUST; continue; 13546 case '+': flags |= F_SIGN; continue; 13547 case ' ': flags |= F_BLANK; continue; 13548 case '#': flags |= F_ALT; continue; 13549 case '0': flags |= F_ZERO; continue; 13550 } 13551 break; 13552 } 13553 if (c == '*') { 13554 v = getnextarg(args, arglen, &argidx); 13555 if (v == NULL) 13556 goto onError; 13557 if (!PyLong_Check(v)) { 13558 PyErr_SetString(PyExc_TypeError, 13559 "* wants int"); 13560 goto onError; 13561 } 13562 width = PyLong_AsLong(v); 13563 if (width == -1 && PyErr_Occurred()) 13564 goto onError; 13565 if (width < 0) { 13566 flags |= F_LJUST; 13567 width = -width; 13568 } 13569 if (--fmtcnt >= 0) 13570 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13571 } 13572 else if (c >= '0' && c <= '9') { 13573 width = c - '0'; 13574 while (--fmtcnt >= 0) { 13575 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13576 if (c < '0' || c > '9') 13577 break; 13578 /* Since c is unsigned, the RHS would end up as unsigned, 13579 mixing signed and unsigned comparison. Since c is between 13580 '0' and '9', casting to int is safe. */ 13581 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) { 13582 PyErr_SetString(PyExc_ValueError, 13583 "width too big"); 13584 goto onError; 13585 } 13586 width = width*10 + (c - '0'); 13587 } 13588 } 13589 if (c == '.') { 13590 prec = 0; 13591 if (--fmtcnt >= 0) 13592 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13593 if (c == '*') { 13594 v = getnextarg(args, arglen, &argidx); 13595 if (v == NULL) 13596 goto onError; 13597 if (!PyLong_Check(v)) { 13598 PyErr_SetString(PyExc_TypeError, 13599 "* wants int"); 13600 goto onError; 13601 } 13602 prec = PyLong_AsLong(v); 13603 if (prec == -1 && PyErr_Occurred()) 13604 goto onError; 13605 if (prec < 0) 13606 prec = 0; 13607 if (--fmtcnt >= 0) 13608 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13609 } 13610 else if (c >= '0' && c <= '9') { 13611 prec = c - '0'; 13612 while (--fmtcnt >= 0) { 13613 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13614 if (c < '0' || c > '9') 13615 break; 13616 if (prec > (INT_MAX - ((int)c - '0')) / 10) { 13617 PyErr_SetString(PyExc_ValueError, 13618 "prec too big"); 13619 goto onError; 13620 } 13621 prec = prec*10 + (c - '0'); 13622 } 13623 } 13624 } /* prec */ 13625 if (fmtcnt >= 0) { 13626 if (c == 'h' || c == 'l' || c == 'L') { 13627 if (--fmtcnt >= 0) 13628 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13629 } 13630 } 13631 if (fmtcnt < 0) { 13632 PyErr_SetString(PyExc_ValueError, 13633 "incomplete format"); 13634 goto onError; 13635 } 13636 if (fmtcnt == 0) 13637 writer.overallocate = 0; 13638 13639 if (c == '%') { 13640 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1) 13641 goto onError; 13642 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%'); 13643 writer.pos += 1; 13644 continue; 13645 } 13646 13647 v = getnextarg(args, arglen, &argidx); 13648 if (v == NULL) 13649 goto onError; 13650 13651 sign = 0; 13652 signchar = '\0'; 13653 fill = ' '; 13654 switch (c) { 13655 13656 case 's': 13657 case 'r': 13658 case 'a': 13659 if (PyLong_CheckExact(v) && width == -1 && prec == -1) { 13660 /* Fast path */ 13661 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1) 13662 goto onError; 13663 goto nextarg; 13664 } 13665 13666 if (PyUnicode_CheckExact(v) && c == 's') { 13667 temp = v; 13668 Py_INCREF(temp); 13669 } 13670 else { 13671 if (c == 's') 13672 temp = PyObject_Str(v); 13673 else if (c == 'r') 13674 temp = PyObject_Repr(v); 13675 else 13676 temp = PyObject_ASCII(v); 13677 } 13678 break; 13679 13680 case 'i': 13681 case 'd': 13682 case 'u': 13683 case 'o': 13684 case 'x': 13685 case 'X': 13686 if (PyLong_CheckExact(v) 13687 && width == -1 && prec == -1 13688 && !(flags & (F_SIGN | F_BLANK))) 13689 { 13690 /* Fast path */ 13691 switch(c) 13692 { 13693 case 'd': 13694 case 'i': 13695 case 'u': 13696 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1) 13697 goto onError; 13698 goto nextarg; 13699 case 'x': 13700 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1) 13701 goto onError; 13702 goto nextarg; 13703 case 'o': 13704 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1) 13705 goto onError; 13706 goto nextarg; 13707 default: 13708 break; 13709 } 13710 } 13711 13712 isnumok = 0; 13713 if (PyNumber_Check(v)) { 13714 PyObject *iobj=NULL; 13715 13716 if (PyLong_Check(v)) { 13717 iobj = v; 13718 Py_INCREF(iobj); 13719 } 13720 else { 13721 iobj = PyNumber_Long(v); 13722 } 13723 if (iobj!=NULL) { 13724 if (PyLong_Check(iobj)) { 13725 isnumok = 1; 13726 sign = 1; 13727 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c)); 13728 Py_DECREF(iobj); 13729 } 13730 else { 13731 Py_DECREF(iobj); 13732 } 13733 } 13734 } 13735 if (!isnumok) { 13736 PyErr_Format(PyExc_TypeError, 13737 "%%%c format: a number is required, " 13738 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 13739 goto onError; 13740 } 13741 if (flags & F_ZERO) 13742 fill = '0'; 13743 break; 13744 13745 case 'e': 13746 case 'E': 13747 case 'f': 13748 case 'F': 13749 case 'g': 13750 case 'G': 13751 if (width == -1 && prec == -1 13752 && !(flags & (F_SIGN | F_BLANK))) 13753 { 13754 /* Fast path */ 13755 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1) 13756 goto onError; 13757 goto nextarg; 13758 } 13759 13760 sign = 1; 13761 if (flags & F_ZERO) 13762 fill = '0'; 13763 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1) 13764 temp = NULL; 13765 break; 13766 13767 case 'c': 13768 { 13769 Py_UCS4 ch = formatchar(v); 13770 if (ch == (Py_UCS4) -1) 13771 goto onError; 13772 if (width == -1 && prec == -1) { 13773 /* Fast path */ 13774 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) 13775 goto onError; 13776 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); 13777 writer.pos += 1; 13778 goto nextarg; 13779 } 13780 temp = PyUnicode_FromOrdinal(ch); 13781 break; 13782 } 13783 13784 default: 13785 PyErr_Format(PyExc_ValueError, 13786 "unsupported format character '%c' (0x%x) " 13787 "at index %zd", 13788 (31<=c && c<=126) ? (char)c : '?', 13789 (int)c, 13790 fmtpos - 1); 13791 goto onError; 13792 } 13793 if (temp == NULL) 13794 goto onError; 13795 assert (PyUnicode_Check(temp)); 13796 13797 if (width == -1 && prec == -1 13798 && !(flags & (F_SIGN | F_BLANK))) 13799 { 13800 /* Fast path */ 13801 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1) 13802 goto onError; 13803 goto nextarg; 13804 } 13805 13806 if (PyUnicode_READY(temp) == -1) { 13807 Py_CLEAR(temp); 13808 goto onError; 13809 } 13810 kind = PyUnicode_KIND(temp); 13811 pbuf = PyUnicode_DATA(temp); 13812 len = PyUnicode_GET_LENGTH(temp); 13813 13814 if (c == 's' || c == 'r' || c == 'a') { 13815 if (prec >= 0 && len > prec) 13816 len = prec; 13817 } 13818 13819 /* pbuf is initialized here. */ 13820 pindex = 0; 13821 if (sign) { 13822 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 13823 if (ch == '-' || ch == '+') { 13824 signchar = ch; 13825 len--; 13826 pindex++; 13827 } 13828 else if (flags & F_SIGN) 13829 signchar = '+'; 13830 else if (flags & F_BLANK) 13831 signchar = ' '; 13832 else 13833 sign = 0; 13834 } 13835 if (width < len) 13836 width = len; 13837 13838 /* Compute the length and maximum character of the 13839 written characters */ 13840 bufmaxchar = 127; 13841 if (!(flags & F_LJUST)) { 13842 if (sign) { 13843 if ((width-1) > len) 13844 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); 13845 } 13846 else { 13847 if (width > len) 13848 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); 13849 } 13850 } 13851 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len); 13852 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar); 13853 13854 buflen = width; 13855 if (sign && len == width) 13856 buflen++; 13857 13858 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1) 13859 goto onError; 13860 13861 /* Write characters */ 13862 if (sign) { 13863 if (fill != ' ') { 13864 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar); 13865 writer.pos += 1; 13866 } 13867 if (width > len) 13868 width--; 13869 } 13870 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13871 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13872 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c); 13873 if (fill != ' ') { 13874 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0'); 13875 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c); 13876 writer.pos += 2; 13877 pindex += 2; 13878 } 13879 width -= 2; 13880 if (width < 0) 13881 width = 0; 13882 len -= 2; 13883 } 13884 if (width > len && !(flags & F_LJUST)) { 13885 sublen = width - len; 13886 FILL(writer.kind, writer.data, fill, writer.pos, sublen); 13887 writer.pos += sublen; 13888 width = len; 13889 } 13890 if (fill == ' ') { 13891 if (sign) { 13892 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar); 13893 writer.pos += 1; 13894 } 13895 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13896 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13897 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 13898 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0'); 13899 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c); 13900 writer.pos += 2; 13901 pindex += 2; 13902 } 13903 } 13904 13905 if (len) { 13906 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos, 13907 temp, pindex, len); 13908 writer.pos += len; 13909 } 13910 if (width > len) { 13911 sublen = width - len; 13912 FILL(writer.kind, writer.data, ' ', writer.pos, sublen); 13913 writer.pos += sublen; 13914 } 13915 13916nextarg: 13917 if (dict && (argidx < arglen) && c != '%') { 13918 PyErr_SetString(PyExc_TypeError, 13919 "not all arguments converted during string formatting"); 13920 goto onError; 13921 } 13922 Py_CLEAR(temp); 13923 } /* '%' */ 13924 } /* until end */ 13925 if (argidx < arglen && !dict) { 13926 PyErr_SetString(PyExc_TypeError, 13927 "not all arguments converted during string formatting"); 13928 goto onError; 13929 } 13930 13931 if (args_owned) { 13932 Py_DECREF(args); 13933 } 13934 Py_DECREF(uformat); 13935 Py_XDECREF(temp); 13936 Py_XDECREF(second); 13937 return _PyUnicodeWriter_Finish(&writer); 13938 13939 onError: 13940 Py_DECREF(uformat); 13941 Py_XDECREF(temp); 13942 Py_XDECREF(second); 13943 _PyUnicodeWriter_Dealloc(&writer); 13944 if (args_owned) { 13945 Py_DECREF(args); 13946 } 13947 return NULL; 13948} 13949 13950static PyObject * 13951unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 13952 13953static PyObject * 13954unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13955{ 13956 PyObject *x = NULL; 13957 static char *kwlist[] = {"object", "encoding", "errors", 0}; 13958 char *encoding = NULL; 13959 char *errors = NULL; 13960 13961 if (type != &PyUnicode_Type) 13962 return unicode_subtype_new(type, args, kwds); 13963 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 13964 kwlist, &x, &encoding, &errors)) 13965 return NULL; 13966 if (x == NULL) { 13967 Py_INCREF(unicode_empty); 13968 return unicode_empty; 13969 } 13970 if (encoding == NULL && errors == NULL) 13971 return PyObject_Str(x); 13972 else 13973 return PyUnicode_FromEncodedObject(x, encoding, errors); 13974} 13975 13976static PyObject * 13977unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13978{ 13979 PyObject *unicode, *self; 13980 Py_ssize_t length, char_size; 13981 int share_wstr, share_utf8; 13982 unsigned int kind; 13983 void *data; 13984 13985 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 13986 13987 unicode = unicode_new(&PyUnicode_Type, args, kwds); 13988 if (unicode == NULL) 13989 return NULL; 13990 assert(_PyUnicode_CHECK(unicode)); 13991 if (PyUnicode_READY(unicode) == -1) { 13992 Py_DECREF(unicode); 13993 return NULL; 13994 } 13995 13996 self = type->tp_alloc(type, 0); 13997 if (self == NULL) { 13998 Py_DECREF(unicode); 13999 return NULL; 14000 } 14001 kind = PyUnicode_KIND(unicode); 14002 length = PyUnicode_GET_LENGTH(unicode); 14003 14004 _PyUnicode_LENGTH(self) = length; 14005#ifdef Py_DEBUG 14006 _PyUnicode_HASH(self) = -1; 14007#else 14008 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14009#endif 14010 _PyUnicode_STATE(self).interned = 0; 14011 _PyUnicode_STATE(self).kind = kind; 14012 _PyUnicode_STATE(self).compact = 0; 14013 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14014 _PyUnicode_STATE(self).ready = 1; 14015 _PyUnicode_WSTR(self) = NULL; 14016 _PyUnicode_UTF8_LENGTH(self) = 0; 14017 _PyUnicode_UTF8(self) = NULL; 14018 _PyUnicode_WSTR_LENGTH(self) = 0; 14019 _PyUnicode_DATA_ANY(self) = NULL; 14020 14021 share_utf8 = 0; 14022 share_wstr = 0; 14023 if (kind == PyUnicode_1BYTE_KIND) { 14024 char_size = 1; 14025 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14026 share_utf8 = 1; 14027 } 14028 else if (kind == PyUnicode_2BYTE_KIND) { 14029 char_size = 2; 14030 if (sizeof(wchar_t) == 2) 14031 share_wstr = 1; 14032 } 14033 else { 14034 assert(kind == PyUnicode_4BYTE_KIND); 14035 char_size = 4; 14036 if (sizeof(wchar_t) == 4) 14037 share_wstr = 1; 14038 } 14039 14040 /* Ensure we won't overflow the length. */ 14041 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14042 PyErr_NoMemory(); 14043 goto onError; 14044 } 14045 data = PyObject_MALLOC((length + 1) * char_size); 14046 if (data == NULL) { 14047 PyErr_NoMemory(); 14048 goto onError; 14049 } 14050 14051 _PyUnicode_DATA_ANY(self) = data; 14052 if (share_utf8) { 14053 _PyUnicode_UTF8_LENGTH(self) = length; 14054 _PyUnicode_UTF8(self) = data; 14055 } 14056 if (share_wstr) { 14057 _PyUnicode_WSTR_LENGTH(self) = length; 14058 _PyUnicode_WSTR(self) = (wchar_t *)data; 14059 } 14060 14061 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14062 kind * (length + 1)); 14063 assert(_PyUnicode_CheckConsistency(self, 1)); 14064#ifdef Py_DEBUG 14065 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14066#endif 14067 Py_DECREF(unicode); 14068 return self; 14069 14070onError: 14071 Py_DECREF(unicode); 14072 Py_DECREF(self); 14073 return NULL; 14074} 14075 14076PyDoc_STRVAR(unicode_doc, 14077"str(object='') -> str\n\ 14078str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 14079\n\ 14080Create a new string object from the given object. If encoding or\n\ 14081errors is specified, then the object must expose a data buffer\n\ 14082that will be decoded using the given encoding and error handler.\n\ 14083Otherwise, returns the result of object.__str__() (if defined)\n\ 14084or repr(object).\n\ 14085encoding defaults to sys.getdefaultencoding().\n\ 14086errors defaults to 'strict'."); 14087 14088static PyObject *unicode_iter(PyObject *seq); 14089 14090PyTypeObject PyUnicode_Type = { 14091 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14092 "str", /* tp_name */ 14093 sizeof(PyUnicodeObject), /* tp_size */ 14094 0, /* tp_itemsize */ 14095 /* Slots */ 14096 (destructor)unicode_dealloc, /* tp_dealloc */ 14097 0, /* tp_print */ 14098 0, /* tp_getattr */ 14099 0, /* tp_setattr */ 14100 0, /* tp_reserved */ 14101 unicode_repr, /* tp_repr */ 14102 &unicode_as_number, /* tp_as_number */ 14103 &unicode_as_sequence, /* tp_as_sequence */ 14104 &unicode_as_mapping, /* tp_as_mapping */ 14105 (hashfunc) unicode_hash, /* tp_hash*/ 14106 0, /* tp_call*/ 14107 (reprfunc) unicode_str, /* tp_str */ 14108 PyObject_GenericGetAttr, /* tp_getattro */ 14109 0, /* tp_setattro */ 14110 0, /* tp_as_buffer */ 14111 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14112 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14113 unicode_doc, /* tp_doc */ 14114 0, /* tp_traverse */ 14115 0, /* tp_clear */ 14116 PyUnicode_RichCompare, /* tp_richcompare */ 14117 0, /* tp_weaklistoffset */ 14118 unicode_iter, /* tp_iter */ 14119 0, /* tp_iternext */ 14120 unicode_methods, /* tp_methods */ 14121 0, /* tp_members */ 14122 0, /* tp_getset */ 14123 &PyBaseObject_Type, /* tp_base */ 14124 0, /* tp_dict */ 14125 0, /* tp_descr_get */ 14126 0, /* tp_descr_set */ 14127 0, /* tp_dictoffset */ 14128 0, /* tp_init */ 14129 0, /* tp_alloc */ 14130 unicode_new, /* tp_new */ 14131 PyObject_Del, /* tp_free */ 14132}; 14133 14134/* Initialize the Unicode implementation */ 14135 14136int _PyUnicode_Init(void) 14137{ 14138 int i; 14139 14140 /* XXX - move this array to unicodectype.c ? */ 14141 Py_UCS2 linebreak[] = { 14142 0x000A, /* LINE FEED */ 14143 0x000D, /* CARRIAGE RETURN */ 14144 0x001C, /* FILE SEPARATOR */ 14145 0x001D, /* GROUP SEPARATOR */ 14146 0x001E, /* RECORD SEPARATOR */ 14147 0x0085, /* NEXT LINE */ 14148 0x2028, /* LINE SEPARATOR */ 14149 0x2029, /* PARAGRAPH SEPARATOR */ 14150 }; 14151 14152 /* Init the implementation */ 14153 unicode_empty = PyUnicode_New(0, 0); 14154 if (!unicode_empty) 14155 Py_FatalError("Can't create empty string"); 14156 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); 14157 14158 for (i = 0; i < 256; i++) 14159 unicode_latin1[i] = NULL; 14160 if (PyType_Ready(&PyUnicode_Type) < 0) 14161 Py_FatalError("Can't initialize 'unicode'"); 14162 14163 /* initialize the linebreak bloom filter */ 14164 bloom_linebreak = make_bloom_mask( 14165 PyUnicode_2BYTE_KIND, linebreak, 14166 Py_ARRAY_LENGTH(linebreak)); 14167 14168 PyType_Ready(&EncodingMapType); 14169 14170 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 14171 Py_FatalError("Can't initialize field name iterator type"); 14172 14173 if (PyType_Ready(&PyFormatterIter_Type) < 0) 14174 Py_FatalError("Can't initialize formatter iter type"); 14175 14176#ifdef HAVE_MBCS 14177 winver.dwOSVersionInfoSize = sizeof(winver); 14178 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 14179 PyErr_SetFromWindowsErr(0); 14180 return -1; 14181 } 14182#endif 14183 return 0; 14184} 14185 14186/* Finalize the Unicode implementation */ 14187 14188int 14189PyUnicode_ClearFreeList(void) 14190{ 14191 return 0; 14192} 14193 14194void 14195_PyUnicode_Fini(void) 14196{ 14197 int i; 14198 14199 Py_XDECREF(unicode_empty); 14200 unicode_empty = NULL; 14201 14202 for (i = 0; i < 256; i++) { 14203 if (unicode_latin1[i]) { 14204 Py_DECREF(unicode_latin1[i]); 14205 unicode_latin1[i] = NULL; 14206 } 14207 } 14208 _PyUnicode_ClearStaticStrings(); 14209 (void)PyUnicode_ClearFreeList(); 14210} 14211 14212void 14213PyUnicode_InternInPlace(PyObject **p) 14214{ 14215 register PyObject *s = *p; 14216 PyObject *t; 14217#ifdef Py_DEBUG 14218 assert(s != NULL); 14219 assert(_PyUnicode_CHECK(s)); 14220#else 14221 if (s == NULL || !PyUnicode_Check(s)) 14222 return; 14223#endif 14224 /* If it's a subclass, we don't really know what putting 14225 it in the interned dict might do. */ 14226 if (!PyUnicode_CheckExact(s)) 14227 return; 14228 if (PyUnicode_CHECK_INTERNED(s)) 14229 return; 14230 if (interned == NULL) { 14231 interned = PyDict_New(); 14232 if (interned == NULL) { 14233 PyErr_Clear(); /* Don't leave an exception */ 14234 return; 14235 } 14236 } 14237 /* It might be that the GetItem call fails even 14238 though the key is present in the dictionary, 14239 namely when this happens during a stack overflow. */ 14240 Py_ALLOW_RECURSION 14241 t = PyDict_GetItem(interned, s); 14242 Py_END_ALLOW_RECURSION 14243 14244 if (t) { 14245 Py_INCREF(t); 14246 Py_DECREF(*p); 14247 *p = t; 14248 return; 14249 } 14250 14251 PyThreadState_GET()->recursion_critical = 1; 14252 if (PyDict_SetItem(interned, s, s) < 0) { 14253 PyErr_Clear(); 14254 PyThreadState_GET()->recursion_critical = 0; 14255 return; 14256 } 14257 PyThreadState_GET()->recursion_critical = 0; 14258 /* The two references in interned are not counted by refcnt. 14259 The deallocator will take care of this */ 14260 Py_REFCNT(s) -= 2; 14261 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 14262} 14263 14264void 14265PyUnicode_InternImmortal(PyObject **p) 14266{ 14267 PyUnicode_InternInPlace(p); 14268 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 14269 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 14270 Py_INCREF(*p); 14271 } 14272} 14273 14274PyObject * 14275PyUnicode_InternFromString(const char *cp) 14276{ 14277 PyObject *s = PyUnicode_FromString(cp); 14278 if (s == NULL) 14279 return NULL; 14280 PyUnicode_InternInPlace(&s); 14281 return s; 14282} 14283 14284void 14285_Py_ReleaseInternedUnicodeStrings(void) 14286{ 14287 PyObject *keys; 14288 PyObject *s; 14289 Py_ssize_t i, n; 14290 Py_ssize_t immortal_size = 0, mortal_size = 0; 14291 14292 if (interned == NULL || !PyDict_Check(interned)) 14293 return; 14294 keys = PyDict_Keys(interned); 14295 if (keys == NULL || !PyList_Check(keys)) { 14296 PyErr_Clear(); 14297 return; 14298 } 14299 14300 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 14301 detector, interned unicode strings are not forcibly deallocated; 14302 rather, we give them their stolen references back, and then clear 14303 and DECREF the interned dict. */ 14304 14305 n = PyList_GET_SIZE(keys); 14306 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 14307 n); 14308 for (i = 0; i < n; i++) { 14309 s = PyList_GET_ITEM(keys, i); 14310 if (PyUnicode_READY(s) == -1) { 14311 assert(0 && "could not ready string"); 14312 fprintf(stderr, "could not ready string\n"); 14313 } 14314 switch (PyUnicode_CHECK_INTERNED(s)) { 14315 case SSTATE_NOT_INTERNED: 14316 /* XXX Shouldn't happen */ 14317 break; 14318 case SSTATE_INTERNED_IMMORTAL: 14319 Py_REFCNT(s) += 1; 14320 immortal_size += PyUnicode_GET_LENGTH(s); 14321 break; 14322 case SSTATE_INTERNED_MORTAL: 14323 Py_REFCNT(s) += 2; 14324 mortal_size += PyUnicode_GET_LENGTH(s); 14325 break; 14326 default: 14327 Py_FatalError("Inconsistent interned string state."); 14328 } 14329 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 14330 } 14331 fprintf(stderr, "total size of all interned strings: " 14332 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 14333 "mortal/immortal\n", mortal_size, immortal_size); 14334 Py_DECREF(keys); 14335 PyDict_Clear(interned); 14336 Py_DECREF(interned); 14337 interned = NULL; 14338} 14339 14340 14341/********************* Unicode Iterator **************************/ 14342 14343typedef struct { 14344 PyObject_HEAD 14345 Py_ssize_t it_index; 14346 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 14347} unicodeiterobject; 14348 14349static void 14350unicodeiter_dealloc(unicodeiterobject *it) 14351{ 14352 _PyObject_GC_UNTRACK(it); 14353 Py_XDECREF(it->it_seq); 14354 PyObject_GC_Del(it); 14355} 14356 14357static int 14358unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 14359{ 14360 Py_VISIT(it->it_seq); 14361 return 0; 14362} 14363 14364static PyObject * 14365unicodeiter_next(unicodeiterobject *it) 14366{ 14367 PyObject *seq, *item; 14368 14369 assert(it != NULL); 14370 seq = it->it_seq; 14371 if (seq == NULL) 14372 return NULL; 14373 assert(_PyUnicode_CHECK(seq)); 14374 14375 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 14376 int kind = PyUnicode_KIND(seq); 14377 void *data = PyUnicode_DATA(seq); 14378 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 14379 item = PyUnicode_FromOrdinal(chr); 14380 if (item != NULL) 14381 ++it->it_index; 14382 return item; 14383 } 14384 14385 Py_DECREF(seq); 14386 it->it_seq = NULL; 14387 return NULL; 14388} 14389 14390static PyObject * 14391unicodeiter_len(unicodeiterobject *it) 14392{ 14393 Py_ssize_t len = 0; 14394 if (it->it_seq) 14395 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 14396 return PyLong_FromSsize_t(len); 14397} 14398 14399PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 14400 14401static PyObject * 14402unicodeiter_reduce(unicodeiterobject *it) 14403{ 14404 if (it->it_seq != NULL) { 14405 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 14406 it->it_seq, it->it_index); 14407 } else { 14408 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 14409 if (u == NULL) 14410 return NULL; 14411 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 14412 } 14413} 14414 14415PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 14416 14417static PyObject * 14418unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 14419{ 14420 Py_ssize_t index = PyLong_AsSsize_t(state); 14421 if (index == -1 && PyErr_Occurred()) 14422 return NULL; 14423 if (index < 0) 14424 index = 0; 14425 it->it_index = index; 14426 Py_RETURN_NONE; 14427} 14428 14429PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 14430 14431static PyMethodDef unicodeiter_methods[] = { 14432 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 14433 length_hint_doc}, 14434 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 14435 reduce_doc}, 14436 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 14437 setstate_doc}, 14438 {NULL, NULL} /* sentinel */ 14439}; 14440 14441PyTypeObject PyUnicodeIter_Type = { 14442 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14443 "str_iterator", /* tp_name */ 14444 sizeof(unicodeiterobject), /* tp_basicsize */ 14445 0, /* tp_itemsize */ 14446 /* methods */ 14447 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 14448 0, /* tp_print */ 14449 0, /* tp_getattr */ 14450 0, /* tp_setattr */ 14451 0, /* tp_reserved */ 14452 0, /* tp_repr */ 14453 0, /* tp_as_number */ 14454 0, /* tp_as_sequence */ 14455 0, /* tp_as_mapping */ 14456 0, /* tp_hash */ 14457 0, /* tp_call */ 14458 0, /* tp_str */ 14459 PyObject_GenericGetAttr, /* tp_getattro */ 14460 0, /* tp_setattro */ 14461 0, /* tp_as_buffer */ 14462 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 14463 0, /* tp_doc */ 14464 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 14465 0, /* tp_clear */ 14466 0, /* tp_richcompare */ 14467 0, /* tp_weaklistoffset */ 14468 PyObject_SelfIter, /* tp_iter */ 14469 (iternextfunc)unicodeiter_next, /* tp_iternext */ 14470 unicodeiter_methods, /* tp_methods */ 14471 0, 14472}; 14473 14474static PyObject * 14475unicode_iter(PyObject *seq) 14476{ 14477 unicodeiterobject *it; 14478 14479 if (!PyUnicode_Check(seq)) { 14480 PyErr_BadInternalCall(); 14481 return NULL; 14482 } 14483 if (PyUnicode_READY(seq) == -1) 14484 return NULL; 14485 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 14486 if (it == NULL) 14487 return NULL; 14488 it->it_index = 0; 14489 Py_INCREF(seq); 14490 it->it_seq = seq; 14491 _PyObject_GC_TRACK(it); 14492 return (PyObject *)it; 14493} 14494 14495 14496size_t 14497Py_UNICODE_strlen(const Py_UNICODE *u) 14498{ 14499 int res = 0; 14500 while(*u++) 14501 res++; 14502 return res; 14503} 14504 14505Py_UNICODE* 14506Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 14507{ 14508 Py_UNICODE *u = s1; 14509 while ((*u++ = *s2++)); 14510 return s1; 14511} 14512 14513Py_UNICODE* 14514Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14515{ 14516 Py_UNICODE *u = s1; 14517 while ((*u++ = *s2++)) 14518 if (n-- == 0) 14519 break; 14520 return s1; 14521} 14522 14523Py_UNICODE* 14524Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 14525{ 14526 Py_UNICODE *u1 = s1; 14527 u1 += Py_UNICODE_strlen(u1); 14528 Py_UNICODE_strcpy(u1, s2); 14529 return s1; 14530} 14531 14532int 14533Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 14534{ 14535 while (*s1 && *s2 && *s1 == *s2) 14536 s1++, s2++; 14537 if (*s1 && *s2) 14538 return (*s1 < *s2) ? -1 : +1; 14539 if (*s1) 14540 return 1; 14541 if (*s2) 14542 return -1; 14543 return 0; 14544} 14545 14546int 14547Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14548{ 14549 register Py_UNICODE u1, u2; 14550 for (; n != 0; n--) { 14551 u1 = *s1; 14552 u2 = *s2; 14553 if (u1 != u2) 14554 return (u1 < u2) ? -1 : +1; 14555 if (u1 == '\0') 14556 return 0; 14557 s1++; 14558 s2++; 14559 } 14560 return 0; 14561} 14562 14563Py_UNICODE* 14564Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 14565{ 14566 const Py_UNICODE *p; 14567 for (p = s; *p; p++) 14568 if (*p == c) 14569 return (Py_UNICODE*)p; 14570 return NULL; 14571} 14572 14573Py_UNICODE* 14574Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 14575{ 14576 const Py_UNICODE *p; 14577 p = s + Py_UNICODE_strlen(s); 14578 while (p != s) { 14579 p--; 14580 if (*p == c) 14581 return (Py_UNICODE*)p; 14582 } 14583 return NULL; 14584} 14585 14586Py_UNICODE* 14587PyUnicode_AsUnicodeCopy(PyObject *unicode) 14588{ 14589 Py_UNICODE *u, *copy; 14590 Py_ssize_t len, size; 14591 14592 if (!PyUnicode_Check(unicode)) { 14593 PyErr_BadArgument(); 14594 return NULL; 14595 } 14596 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 14597 if (u == NULL) 14598 return NULL; 14599 /* Ensure we won't overflow the size. */ 14600 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 14601 PyErr_NoMemory(); 14602 return NULL; 14603 } 14604 size = len + 1; /* copy the null character */ 14605 size *= sizeof(Py_UNICODE); 14606 copy = PyMem_Malloc(size); 14607 if (copy == NULL) { 14608 PyErr_NoMemory(); 14609 return NULL; 14610 } 14611 memcpy(copy, u, size); 14612 return copy; 14613} 14614 14615/* A _string module, to export formatter_parser and formatter_field_name_split 14616 to the string.Formatter class implemented in Python. */ 14617 14618static PyMethodDef _string_methods[] = { 14619 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 14620 METH_O, PyDoc_STR("split the argument as a field name")}, 14621 {"formatter_parser", (PyCFunction) formatter_parser, 14622 METH_O, PyDoc_STR("parse the argument as a format string")}, 14623 {NULL, NULL} 14624}; 14625 14626static struct PyModuleDef _string_module = { 14627 PyModuleDef_HEAD_INIT, 14628 "_string", 14629 PyDoc_STR("string helper module"), 14630 0, 14631 _string_methods, 14632 NULL, 14633 NULL, 14634 NULL, 14635 NULL 14636}; 14637 14638PyMODINIT_FUNC 14639PyInit__string(void) 14640{ 14641 return PyModule_Create(&_string_module); 14642} 14643 14644 14645#ifdef __cplusplus 14646} 14647#endif 14648