unicodeobject.c revision f05e17ece9ee4cf4d04e0657e6c7c9283a233968
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/* Endianness switches; defaults to little endian */ 51 52#ifdef WORDS_BIGENDIAN 53# define BYTEORDER_IS_BIG_ENDIAN 54#else 55# define BYTEORDER_IS_LITTLE_ENDIAN 56#endif 57 58/* --- Globals ------------------------------------------------------------ 59 60 The globals are initialized by the _PyUnicode_Init() API and should 61 not be used before calling that API. 62 63*/ 64 65 66#ifdef __cplusplus 67extern "C" { 68#endif 69 70/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 71#define MAX_UNICODE 0x10ffff 72 73#ifdef Py_DEBUG 74# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 75#else 76# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 77#endif 78 79#define _PyUnicode_UTF8(op) \ 80 (((PyCompactUnicodeObject*)(op))->utf8) 81#define PyUnicode_UTF8(op) \ 82 (assert(_PyUnicode_CHECK(op)), \ 83 assert(PyUnicode_IS_READY(op)), \ 84 PyUnicode_IS_COMPACT_ASCII(op) ? \ 85 ((char*)((PyASCIIObject*)(op) + 1)) : \ 86 _PyUnicode_UTF8(op)) 87#define _PyUnicode_UTF8_LENGTH(op) \ 88 (((PyCompactUnicodeObject*)(op))->utf8_length) 89#define PyUnicode_UTF8_LENGTH(op) \ 90 (assert(_PyUnicode_CHECK(op)), \ 91 assert(PyUnicode_IS_READY(op)), \ 92 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 93 ((PyASCIIObject*)(op))->length : \ 94 _PyUnicode_UTF8_LENGTH(op)) 95#define _PyUnicode_WSTR(op) \ 96 (((PyASCIIObject*)(op))->wstr) 97#define _PyUnicode_WSTR_LENGTH(op) \ 98 (((PyCompactUnicodeObject*)(op))->wstr_length) 99#define _PyUnicode_LENGTH(op) \ 100 (((PyASCIIObject *)(op))->length) 101#define _PyUnicode_STATE(op) \ 102 (((PyASCIIObject *)(op))->state) 103#define _PyUnicode_HASH(op) \ 104 (((PyASCIIObject *)(op))->hash) 105#define _PyUnicode_KIND(op) \ 106 (assert(_PyUnicode_CHECK(op)), \ 107 ((PyASCIIObject *)(op))->state.kind) 108#define _PyUnicode_GET_LENGTH(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 ((PyASCIIObject *)(op))->length) 111#define _PyUnicode_DATA_ANY(op) \ 112 (((PyUnicodeObject*)(op))->data.any) 113 114/* Optimized version of Py_MAX() to compute the maximum character: 115 use it when your are computing the second argument of PyUnicode_New() */ 116#define MAX_MAXCHAR(maxchar1, maxchar2) \ 117 ((maxchar1) | (maxchar2)) 118 119#undef PyUnicode_READY 120#define PyUnicode_READY(op) \ 121 (assert(_PyUnicode_CHECK(op)), \ 122 (PyUnicode_IS_READY(op) ? 
\ 123 0 : \ 124 _PyUnicode_Ready(op))) 125 126#define _PyUnicode_SHARE_UTF8(op) \ 127 (assert(_PyUnicode_CHECK(op)), \ 128 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 129 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 130#define _PyUnicode_SHARE_WSTR(op) \ 131 (assert(_PyUnicode_CHECK(op)), \ 132 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 133 134/* true if the Unicode object has an allocated UTF-8 memory block 135 (not shared with other data) */ 136#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 137 (assert(_PyUnicode_CHECK(op)), \ 138 (!PyUnicode_IS_COMPACT_ASCII(op) \ 139 && _PyUnicode_UTF8(op) \ 140 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 141 142/* true if the Unicode object has an allocated wstr memory block 143 (not shared with other data) */ 144#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 145 (assert(_PyUnicode_CHECK(op)), \ 146 (_PyUnicode_WSTR(op) && \ 147 (!PyUnicode_IS_READY(op) || \ 148 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 149 150/* Generic helper macro to convert characters of different types. 151 from_type and to_type have to be valid type names, begin and end 152 are pointers to the source characters which should be of type 153 "from_type *". to is a pointer of type "to_type *" and points to the 154 buffer where the result characters are written to. */ 155#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 156 do { \ 157 to_type *_to = (to_type *) to; \ 158 const from_type *_iter = (begin); \ 159 const from_type *_end = (end); \ 160 Py_ssize_t n = (_end) - (_iter); \ 161 const from_type *_unrolled_end = \ 162 _iter + (n & ~ (Py_ssize_t) 3); \ 163 while (_iter < (_unrolled_end)) { \ 164 _to[0] = (to_type) _iter[0]; \ 165 _to[1] = (to_type) _iter[1]; \ 166 _to[2] = (to_type) _iter[2]; \ 167 _to[3] = (to_type) _iter[3]; \ 168 _iter += 4; _to += 4; \ 169 } \ 170 while (_iter < (_end)) \ 171 *_to++ = (to_type) *_iter++; \ 172 } while (0) 173 174/* This dictionary holds all interned unicode strings. 
Note that references 175 to strings in this dictionary are *not* counted in the string's ob_refcnt. 176 When the interned string reaches a refcnt of 0 the string deallocation 177 function will delete the reference from this dictionary. 178 179 Another way to look at this is that to say that the actual reference 180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 181*/ 182static PyObject *interned; 183 184/* The empty Unicode object is shared to improve performance. */ 185static PyObject *unicode_empty; 186 187/* List of static strings. */ 188static _Py_Identifier *static_strings; 189 190/* Single character Unicode strings in the Latin-1 range are being 191 shared as well. */ 192static PyObject *unicode_latin1[256]; 193 194/* Fast detection of the most frequent whitespace characters */ 195const unsigned char _Py_ascii_whitespace[] = { 196 0, 0, 0, 0, 0, 0, 0, 0, 197/* case 0x0009: * CHARACTER TABULATION */ 198/* case 0x000A: * LINE FEED */ 199/* case 0x000B: * LINE TABULATION */ 200/* case 0x000C: * FORM FEED */ 201/* case 0x000D: * CARRIAGE RETURN */ 202 0, 1, 1, 1, 1, 1, 0, 0, 203 0, 0, 0, 0, 0, 0, 0, 0, 204/* case 0x001C: * FILE SEPARATOR */ 205/* case 0x001D: * GROUP SEPARATOR */ 206/* case 0x001E: * RECORD SEPARATOR */ 207/* case 0x001F: * UNIT SEPARATOR */ 208 0, 0, 0, 0, 1, 1, 1, 1, 209/* case 0x0020: * SPACE */ 210 1, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0, 212 0, 0, 0, 0, 0, 0, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0 223}; 224 225/* forward */ 226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 227static PyObject* get_latin1_char(unsigned char ch); 228static int unicode_modifiable(PyObject *unicode); 229 230 231static PyObject * 232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); 233static PyObject * 
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 235static PyObject * 236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 237 238static PyObject * 239unicode_encode_call_errorhandler(const char *errors, 240 PyObject **errorHandler,const char *encoding, const char *reason, 241 PyObject *unicode, PyObject **exceptionObject, 242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 243 244static void 245raise_encode_exception(PyObject **exceptionObject, 246 const char *encoding, 247 PyObject *unicode, 248 Py_ssize_t startpos, Py_ssize_t endpos, 249 const char *reason); 250 251/* Same for linebreaks */ 252static unsigned char ascii_linebreak[] = { 253 0, 0, 0, 0, 0, 0, 0, 0, 254/* 0x000A, * LINE FEED */ 255/* 0x000B, * LINE TABULATION */ 256/* 0x000C, * FORM FEED */ 257/* 0x000D, * CARRIAGE RETURN */ 258 0, 0, 1, 1, 1, 1, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 260/* 0x001C, * FILE SEPARATOR */ 261/* 0x001D, * GROUP SEPARATOR */ 262/* 0x001E, * RECORD SEPARATOR */ 263 0, 0, 0, 0, 1, 1, 1, 0, 264 0, 0, 0, 0, 0, 0, 0, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 269 0, 0, 0, 0, 0, 0, 0, 0, 270 0, 0, 0, 0, 0, 0, 0, 0, 271 0, 0, 0, 0, 0, 0, 0, 0, 272 0, 0, 0, 0, 0, 0, 0, 0, 273 0, 0, 0, 0, 0, 0, 0, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275 0, 0, 0, 0, 0, 0, 0, 0, 276 0, 0, 0, 0, 0, 0, 0, 0 277}; 278 279/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 280 This function is kept for backward compatibility with the old API. */ 281Py_UNICODE 282PyUnicode_GetMax(void) 283{ 284#ifdef Py_UNICODE_WIDE 285 return 0x10FFFF; 286#else 287 /* This is actually an illegal character, so it should 288 not be passed to unichr. 
*/ 289 return 0xFFFF; 290#endif 291} 292 293#ifdef Py_DEBUG 294int 295_PyUnicode_CheckConsistency(PyObject *op, int check_content) 296{ 297 PyASCIIObject *ascii; 298 unsigned int kind; 299 300 assert(PyUnicode_Check(op)); 301 302 ascii = (PyASCIIObject *)op; 303 kind = ascii->state.kind; 304 305 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 306 assert(kind == PyUnicode_1BYTE_KIND); 307 assert(ascii->state.ready == 1); 308 } 309 else { 310 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 311 void *data; 312 313 if (ascii->state.compact == 1) { 314 data = compact + 1; 315 assert(kind == PyUnicode_1BYTE_KIND 316 || kind == PyUnicode_2BYTE_KIND 317 || kind == PyUnicode_4BYTE_KIND); 318 assert(ascii->state.ascii == 0); 319 assert(ascii->state.ready == 1); 320 assert (compact->utf8 != data); 321 } 322 else { 323 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 324 325 data = unicode->data.any; 326 if (kind == PyUnicode_WCHAR_KIND) { 327 assert(ascii->length == 0); 328 assert(ascii->hash == -1); 329 assert(ascii->state.compact == 0); 330 assert(ascii->state.ascii == 0); 331 assert(ascii->state.ready == 0); 332 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 333 assert(ascii->wstr != NULL); 334 assert(data == NULL); 335 assert(compact->utf8 == NULL); 336 } 337 else { 338 assert(kind == PyUnicode_1BYTE_KIND 339 || kind == PyUnicode_2BYTE_KIND 340 || kind == PyUnicode_4BYTE_KIND); 341 assert(ascii->state.compact == 0); 342 assert(ascii->state.ready == 1); 343 assert(data != NULL); 344 if (ascii->state.ascii) { 345 assert (compact->utf8 == data); 346 assert (compact->utf8_length == ascii->length); 347 } 348 else 349 assert (compact->utf8 != data); 350 } 351 } 352 if (kind != PyUnicode_WCHAR_KIND) { 353 if ( 354#if SIZEOF_WCHAR_T == 2 355 kind == PyUnicode_2BYTE_KIND 356#else 357 kind == PyUnicode_4BYTE_KIND 358#endif 359 ) 360 { 361 assert(ascii->wstr == data); 362 assert(compact->wstr_length == ascii->length); 363 } else 364 
assert(ascii->wstr != data); 365 } 366 367 if (compact->utf8 == NULL) 368 assert(compact->utf8_length == 0); 369 if (ascii->wstr == NULL) 370 assert(compact->wstr_length == 0); 371 } 372 /* check that the best kind is used */ 373 if (check_content && kind != PyUnicode_WCHAR_KIND) 374 { 375 Py_ssize_t i; 376 Py_UCS4 maxchar = 0; 377 void *data; 378 Py_UCS4 ch; 379 380 data = PyUnicode_DATA(ascii); 381 for (i=0; i < ascii->length; i++) 382 { 383 ch = PyUnicode_READ(kind, data, i); 384 if (ch > maxchar) 385 maxchar = ch; 386 } 387 if (kind == PyUnicode_1BYTE_KIND) { 388 if (ascii->state.ascii == 0) { 389 assert(maxchar >= 128); 390 assert(maxchar <= 255); 391 } 392 else 393 assert(maxchar < 128); 394 } 395 else if (kind == PyUnicode_2BYTE_KIND) { 396 assert(maxchar >= 0x100); 397 assert(maxchar <= 0xFFFF); 398 } 399 else { 400 assert(maxchar >= 0x10000); 401 assert(maxchar <= MAX_UNICODE); 402 } 403 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 404 } 405 return 1; 406} 407#endif 408 409static PyObject* 410unicode_result_wchar(PyObject *unicode) 411{ 412#ifndef Py_DEBUG 413 Py_ssize_t len; 414 415 assert(Py_REFCNT(unicode) == 1); 416 417 len = _PyUnicode_WSTR_LENGTH(unicode); 418 if (len == 0) { 419 Py_INCREF(unicode_empty); 420 Py_DECREF(unicode); 421 return unicode_empty; 422 } 423 424 if (len == 1) { 425 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 426 if (ch < 256) { 427 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 428 Py_DECREF(unicode); 429 return latin1_char; 430 } 431 } 432 433 if (_PyUnicode_Ready(unicode) < 0) { 434 Py_XDECREF(unicode); 435 return NULL; 436 } 437#else 438 /* don't make the result ready in debug mode to ensure that the caller 439 makes the string ready before using it */ 440 assert(_PyUnicode_CheckConsistency(unicode, 1)); 441#endif 442 return unicode; 443} 444 445static PyObject* 446unicode_result_ready(PyObject *unicode) 447{ 448 Py_ssize_t length; 449 450 length = PyUnicode_GET_LENGTH(unicode); 451 if (length == 0) { 
452 if (unicode != unicode_empty) { 453 Py_INCREF(unicode_empty); 454 Py_DECREF(unicode); 455 } 456 return unicode_empty; 457 } 458 459 if (length == 1) { 460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 461 if (ch < 256) { 462 PyObject *latin1_char = unicode_latin1[ch]; 463 if (latin1_char != NULL) { 464 if (unicode != latin1_char) { 465 Py_INCREF(latin1_char); 466 Py_DECREF(unicode); 467 } 468 return latin1_char; 469 } 470 else { 471 assert(_PyUnicode_CheckConsistency(unicode, 1)); 472 Py_INCREF(unicode); 473 unicode_latin1[ch] = unicode; 474 return unicode; 475 } 476 } 477 } 478 479 assert(_PyUnicode_CheckConsistency(unicode, 1)); 480 return unicode; 481} 482 483static PyObject* 484unicode_result(PyObject *unicode) 485{ 486 assert(_PyUnicode_CHECK(unicode)); 487 if (PyUnicode_IS_READY(unicode)) 488 return unicode_result_ready(unicode); 489 else 490 return unicode_result_wchar(unicode); 491} 492 493static PyObject* 494unicode_result_unchanged(PyObject *unicode) 495{ 496 if (PyUnicode_CheckExact(unicode)) { 497 if (PyUnicode_READY(unicode) == -1) 498 return NULL; 499 Py_INCREF(unicode); 500 return unicode; 501 } 502 else 503 /* Subtype -- return genuine unicode string with the same value. */ 504 return _PyUnicode_Copy(unicode); 505} 506 507#ifdef HAVE_MBCS 508static OSVERSIONINFOEX winver; 509#endif 510 511/* --- Bloom Filters ----------------------------------------------------- */ 512 513/* stuff to implement simple "bloom filters" for Unicode characters. 514 to keep things simple, we use a single bitmask, using the least 5 515 bits from each unicode characters as the bit index. 
*/ 516 517/* the linebreak mask is set up by Unicode_Init below */ 518 519#if LONG_BIT >= 128 520#define BLOOM_WIDTH 128 521#elif LONG_BIT >= 64 522#define BLOOM_WIDTH 64 523#elif LONG_BIT >= 32 524#define BLOOM_WIDTH 32 525#else 526#error "LONG_BIT is smaller than 32" 527#endif 528 529#define BLOOM_MASK unsigned long 530 531static BLOOM_MASK bloom_linebreak; 532 533#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 534#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 535 536#define BLOOM_LINEBREAK(ch) \ 537 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 538 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 539 540Py_LOCAL_INLINE(BLOOM_MASK) 541make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 542{ 543 /* calculate simple bloom-style bitmask for a given unicode string */ 544 545 BLOOM_MASK mask; 546 Py_ssize_t i; 547 548 mask = 0; 549 for (i = 0; i < len; i++) 550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 551 552 return mask; 553} 554 555#define BLOOM_MEMBER(mask, chr, str) \ 556 (BLOOM(mask, chr) \ 557 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 558 559/* Compilation of templated routines */ 560 561#include "stringlib/asciilib.h" 562#include "stringlib/fastsearch.h" 563#include "stringlib/partition.h" 564#include "stringlib/split.h" 565#include "stringlib/count.h" 566#include "stringlib/find.h" 567#include "stringlib/find_max_char.h" 568#include "stringlib/localeutil.h" 569#include "stringlib/undef.h" 570 571#include "stringlib/ucs1lib.h" 572#include "stringlib/fastsearch.h" 573#include "stringlib/partition.h" 574#include "stringlib/split.h" 575#include "stringlib/count.h" 576#include "stringlib/find.h" 577#include "stringlib/find_max_char.h" 578#include "stringlib/localeutil.h" 579#include "stringlib/undef.h" 580 581#include "stringlib/ucs2lib.h" 582#include "stringlib/fastsearch.h" 583#include "stringlib/partition.h" 584#include "stringlib/split.h" 585#include 
"stringlib/count.h" 586#include "stringlib/find.h" 587#include "stringlib/find_max_char.h" 588#include "stringlib/localeutil.h" 589#include "stringlib/undef.h" 590 591#include "stringlib/ucs4lib.h" 592#include "stringlib/fastsearch.h" 593#include "stringlib/partition.h" 594#include "stringlib/split.h" 595#include "stringlib/count.h" 596#include "stringlib/find.h" 597#include "stringlib/find_max_char.h" 598#include "stringlib/localeutil.h" 599#include "stringlib/undef.h" 600 601#include "stringlib/unicodedefs.h" 602#include "stringlib/fastsearch.h" 603#include "stringlib/count.h" 604#include "stringlib/find.h" 605#include "stringlib/undef.h" 606 607/* --- Unicode Object ----------------------------------------------------- */ 608 609static PyObject * 610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 611 612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 613 Py_ssize_t size, Py_UCS4 ch, 614 int direction) 615{ 616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; 617 618 switch (kind) { 619 case PyUnicode_1BYTE_KIND: 620 { 621 Py_UCS1 ch1 = (Py_UCS1) ch; 622 if (ch1 == ch) 623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 624 else 625 return -1; 626 } 627 case PyUnicode_2BYTE_KIND: 628 { 629 Py_UCS2 ch2 = (Py_UCS2) ch; 630 if (ch2 == ch) 631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 632 else 633 return -1; 634 } 635 case PyUnicode_4BYTE_KIND: 636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 637 default: 638 assert(0); 639 return -1; 640 } 641} 642 643static PyObject* 644resize_compact(PyObject *unicode, Py_ssize_t length) 645{ 646 Py_ssize_t char_size; 647 Py_ssize_t struct_size; 648 Py_ssize_t new_size; 649 int share_wstr; 650 PyObject *new_unicode; 651 assert(unicode_modifiable(unicode)); 652 assert(PyUnicode_IS_READY(unicode)); 653 assert(PyUnicode_IS_COMPACT(unicode)); 654 655 char_size = PyUnicode_KIND(unicode); 656 if (PyUnicode_IS_ASCII(unicode)) 657 struct_size = 
sizeof(PyASCIIObject); 658 else 659 struct_size = sizeof(PyCompactUnicodeObject); 660 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 661 662 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 663 PyErr_NoMemory(); 664 return NULL; 665 } 666 new_size = (struct_size + (length + 1) * char_size); 667 668 _Py_DEC_REFTOTAL; 669 _Py_ForgetReference(unicode); 670 671 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 672 if (new_unicode == NULL) { 673 _Py_NewReference(unicode); 674 PyErr_NoMemory(); 675 return NULL; 676 } 677 unicode = new_unicode; 678 _Py_NewReference(unicode); 679 680 _PyUnicode_LENGTH(unicode) = length; 681 if (share_wstr) { 682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 683 if (!PyUnicode_IS_ASCII(unicode)) 684 _PyUnicode_WSTR_LENGTH(unicode) = length; 685 } 686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 687 length, 0); 688 assert(_PyUnicode_CheckConsistency(unicode, 0)); 689 return unicode; 690} 691 692static int 693resize_inplace(PyObject *unicode, Py_ssize_t length) 694{ 695 wchar_t *wstr; 696 Py_ssize_t new_size; 697 assert(!PyUnicode_IS_COMPACT(unicode)); 698 assert(Py_REFCNT(unicode) == 1); 699 700 if (PyUnicode_IS_READY(unicode)) { 701 Py_ssize_t char_size; 702 int share_wstr, share_utf8; 703 void *data; 704 705 data = _PyUnicode_DATA_ANY(unicode); 706 char_size = PyUnicode_KIND(unicode); 707 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 708 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 709 710 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 711 PyErr_NoMemory(); 712 return -1; 713 } 714 new_size = (length + 1) * char_size; 715 716 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 717 { 718 PyObject_DEL(_PyUnicode_UTF8(unicode)); 719 _PyUnicode_UTF8(unicode) = NULL; 720 _PyUnicode_UTF8_LENGTH(unicode) = 0; 721 } 722 723 data = (PyObject *)PyObject_REALLOC(data, new_size); 724 if (data == NULL) { 725 PyErr_NoMemory(); 726 return -1; 727 } 728 _PyUnicode_DATA_ANY(unicode) = data; 
729 if (share_wstr) { 730 _PyUnicode_WSTR(unicode) = data; 731 _PyUnicode_WSTR_LENGTH(unicode) = length; 732 } 733 if (share_utf8) { 734 _PyUnicode_UTF8(unicode) = data; 735 _PyUnicode_UTF8_LENGTH(unicode) = length; 736 } 737 _PyUnicode_LENGTH(unicode) = length; 738 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 739 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 740 assert(_PyUnicode_CheckConsistency(unicode, 0)); 741 return 0; 742 } 743 } 744 assert(_PyUnicode_WSTR(unicode) != NULL); 745 746 /* check for integer overflow */ 747 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 748 PyErr_NoMemory(); 749 return -1; 750 } 751 new_size = sizeof(wchar_t) * (length + 1); 752 wstr = _PyUnicode_WSTR(unicode); 753 wstr = PyObject_REALLOC(wstr, new_size); 754 if (!wstr) { 755 PyErr_NoMemory(); 756 return -1; 757 } 758 _PyUnicode_WSTR(unicode) = wstr; 759 _PyUnicode_WSTR(unicode)[length] = 0; 760 _PyUnicode_WSTR_LENGTH(unicode) = length; 761 assert(_PyUnicode_CheckConsistency(unicode, 0)); 762 return 0; 763} 764 765static PyObject* 766resize_copy(PyObject *unicode, Py_ssize_t length) 767{ 768 Py_ssize_t copy_length; 769 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 770 PyObject *copy; 771 772 if (PyUnicode_READY(unicode) == -1) 773 return NULL; 774 775 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 776 if (copy == NULL) 777 return NULL; 778 779 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 780 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 781 return copy; 782 } 783 else { 784 PyObject *w; 785 786 w = (PyObject*)_PyUnicode_New(length); 787 if (w == NULL) 788 return NULL; 789 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 790 copy_length = Py_MIN(copy_length, length); 791 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 792 copy_length); 793 return w; 794 } 795} 796 797/* We allocate one more byte to make sure the string is 798 Ux0000 terminated; some code (e.g. 
new_identifier) 799 relies on that. 800 801 XXX This allocator could further be enhanced by assuring that the 802 free list never reduces its size below 1. 803 804*/ 805 806static PyUnicodeObject * 807_PyUnicode_New(Py_ssize_t length) 808{ 809 register PyUnicodeObject *unicode; 810 size_t new_size; 811 812 /* Optimization for empty strings */ 813 if (length == 0 && unicode_empty != NULL) { 814 Py_INCREF(unicode_empty); 815 return (PyUnicodeObject*)unicode_empty; 816 } 817 818 /* Ensure we won't overflow the size. */ 819 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 820 return (PyUnicodeObject *)PyErr_NoMemory(); 821 } 822 if (length < 0) { 823 PyErr_SetString(PyExc_SystemError, 824 "Negative size passed to _PyUnicode_New"); 825 return NULL; 826 } 827 828 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 829 if (unicode == NULL) 830 return NULL; 831 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 832 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 833 if (!_PyUnicode_WSTR(unicode)) { 834 Py_DECREF(unicode); 835 PyErr_NoMemory(); 836 return NULL; 837 } 838 839 /* Initialize the first element to guard against cases where 840 * the caller fails before initializing str -- unicode_resize() 841 * reads str[0], and the Keep-Alive optimization can keep memory 842 * allocated for str alive across a call to unicode_dealloc(unicode). 843 * We don't want unicode_resize to read uninitialized memory in 844 * that case. 
845 */ 846 _PyUnicode_WSTR(unicode)[0] = 0; 847 _PyUnicode_WSTR(unicode)[length] = 0; 848 _PyUnicode_WSTR_LENGTH(unicode) = length; 849 _PyUnicode_HASH(unicode) = -1; 850 _PyUnicode_STATE(unicode).interned = 0; 851 _PyUnicode_STATE(unicode).kind = 0; 852 _PyUnicode_STATE(unicode).compact = 0; 853 _PyUnicode_STATE(unicode).ready = 0; 854 _PyUnicode_STATE(unicode).ascii = 0; 855 _PyUnicode_DATA_ANY(unicode) = NULL; 856 _PyUnicode_LENGTH(unicode) = 0; 857 _PyUnicode_UTF8(unicode) = NULL; 858 _PyUnicode_UTF8_LENGTH(unicode) = 0; 859 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 860 return unicode; 861} 862 863static const char* 864unicode_kind_name(PyObject *unicode) 865{ 866 /* don't check consistency: unicode_kind_name() is called from 867 _PyUnicode_Dump() */ 868 if (!PyUnicode_IS_COMPACT(unicode)) 869 { 870 if (!PyUnicode_IS_READY(unicode)) 871 return "wstr"; 872 switch (PyUnicode_KIND(unicode)) 873 { 874 case PyUnicode_1BYTE_KIND: 875 if (PyUnicode_IS_ASCII(unicode)) 876 return "legacy ascii"; 877 else 878 return "legacy latin1"; 879 case PyUnicode_2BYTE_KIND: 880 return "legacy UCS2"; 881 case PyUnicode_4BYTE_KIND: 882 return "legacy UCS4"; 883 default: 884 return "<legacy invalid kind>"; 885 } 886 } 887 assert(PyUnicode_IS_READY(unicode)); 888 switch (PyUnicode_KIND(unicode)) { 889 case PyUnicode_1BYTE_KIND: 890 if (PyUnicode_IS_ASCII(unicode)) 891 return "ascii"; 892 else 893 return "latin1"; 894 case PyUnicode_2BYTE_KIND: 895 return "UCS2"; 896 case PyUnicode_4BYTE_KIND: 897 return "UCS4"; 898 default: 899 return "<invalid compact kind>"; 900 } 901} 902 903#ifdef Py_DEBUG 904/* Functions wrapping macros for use in debugger */ 905char *_PyUnicode_utf8(void *unicode){ 906 return PyUnicode_UTF8(unicode); 907} 908 909void *_PyUnicode_compact_data(void *unicode) { 910 return _PyUnicode_COMPACT_DATA(unicode); 911} 912void *_PyUnicode_data(void *unicode){ 913 printf("obj %p\n", unicode); 914 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 915 
printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 916 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 917 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 918 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 919 return PyUnicode_DATA(unicode); 920} 921 922void 923_PyUnicode_Dump(PyObject *op) 924{ 925 PyASCIIObject *ascii = (PyASCIIObject *)op; 926 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 927 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 928 void *data; 929 930 if (ascii->state.compact) 931 { 932 if (ascii->state.ascii) 933 data = (ascii + 1); 934 else 935 data = (compact + 1); 936 } 937 else 938 data = unicode->data.any; 939 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 940 941 if (ascii->wstr == data) 942 printf("shared "); 943 printf("wstr=%p", ascii->wstr); 944 945 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 946 printf(" (%zu), ", compact->wstr_length); 947 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 948 printf("shared "); 949 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 950 } 951 printf(", data=%p\n", data); 952} 953#endif 954 955PyObject * 956PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 957{ 958 PyObject *obj; 959 PyCompactUnicodeObject *unicode; 960 void *data; 961 enum PyUnicode_Kind kind; 962 int is_sharing, is_ascii; 963 Py_ssize_t char_size; 964 Py_ssize_t struct_size; 965 966 /* Optimization for empty strings */ 967 if (size == 0 && unicode_empty != NULL) { 968 Py_INCREF(unicode_empty); 969 return unicode_empty; 970 } 971 972 is_ascii = 0; 973 is_sharing = 0; 974 struct_size = sizeof(PyCompactUnicodeObject); 975 if (maxchar < 128) { 976 kind = PyUnicode_1BYTE_KIND; 977 char_size = 1; 978 is_ascii = 1; 979 struct_size = sizeof(PyASCIIObject); 980 } 981 else if (maxchar < 256) { 982 kind = PyUnicode_1BYTE_KIND; 983 char_size = 1; 984 } 985 else if (maxchar < 65536) { 986 kind 
= PyUnicode_2BYTE_KIND; 987 char_size = 2; 988 if (sizeof(wchar_t) == 2) 989 is_sharing = 1; 990 } 991 else { 992 if (maxchar > MAX_UNICODE) { 993 PyErr_SetString(PyExc_SystemError, 994 "invalid maximum character passed to PyUnicode_New"); 995 return NULL; 996 } 997 kind = PyUnicode_4BYTE_KIND; 998 char_size = 4; 999 if (sizeof(wchar_t) == 4) 1000 is_sharing = 1; 1001 } 1002 1003 /* Ensure we won't overflow the size. */ 1004 if (size < 0) { 1005 PyErr_SetString(PyExc_SystemError, 1006 "Negative size passed to PyUnicode_New"); 1007 return NULL; 1008 } 1009 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1010 return PyErr_NoMemory(); 1011 1012 /* Duplicated allocation code from _PyObject_New() instead of a call to 1013 * PyObject_New() so we are able to allocate space for the object and 1014 * it's data buffer. 1015 */ 1016 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1017 if (obj == NULL) 1018 return PyErr_NoMemory(); 1019 obj = PyObject_INIT(obj, &PyUnicode_Type); 1020 if (obj == NULL) 1021 return NULL; 1022 1023 unicode = (PyCompactUnicodeObject *)obj; 1024 if (is_ascii) 1025 data = ((PyASCIIObject*)obj) + 1; 1026 else 1027 data = unicode + 1; 1028 _PyUnicode_LENGTH(unicode) = size; 1029 _PyUnicode_HASH(unicode) = -1; 1030 _PyUnicode_STATE(unicode).interned = 0; 1031 _PyUnicode_STATE(unicode).kind = kind; 1032 _PyUnicode_STATE(unicode).compact = 1; 1033 _PyUnicode_STATE(unicode).ready = 1; 1034 _PyUnicode_STATE(unicode).ascii = is_ascii; 1035 if (is_ascii) { 1036 ((char*)data)[size] = 0; 1037 _PyUnicode_WSTR(unicode) = NULL; 1038 } 1039 else if (kind == PyUnicode_1BYTE_KIND) { 1040 ((char*)data)[size] = 0; 1041 _PyUnicode_WSTR(unicode) = NULL; 1042 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1043 unicode->utf8 = NULL; 1044 unicode->utf8_length = 0; 1045 } 1046 else { 1047 unicode->utf8 = NULL; 1048 unicode->utf8_length = 0; 1049 if (kind == PyUnicode_2BYTE_KIND) 1050 ((Py_UCS2*)data)[size] = 0; 1051 else /* kind == 
PyUnicode_4BYTE_KIND */ 1052 ((Py_UCS4*)data)[size] = 0; 1053 if (is_sharing) { 1054 _PyUnicode_WSTR_LENGTH(unicode) = size; 1055 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1056 } 1057 else { 1058 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1059 _PyUnicode_WSTR(unicode) = NULL; 1060 } 1061 } 1062#ifdef Py_DEBUG 1063 /* Fill the data with invalid characters to detect bugs earlier. 1064 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, 1065 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII 1066 and U+FFFFFFFF is an invalid character in Unicode 6.0. */ 1067 memset(data, 0xff, size * kind); 1068#endif 1069 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1070 return obj; 1071} 1072 1073#if SIZEOF_WCHAR_T == 2 1074/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1075 will decode surrogate pairs, the other conversions are implemented as macros 1076 for efficiency. 1077 1078 This function assumes that unicode can hold one more code point than wstr 1079 characters for a terminating null character. 
*/ 1080static void 1081unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1082 PyObject *unicode) 1083{ 1084 const wchar_t *iter; 1085 Py_UCS4 *ucs4_out; 1086 1087 assert(unicode != NULL); 1088 assert(_PyUnicode_CHECK(unicode)); 1089 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1090 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1091 1092 for (iter = begin; iter < end; ) { 1093 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1094 _PyUnicode_GET_LENGTH(unicode))); 1095 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1096 && (iter+1) < end 1097 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1098 { 1099 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1100 iter += 2; 1101 } 1102 else { 1103 *ucs4_out++ = *iter; 1104 iter++; 1105 } 1106 } 1107 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1108 _PyUnicode_GET_LENGTH(unicode))); 1109 1110} 1111#endif 1112 1113static int 1114unicode_check_modifiable(PyObject *unicode) 1115{ 1116 if (!unicode_modifiable(unicode)) { 1117 PyErr_SetString(PyExc_SystemError, 1118 "Cannot modify a string currently used"); 1119 return -1; 1120 } 1121 return 0; 1122} 1123 1124static int 1125_copy_characters(PyObject *to, Py_ssize_t to_start, 1126 PyObject *from, Py_ssize_t from_start, 1127 Py_ssize_t how_many, int check_maxchar) 1128{ 1129 unsigned int from_kind, to_kind; 1130 void *from_data, *to_data; 1131 int fast; 1132 1133 assert(0 <= how_many); 1134 assert(0 <= from_start); 1135 assert(0 <= to_start); 1136 assert(PyUnicode_Check(from)); 1137 assert(PyUnicode_IS_READY(from)); 1138 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1139 1140 if (how_many == 0) 1141 return 0; 1142 1143 assert(PyUnicode_Check(to)); 1144 assert(PyUnicode_IS_READY(to)); 1145 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1146 1147 from_kind = PyUnicode_KIND(from); 1148 from_data = PyUnicode_DATA(from); 1149 to_kind = PyUnicode_KIND(to); 1150 to_data = PyUnicode_DATA(to); 1151 1152#ifdef Py_DEBUG 1153 if 
(!check_maxchar 1154 && (from_kind > to_kind 1155 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))) 1156 { 1157 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1158 Py_UCS4 ch; 1159 Py_ssize_t i; 1160 for (i=0; i < how_many; i++) { 1161 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1162 assert(ch <= to_maxchar); 1163 } 1164 } 1165#endif 1166 fast = (from_kind == to_kind); 1167 if (check_maxchar 1168 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1169 { 1170 /* deny latin1 => ascii */ 1171 fast = 0; 1172 } 1173 1174 if (fast) { 1175 Py_MEMCPY((char*)to_data + to_kind * to_start, 1176 (char*)from_data + from_kind * from_start, 1177 to_kind * how_many); 1178 } 1179 else if (from_kind == PyUnicode_1BYTE_KIND 1180 && to_kind == PyUnicode_2BYTE_KIND) 1181 { 1182 _PyUnicode_CONVERT_BYTES( 1183 Py_UCS1, Py_UCS2, 1184 PyUnicode_1BYTE_DATA(from) + from_start, 1185 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1186 PyUnicode_2BYTE_DATA(to) + to_start 1187 ); 1188 } 1189 else if (from_kind == PyUnicode_1BYTE_KIND 1190 && to_kind == PyUnicode_4BYTE_KIND) 1191 { 1192 _PyUnicode_CONVERT_BYTES( 1193 Py_UCS1, Py_UCS4, 1194 PyUnicode_1BYTE_DATA(from) + from_start, 1195 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1196 PyUnicode_4BYTE_DATA(to) + to_start 1197 ); 1198 } 1199 else if (from_kind == PyUnicode_2BYTE_KIND 1200 && to_kind == PyUnicode_4BYTE_KIND) 1201 { 1202 _PyUnicode_CONVERT_BYTES( 1203 Py_UCS2, Py_UCS4, 1204 PyUnicode_2BYTE_DATA(from) + from_start, 1205 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1206 PyUnicode_4BYTE_DATA(to) + to_start 1207 ); 1208 } 1209 else { 1210 /* check if max_char(from substring) <= max_char(to) */ 1211 if (from_kind > to_kind 1212 /* latin1 => ascii */ 1213 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1214 { 1215 /* slow path to check for character overflow */ 1216 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1217 Py_UCS4 ch; 1218 Py_ssize_t i; 1219 
1220#ifdef Py_DEBUG 1221 for (i=0; i < how_many; i++) { 1222 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1223 assert(ch <= to_maxchar); 1224 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1225 } 1226#else 1227 if (!check_maxchar) { 1228 for (i=0; i < how_many; i++) { 1229 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1230 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1231 } 1232 } 1233 else { 1234 for (i=0; i < how_many; i++) { 1235 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1236 if (ch > to_maxchar) 1237 return 1; 1238 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1239 } 1240 } 1241#endif 1242 } 1243 else { 1244 assert(0 && "inconsistent state"); 1245 return 1; 1246 } 1247 } 1248 return 0; 1249} 1250 1251void 1252_PyUnicode_FastCopyCharacters( 1253 PyObject *to, Py_ssize_t to_start, 1254 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1255{ 1256 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1257} 1258 1259Py_ssize_t 1260PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1261 PyObject *from, Py_ssize_t from_start, 1262 Py_ssize_t how_many) 1263{ 1264 int err; 1265 1266 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1267 PyErr_BadInternalCall(); 1268 return -1; 1269 } 1270 1271 if (PyUnicode_READY(from) == -1) 1272 return -1; 1273 if (PyUnicode_READY(to) == -1) 1274 return -1; 1275 1276 if (from_start < 0) { 1277 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1278 return -1; 1279 } 1280 if (to_start < 0) { 1281 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1282 return -1; 1283 } 1284 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1285 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1286 PyErr_Format(PyExc_SystemError, 1287 "Cannot write %zi characters at %zi " 1288 "in a string of %zi characters", 1289 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1290 return -1; 1291 } 1292 1293 if (how_many == 0) 
1294 return 0; 1295 1296 if (unicode_check_modifiable(to)) 1297 return -1; 1298 1299 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1300 if (err) { 1301 PyErr_Format(PyExc_SystemError, 1302 "Cannot copy %s characters " 1303 "into a string of %s characters", 1304 unicode_kind_name(from), 1305 unicode_kind_name(to)); 1306 return -1; 1307 } 1308 return how_many; 1309} 1310 1311/* Find the maximum code point and count the number of surrogate pairs so a 1312 correct string length can be computed before converting a string to UCS4. 1313 This function counts single surrogates as a character and not as a pair. 1314 1315 Return 0 on success, or -1 on error. */ 1316static int 1317find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1318 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1319{ 1320 const wchar_t *iter; 1321 Py_UCS4 ch; 1322 1323 assert(num_surrogates != NULL && maxchar != NULL); 1324 *num_surrogates = 0; 1325 *maxchar = 0; 1326 1327 for (iter = begin; iter < end; ) { 1328#if SIZEOF_WCHAR_T == 2 1329 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1330 && (iter+1) < end 1331 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1332 { 1333 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1334 ++(*num_surrogates); 1335 iter += 2; 1336 } 1337 else 1338#endif 1339 { 1340 ch = *iter; 1341 iter++; 1342 } 1343 if (ch > *maxchar) { 1344 *maxchar = ch; 1345 if (*maxchar > MAX_UNICODE) { 1346 PyErr_Format(PyExc_ValueError, 1347 "character U+%x is not in range [U+0000; U+10ffff]", 1348 ch); 1349 return -1; 1350 } 1351 } 1352 } 1353 return 0; 1354} 1355 1356#ifdef Py_DEBUG 1357static int unicode_ready_calls = 0; 1358#endif 1359 1360int 1361_PyUnicode_Ready(PyObject *unicode) 1362{ 1363 wchar_t *end; 1364 Py_UCS4 maxchar = 0; 1365 Py_ssize_t num_surrogates; 1366#if SIZEOF_WCHAR_T == 2 1367 Py_ssize_t length_wo_surrogates; 1368#endif 1369 1370 /* _PyUnicode_Ready() is only intended for old-style API usage where 1371 strings were created using _PyObject_New() 
and where no canonical 1372 representation (the str field) has been set yet aka strings 1373 which are not yet ready. */ 1374 assert(_PyUnicode_CHECK(unicode)); 1375 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1376 assert(_PyUnicode_WSTR(unicode) != NULL); 1377 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1378 assert(_PyUnicode_UTF8(unicode) == NULL); 1379 /* Actually, it should neither be interned nor be anything else: */ 1380 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1381 1382#ifdef Py_DEBUG 1383 ++unicode_ready_calls; 1384#endif 1385 1386 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1387 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1388 &maxchar, &num_surrogates) == -1) 1389 return -1; 1390 1391 if (maxchar < 256) { 1392 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1393 if (!_PyUnicode_DATA_ANY(unicode)) { 1394 PyErr_NoMemory(); 1395 return -1; 1396 } 1397 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1398 _PyUnicode_WSTR(unicode), end, 1399 PyUnicode_1BYTE_DATA(unicode)); 1400 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1401 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1402 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1403 if (maxchar < 128) { 1404 _PyUnicode_STATE(unicode).ascii = 1; 1405 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1406 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1407 } 1408 else { 1409 _PyUnicode_STATE(unicode).ascii = 0; 1410 _PyUnicode_UTF8(unicode) = NULL; 1411 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1412 } 1413 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1414 _PyUnicode_WSTR(unicode) = NULL; 1415 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1416 } 1417 /* In this case we might have to convert down from 4-byte native 1418 wchar_t to 2-byte unicode. 
*/ 1419 else if (maxchar < 65536) { 1420 assert(num_surrogates == 0 && 1421 "FindMaxCharAndNumSurrogatePairs() messed up"); 1422 1423#if SIZEOF_WCHAR_T == 2 1424 /* We can share representations and are done. */ 1425 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1426 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1427 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1428 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1429 _PyUnicode_UTF8(unicode) = NULL; 1430 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1431#else 1432 /* sizeof(wchar_t) == 4 */ 1433 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1434 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1435 if (!_PyUnicode_DATA_ANY(unicode)) { 1436 PyErr_NoMemory(); 1437 return -1; 1438 } 1439 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1440 _PyUnicode_WSTR(unicode), end, 1441 PyUnicode_2BYTE_DATA(unicode)); 1442 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1443 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1444 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1445 _PyUnicode_UTF8(unicode) = NULL; 1446 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1447 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1448 _PyUnicode_WSTR(unicode) = NULL; 1449 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1450#endif 1451 } 1452 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1453 else { 1454#if SIZEOF_WCHAR_T == 2 1455 /* in case the native representation is 2-bytes, we need to allocate a 1456 new normalized 4-byte version. 
*/ 1457 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1458 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1459 if (!_PyUnicode_DATA_ANY(unicode)) { 1460 PyErr_NoMemory(); 1461 return -1; 1462 } 1463 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1464 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1465 _PyUnicode_UTF8(unicode) = NULL; 1466 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1467 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1468 _PyUnicode_STATE(unicode).ready = 1; 1469 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1470 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1471 _PyUnicode_WSTR(unicode) = NULL; 1472 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1473#else 1474 assert(num_surrogates == 0); 1475 1476 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1477 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1478 _PyUnicode_UTF8(unicode) = NULL; 1479 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1480 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1481#endif 1482 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1483 } 1484 _PyUnicode_STATE(unicode).ready = 1; 1485 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1486 return 0; 1487} 1488 1489static void 1490unicode_dealloc(register PyObject *unicode) 1491{ 1492 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1493 case SSTATE_NOT_INTERNED: 1494 break; 1495 1496 case SSTATE_INTERNED_MORTAL: 1497 /* revive dead object temporarily for DelItem */ 1498 Py_REFCNT(unicode) = 3; 1499 if (PyDict_DelItem(interned, unicode) != 0) 1500 Py_FatalError( 1501 "deletion of interned string failed"); 1502 break; 1503 1504 case SSTATE_INTERNED_IMMORTAL: 1505 Py_FatalError("Immortal interned string died."); 1506 1507 default: 1508 Py_FatalError("Inconsistent interned string state."); 1509 } 1510 1511 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1512 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1513 if 
(_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1514 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1515 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1516 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1517 1518 Py_TYPE(unicode)->tp_free(unicode); 1519} 1520 1521#ifdef Py_DEBUG 1522static int 1523unicode_is_singleton(PyObject *unicode) 1524{ 1525 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1526 if (unicode == unicode_empty) 1527 return 1; 1528 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1529 { 1530 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1531 if (ch < 256 && unicode_latin1[ch] == unicode) 1532 return 1; 1533 } 1534 return 0; 1535} 1536#endif 1537 1538static int 1539unicode_modifiable(PyObject *unicode) 1540{ 1541 assert(_PyUnicode_CHECK(unicode)); 1542 if (Py_REFCNT(unicode) != 1) 1543 return 0; 1544 if (_PyUnicode_HASH(unicode) != -1) 1545 return 0; 1546 if (PyUnicode_CHECK_INTERNED(unicode)) 1547 return 0; 1548 if (!PyUnicode_CheckExact(unicode)) 1549 return 0; 1550#ifdef Py_DEBUG 1551 /* singleton refcount is greater than 1 */ 1552 assert(!unicode_is_singleton(unicode)); 1553#endif 1554 return 1; 1555} 1556 1557static int 1558unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1559{ 1560 PyObject *unicode; 1561 Py_ssize_t old_length; 1562 1563 assert(p_unicode != NULL); 1564 unicode = *p_unicode; 1565 1566 assert(unicode != NULL); 1567 assert(PyUnicode_Check(unicode)); 1568 assert(0 <= length); 1569 1570 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1571 old_length = PyUnicode_WSTR_LENGTH(unicode); 1572 else 1573 old_length = PyUnicode_GET_LENGTH(unicode); 1574 if (old_length == length) 1575 return 0; 1576 1577 if (length == 0) { 1578 Py_DECREF(*p_unicode); 1579 *p_unicode = unicode_empty; 1580 Py_INCREF(*p_unicode); 1581 return 0; 1582 } 1583 1584 if (!unicode_modifiable(unicode)) { 1585 PyObject *copy = resize_copy(unicode, length); 1586 if (copy == NULL) 1587 return -1; 1588 Py_DECREF(*p_unicode); 1589 
*p_unicode = copy; 1590 return 0; 1591 } 1592 1593 if (PyUnicode_IS_COMPACT(unicode)) { 1594 PyObject *new_unicode = resize_compact(unicode, length); 1595 if (new_unicode == NULL) 1596 return -1; 1597 *p_unicode = new_unicode; 1598 return 0; 1599 } 1600 return resize_inplace(unicode, length); 1601} 1602 1603int 1604PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1605{ 1606 PyObject *unicode; 1607 if (p_unicode == NULL) { 1608 PyErr_BadInternalCall(); 1609 return -1; 1610 } 1611 unicode = *p_unicode; 1612 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1613 { 1614 PyErr_BadInternalCall(); 1615 return -1; 1616 } 1617 return unicode_resize(p_unicode, length); 1618} 1619 1620static int 1621unicode_widen(PyObject **p_unicode, Py_ssize_t length, 1622 unsigned int maxchar) 1623{ 1624 PyObject *result; 1625 assert(PyUnicode_IS_READY(*p_unicode)); 1626 assert(length <= PyUnicode_GET_LENGTH(*p_unicode)); 1627 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode)) 1628 return 0; 1629 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode), 1630 maxchar); 1631 if (result == NULL) 1632 return -1; 1633 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length); 1634 Py_DECREF(*p_unicode); 1635 *p_unicode = result; 1636 return 0; 1637} 1638 1639static int 1640unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos, 1641 Py_UCS4 ch) 1642{ 1643 assert(ch <= MAX_UNICODE); 1644 if (unicode_widen(p_unicode, *pos, ch) < 0) 1645 return -1; 1646 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode), 1647 PyUnicode_DATA(*p_unicode), 1648 (*pos)++, ch); 1649 return 0; 1650} 1651 1652/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1653 Return the length of the input string. 1654 1655 WARNING: The function doesn't copy the terminating null character and 1656 doesn't check the maximum character (may write a latin1 character in an 1657 ASCII string). 
*/ 1658static Py_ssize_t 1659unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str) 1660{ 1661 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1662 void *data = PyUnicode_DATA(unicode); 1663 1664 switch (kind) { 1665 case PyUnicode_1BYTE_KIND: { 1666 Py_ssize_t len = strlen(str); 1667 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1668 memcpy((char *) data + index, str, len); 1669 return len; 1670 } 1671 case PyUnicode_2BYTE_KIND: { 1672 Py_UCS2 *start = (Py_UCS2 *)data + index; 1673 Py_UCS2 *ucs2 = start; 1674 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1675 1676 for (; *str; ++ucs2, ++str) 1677 *ucs2 = (Py_UCS2)*str; 1678 1679 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1680 return ucs2 - start; 1681 } 1682 default: { 1683 Py_UCS4 *start = (Py_UCS4 *)data + index; 1684 Py_UCS4 *ucs4 = start; 1685 assert(kind == PyUnicode_4BYTE_KIND); 1686 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1687 1688 for (; *str; ++ucs4, ++str) 1689 *ucs4 = (Py_UCS4)*str; 1690 1691 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1692 return ucs4 - start; 1693 } 1694 } 1695} 1696 1697 1698static PyObject* 1699get_latin1_char(unsigned char ch) 1700{ 1701 PyObject *unicode = unicode_latin1[ch]; 1702 if (!unicode) { 1703 unicode = PyUnicode_New(1, ch); 1704 if (!unicode) 1705 return NULL; 1706 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1707 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1708 unicode_latin1[ch] = unicode; 1709 } 1710 Py_INCREF(unicode); 1711 return unicode; 1712} 1713 1714PyObject * 1715PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1716{ 1717 PyObject *unicode; 1718 Py_UCS4 maxchar = 0; 1719 Py_ssize_t num_surrogates; 1720 1721 if (u == NULL) 1722 return (PyObject*)_PyUnicode_New(size); 1723 1724 /* If the Unicode data is known at construction time, we can apply 1725 some optimizations which share commonly used objects. 
*/ 1726 1727 /* Optimization for empty strings */ 1728 if (size == 0 && unicode_empty != NULL) { 1729 Py_INCREF(unicode_empty); 1730 return unicode_empty; 1731 } 1732 1733 /* Single character Unicode objects in the Latin-1 range are 1734 shared when using this constructor */ 1735 if (size == 1 && *u < 256) 1736 return get_latin1_char((unsigned char)*u); 1737 1738 /* If not empty and not single character, copy the Unicode data 1739 into the new object */ 1740 if (find_maxchar_surrogates(u, u + size, 1741 &maxchar, &num_surrogates) == -1) 1742 return NULL; 1743 1744 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1745 if (!unicode) 1746 return NULL; 1747 1748 switch (PyUnicode_KIND(unicode)) { 1749 case PyUnicode_1BYTE_KIND: 1750 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1751 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1752 break; 1753 case PyUnicode_2BYTE_KIND: 1754#if Py_UNICODE_SIZE == 2 1755 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1756#else 1757 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1758 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1759#endif 1760 break; 1761 case PyUnicode_4BYTE_KIND: 1762#if SIZEOF_WCHAR_T == 2 1763 /* This is the only case which has to process surrogates, thus 1764 a simple copy loop is not enough and we need a function. 
*/ 1765 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1766#else 1767 assert(num_surrogates == 0); 1768 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1769#endif 1770 break; 1771 default: 1772 assert(0 && "Impossible state"); 1773 } 1774 1775 return unicode_result(unicode); 1776} 1777 1778PyObject * 1779PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1780{ 1781 if (size < 0) { 1782 PyErr_SetString(PyExc_SystemError, 1783 "Negative size passed to PyUnicode_FromStringAndSize"); 1784 return NULL; 1785 } 1786 if (u != NULL) 1787 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1788 else 1789 return (PyObject *)_PyUnicode_New(size); 1790} 1791 1792PyObject * 1793PyUnicode_FromString(const char *u) 1794{ 1795 size_t size = strlen(u); 1796 if (size > PY_SSIZE_T_MAX) { 1797 PyErr_SetString(PyExc_OverflowError, "input too long"); 1798 return NULL; 1799 } 1800 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1801} 1802 1803PyObject * 1804_PyUnicode_FromId(_Py_Identifier *id) 1805{ 1806 if (!id->object) { 1807 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1808 strlen(id->string), 1809 NULL, NULL); 1810 if (!id->object) 1811 return NULL; 1812 PyUnicode_InternInPlace(&id->object); 1813 assert(!id->next); 1814 id->next = static_strings; 1815 static_strings = id; 1816 } 1817 return id->object; 1818} 1819 1820void 1821_PyUnicode_ClearStaticStrings() 1822{ 1823 _Py_Identifier *i; 1824 for (i = static_strings; i; i = i->next) { 1825 Py_DECREF(i->object); 1826 i->object = NULL; 1827 i->next = NULL; 1828 } 1829} 1830 1831/* Internal function, doesn't check maximum character */ 1832 1833PyObject* 1834_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1835{ 1836 const unsigned char *s = (const unsigned char *)buffer; 1837 PyObject *unicode; 1838 if (size == 1) { 1839#ifdef Py_DEBUG 1840 assert(s[0] < 128); 1841#endif 1842 return get_latin1_char(s[0]); 1843 } 1844 unicode = PyUnicode_New(size, 127); 1845 if (!unicode) 
1846 return NULL; 1847 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1848 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1849 return unicode; 1850} 1851 1852static Py_UCS4 1853kind_maxchar_limit(unsigned int kind) 1854{ 1855 switch (kind) { 1856 case PyUnicode_1BYTE_KIND: 1857 return 0x80; 1858 case PyUnicode_2BYTE_KIND: 1859 return 0x100; 1860 case PyUnicode_4BYTE_KIND: 1861 return 0x10000; 1862 default: 1863 assert(0 && "invalid kind"); 1864 return MAX_UNICODE; 1865 } 1866} 1867 1868Py_LOCAL_INLINE(Py_UCS4) 1869align_maxchar(Py_UCS4 maxchar) 1870{ 1871 if (maxchar <= 127) 1872 return 127; 1873 else if (maxchar <= 255) 1874 return 255; 1875 else if (maxchar <= 65535) 1876 return 65535; 1877 else 1878 return MAX_UNICODE; 1879} 1880 1881static PyObject* 1882_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1883{ 1884 PyObject *res; 1885 unsigned char max_char; 1886 1887 if (size == 0) { 1888 Py_INCREF(unicode_empty); 1889 return unicode_empty; 1890 } 1891 assert(size > 0); 1892 if (size == 1) 1893 return get_latin1_char(u[0]); 1894 1895 max_char = ucs1lib_find_max_char(u, u + size); 1896 res = PyUnicode_New(size, max_char); 1897 if (!res) 1898 return NULL; 1899 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1900 assert(_PyUnicode_CheckConsistency(res, 1)); 1901 return res; 1902} 1903 1904static PyObject* 1905_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1906{ 1907 PyObject *res; 1908 Py_UCS2 max_char; 1909 1910 if (size == 0) { 1911 Py_INCREF(unicode_empty); 1912 return unicode_empty; 1913 } 1914 assert(size > 0); 1915 if (size == 1) { 1916 Py_UCS4 ch = u[0]; 1917 if (ch < 256) 1918 return get_latin1_char((unsigned char)ch); 1919 1920 res = PyUnicode_New(1, ch); 1921 if (res == NULL) 1922 return NULL; 1923 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1924 assert(_PyUnicode_CheckConsistency(res, 1)); 1925 return res; 1926 } 1927 1928 max_char = ucs2lib_find_max_char(u, u + size); 1929 res = PyUnicode_New(size, max_char); 1930 
if (!res) 1931 return NULL; 1932 if (max_char >= 256) 1933 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1934 else { 1935 _PyUnicode_CONVERT_BYTES( 1936 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1937 } 1938 assert(_PyUnicode_CheckConsistency(res, 1)); 1939 return res; 1940} 1941 1942static PyObject* 1943_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1944{ 1945 PyObject *res; 1946 Py_UCS4 max_char; 1947 1948 if (size == 0) { 1949 Py_INCREF(unicode_empty); 1950 return unicode_empty; 1951 } 1952 assert(size > 0); 1953 if (size == 1) { 1954 Py_UCS4 ch = u[0]; 1955 if (ch < 256) 1956 return get_latin1_char((unsigned char)ch); 1957 1958 res = PyUnicode_New(1, ch); 1959 if (res == NULL) 1960 return NULL; 1961 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1962 assert(_PyUnicode_CheckConsistency(res, 1)); 1963 return res; 1964 } 1965 1966 max_char = ucs4lib_find_max_char(u, u + size); 1967 res = PyUnicode_New(size, max_char); 1968 if (!res) 1969 return NULL; 1970 if (max_char < 256) 1971 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 1972 PyUnicode_1BYTE_DATA(res)); 1973 else if (max_char < 0x10000) 1974 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 1975 PyUnicode_2BYTE_DATA(res)); 1976 else 1977 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1978 assert(_PyUnicode_CheckConsistency(res, 1)); 1979 return res; 1980} 1981 1982PyObject* 1983PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1984{ 1985 if (size < 0) { 1986 PyErr_SetString(PyExc_ValueError, "size must be positive"); 1987 return NULL; 1988 } 1989 switch (kind) { 1990 case PyUnicode_1BYTE_KIND: 1991 return _PyUnicode_FromUCS1(buffer, size); 1992 case PyUnicode_2BYTE_KIND: 1993 return _PyUnicode_FromUCS2(buffer, size); 1994 case PyUnicode_4BYTE_KIND: 1995 return _PyUnicode_FromUCS4(buffer, size); 1996 default: 1997 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1998 return NULL; 1999 } 2000} 2001 
2002Py_UCS4 2003_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2004{ 2005 enum PyUnicode_Kind kind; 2006 void *startptr, *endptr; 2007 2008 assert(PyUnicode_IS_READY(unicode)); 2009 assert(0 <= start); 2010 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2011 assert(start <= end); 2012 2013 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2014 return PyUnicode_MAX_CHAR_VALUE(unicode); 2015 2016 if (start == end) 2017 return 127; 2018 2019 if (PyUnicode_IS_ASCII(unicode)) 2020 return 127; 2021 2022 kind = PyUnicode_KIND(unicode); 2023 startptr = PyUnicode_DATA(unicode); 2024 endptr = (char *)startptr + end * kind; 2025 startptr = (char *)startptr + start * kind; 2026 switch(kind) { 2027 case PyUnicode_1BYTE_KIND: 2028 return ucs1lib_find_max_char(startptr, endptr); 2029 case PyUnicode_2BYTE_KIND: 2030 return ucs2lib_find_max_char(startptr, endptr); 2031 case PyUnicode_4BYTE_KIND: 2032 return ucs4lib_find_max_char(startptr, endptr); 2033 default: 2034 assert(0); 2035 return 0; 2036 } 2037} 2038 2039/* Ensure that a string uses the most efficient storage, if it is not the 2040 case: create a new string with of the right kind. Write NULL into *p_unicode 2041 on error. 
*/ 2042static void 2043unicode_adjust_maxchar(PyObject **p_unicode) 2044{ 2045 PyObject *unicode, *copy; 2046 Py_UCS4 max_char; 2047 Py_ssize_t len; 2048 unsigned int kind; 2049 2050 assert(p_unicode != NULL); 2051 unicode = *p_unicode; 2052 assert(PyUnicode_IS_READY(unicode)); 2053 if (PyUnicode_IS_ASCII(unicode)) 2054 return; 2055 2056 len = PyUnicode_GET_LENGTH(unicode); 2057 kind = PyUnicode_KIND(unicode); 2058 if (kind == PyUnicode_1BYTE_KIND) { 2059 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2060 max_char = ucs1lib_find_max_char(u, u + len); 2061 if (max_char >= 128) 2062 return; 2063 } 2064 else if (kind == PyUnicode_2BYTE_KIND) { 2065 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2066 max_char = ucs2lib_find_max_char(u, u + len); 2067 if (max_char >= 256) 2068 return; 2069 } 2070 else { 2071 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2072 assert(kind == PyUnicode_4BYTE_KIND); 2073 max_char = ucs4lib_find_max_char(u, u + len); 2074 if (max_char >= 0x10000) 2075 return; 2076 } 2077 copy = PyUnicode_New(len, max_char); 2078 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2079 Py_DECREF(unicode); 2080 *p_unicode = copy; 2081} 2082 2083PyObject* 2084_PyUnicode_Copy(PyObject *unicode) 2085{ 2086 Py_ssize_t length; 2087 PyObject *copy; 2088 2089 if (!PyUnicode_Check(unicode)) { 2090 PyErr_BadInternalCall(); 2091 return NULL; 2092 } 2093 if (PyUnicode_READY(unicode) == -1) 2094 return NULL; 2095 2096 length = PyUnicode_GET_LENGTH(unicode); 2097 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2098 if (!copy) 2099 return NULL; 2100 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2101 2102 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2103 length * PyUnicode_KIND(unicode)); 2104 assert(_PyUnicode_CheckConsistency(copy, 1)); 2105 return copy; 2106} 2107 2108 2109/* Widen Unicode objects to larger buffers. Don't write terminating null 2110 character. Return NULL on error. 
*/ 2111 2112void* 2113_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2114{ 2115 Py_ssize_t len; 2116 void *result; 2117 unsigned int skind; 2118 2119 if (PyUnicode_READY(s) == -1) 2120 return NULL; 2121 2122 len = PyUnicode_GET_LENGTH(s); 2123 skind = PyUnicode_KIND(s); 2124 if (skind >= kind) { 2125 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2126 return NULL; 2127 } 2128 switch (kind) { 2129 case PyUnicode_2BYTE_KIND: 2130 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 2131 if (!result) 2132 return PyErr_NoMemory(); 2133 assert(skind == PyUnicode_1BYTE_KIND); 2134 _PyUnicode_CONVERT_BYTES( 2135 Py_UCS1, Py_UCS2, 2136 PyUnicode_1BYTE_DATA(s), 2137 PyUnicode_1BYTE_DATA(s) + len, 2138 result); 2139 return result; 2140 case PyUnicode_4BYTE_KIND: 2141 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 2142 if (!result) 2143 return PyErr_NoMemory(); 2144 if (skind == PyUnicode_2BYTE_KIND) { 2145 _PyUnicode_CONVERT_BYTES( 2146 Py_UCS2, Py_UCS4, 2147 PyUnicode_2BYTE_DATA(s), 2148 PyUnicode_2BYTE_DATA(s) + len, 2149 result); 2150 } 2151 else { 2152 assert(skind == PyUnicode_1BYTE_KIND); 2153 _PyUnicode_CONVERT_BYTES( 2154 Py_UCS1, Py_UCS4, 2155 PyUnicode_1BYTE_DATA(s), 2156 PyUnicode_1BYTE_DATA(s) + len, 2157 result); 2158 } 2159 return result; 2160 default: 2161 break; 2162 } 2163 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2164 return NULL; 2165} 2166 2167static Py_UCS4* 2168as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2169 int copy_null) 2170{ 2171 int kind; 2172 void *data; 2173 Py_ssize_t len, targetlen; 2174 if (PyUnicode_READY(string) == -1) 2175 return NULL; 2176 kind = PyUnicode_KIND(string); 2177 data = PyUnicode_DATA(string); 2178 len = PyUnicode_GET_LENGTH(string); 2179 targetlen = len; 2180 if (copy_null) 2181 targetlen++; 2182 if (!target) { 2183 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 2184 PyErr_NoMemory(); 2185 return NULL; 2186 } 2187 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 2188 if 
(!target) { 2189 PyErr_NoMemory(); 2190 return NULL; 2191 } 2192 } 2193 else { 2194 if (targetsize < targetlen) { 2195 PyErr_Format(PyExc_SystemError, 2196 "string is longer than the buffer"); 2197 if (copy_null && 0 < targetsize) 2198 target[0] = 0; 2199 return NULL; 2200 } 2201 } 2202 if (kind == PyUnicode_1BYTE_KIND) { 2203 Py_UCS1 *start = (Py_UCS1 *) data; 2204 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2205 } 2206 else if (kind == PyUnicode_2BYTE_KIND) { 2207 Py_UCS2 *start = (Py_UCS2 *) data; 2208 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2209 } 2210 else { 2211 assert(kind == PyUnicode_4BYTE_KIND); 2212 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2213 } 2214 if (copy_null) 2215 target[len] = 0; 2216 return target; 2217} 2218 2219Py_UCS4* 2220PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2221 int copy_null) 2222{ 2223 if (target == NULL || targetsize < 0) { 2224 PyErr_BadInternalCall(); 2225 return NULL; 2226 } 2227 return as_ucs4(string, target, targetsize, copy_null); 2228} 2229 2230Py_UCS4* 2231PyUnicode_AsUCS4Copy(PyObject *string) 2232{ 2233 return as_ucs4(string, NULL, 0, 1); 2234} 2235 2236#ifdef HAVE_WCHAR_H 2237 2238PyObject * 2239PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2240{ 2241 if (w == NULL) { 2242 if (size == 0) { 2243 Py_INCREF(unicode_empty); 2244 return unicode_empty; 2245 } 2246 PyErr_BadInternalCall(); 2247 return NULL; 2248 } 2249 2250 if (size == -1) { 2251 size = wcslen(w); 2252 } 2253 2254 return PyUnicode_FromUnicode(w, size); 2255} 2256 2257#endif /* HAVE_WCHAR_H */ 2258 2259static void 2260makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2261 int zeropad, int width, int precision, char c) 2262{ 2263 *fmt++ = '%'; 2264 if (width) { 2265 if (zeropad) 2266 *fmt++ = '0'; 2267 fmt += sprintf(fmt, "%d", width); 2268 } 2269 if (precision) 2270 fmt += sprintf(fmt, ".%d", precision); 2271 if (longflag) 2272 
*fmt++ = 'l'; 2273 else if (longlongflag) { 2274 /* longlongflag should only ever be nonzero on machines with 2275 HAVE_LONG_LONG defined */ 2276#ifdef HAVE_LONG_LONG 2277 char *f = PY_FORMAT_LONG_LONG; 2278 while (*f) 2279 *fmt++ = *f++; 2280#else 2281 /* we shouldn't ever get here */ 2282 assert(0); 2283 *fmt++ = 'l'; 2284#endif 2285 } 2286 else if (size_tflag) { 2287 char *f = PY_FORMAT_SIZE_T; 2288 while (*f) 2289 *fmt++ = *f++; 2290 } 2291 *fmt++ = c; 2292 *fmt = '\0'; 2293} 2294 2295/* helper for PyUnicode_FromFormatV() */ 2296 2297static const char* 2298parse_format_flags(const char *f, 2299 int *p_width, int *p_precision, 2300 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 2301{ 2302 int width, precision, longflag, longlongflag, size_tflag; 2303 2304 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2305 f++; 2306 width = 0; 2307 while (Py_ISDIGIT((unsigned)*f)) 2308 width = (width*10) + *f++ - '0'; 2309 precision = 0; 2310 if (*f == '.') { 2311 f++; 2312 while (Py_ISDIGIT((unsigned)*f)) 2313 precision = (precision*10) + *f++ - '0'; 2314 if (*f == '%') { 2315 /* "%.3%s" => f points to "3" */ 2316 f--; 2317 } 2318 } 2319 if (*f == '\0') { 2320 /* bogus format "%.1" => go backward, f points to "1" */ 2321 f--; 2322 } 2323 if (p_width != NULL) 2324 *p_width = width; 2325 if (p_precision != NULL) 2326 *p_precision = precision; 2327 2328 /* Handle %ld, %lu, %lld and %llu. */ 2329 longflag = 0; 2330 longlongflag = 0; 2331 size_tflag = 0; 2332 2333 if (*f == 'l') { 2334 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2335 longflag = 1; 2336 ++f; 2337 } 2338#ifdef HAVE_LONG_LONG 2339 else if (f[1] == 'l' && 2340 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2341 longlongflag = 1; 2342 f += 2; 2343 } 2344#endif 2345 } 2346 /* handle the size_t flag. 
*/ 2347 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2348 size_tflag = 1; 2349 ++f; 2350 } 2351 if (p_longflag != NULL) 2352 *p_longflag = longflag; 2353 if (p_longlongflag != NULL) 2354 *p_longlongflag = longlongflag; 2355 if (p_size_tflag != NULL) 2356 *p_size_tflag = size_tflag; 2357 return f; 2358} 2359 2360/* maximum number of characters required for output of %ld. 21 characters 2361 allows for 64-bit integers (in decimal) and an optional sign. */ 2362#define MAX_LONG_CHARS 21 2363/* maximum number of characters required for output of %lld. 2364 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2365 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2366#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2367 2368PyObject * 2369PyUnicode_FromFormatV(const char *format, va_list vargs) 2370{ 2371 va_list count; 2372 Py_ssize_t callcount = 0; 2373 PyObject **callresults = NULL; 2374 PyObject **callresult = NULL; 2375 Py_ssize_t n = 0; 2376 int width = 0; 2377 int precision = 0; 2378 int zeropad; 2379 const char* f; 2380 PyObject *string; 2381 /* used by sprintf */ 2382 char fmt[61]; /* should be enough for %0width.precisionlld */ 2383 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 2384 Py_UCS4 argmaxchar; 2385 Py_ssize_t numbersize = 0; 2386 char *numberresults = NULL; 2387 char *numberresult = NULL; 2388 Py_ssize_t i; 2389 int kind; 2390 void *data; 2391 2392 Py_VA_COPY(count, vargs); 2393 /* step 1: count the number of %S/%R/%A/%s format specifications 2394 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 2395 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 2396 * result in an array) 2397 * also estimate a upper bound for all the number formats in the string, 2398 * numbers will be formatted in step 3 and be kept in a '\0'-separated 2399 * buffer before putting everything together. 
*/ 2400 for (f = format; *f; f++) { 2401 if (*f == '%') { 2402 int longlongflag; 2403 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 2404 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 2405 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 2406 ++callcount; 2407 2408 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 2409#ifdef HAVE_LONG_LONG 2410 if (longlongflag) { 2411 if (width < MAX_LONG_LONG_CHARS) 2412 width = MAX_LONG_LONG_CHARS; 2413 } 2414 else 2415#endif 2416 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 2417 including sign. Decimal takes the most space. This 2418 isn't enough for octal. If a width is specified we 2419 need more (which we allocate later). */ 2420 if (width < MAX_LONG_CHARS) 2421 width = MAX_LONG_CHARS; 2422 2423 /* account for the size + '\0' to separate numbers 2424 inside of the numberresults buffer */ 2425 numbersize += (width + 1); 2426 } 2427 } 2428 else if ((unsigned char)*f > 127) { 2429 PyErr_Format(PyExc_ValueError, 2430 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2431 "string, got a non-ASCII byte: 0x%02x", 2432 (unsigned char)*f); 2433 return NULL; 2434 } 2435 } 2436 /* step 2: allocate memory for the results of 2437 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 2438 if (callcount) { 2439 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 2440 if (!callresults) { 2441 PyErr_NoMemory(); 2442 return NULL; 2443 } 2444 callresult = callresults; 2445 } 2446 /* step 2.5: allocate memory for the results of formating numbers */ 2447 if (numbersize) { 2448 numberresults = PyObject_Malloc(numbersize); 2449 if (!numberresults) { 2450 PyErr_NoMemory(); 2451 goto fail; 2452 } 2453 numberresult = numberresults; 2454 } 2455 2456 /* step 3: format numbers and figure out how large a buffer we need */ 2457 for (f = format; *f; f++) { 2458 if (*f == '%') { 2459 const char* p; 2460 int longflag; 2461 int longlongflag; 2462 int 
size_tflag; 2463 int numprinted; 2464 2465 p = f; 2466 zeropad = (f[1] == '0'); 2467 f = parse_format_flags(f, &width, &precision, 2468 &longflag, &longlongflag, &size_tflag); 2469 switch (*f) { 2470 case 'c': 2471 { 2472 Py_UCS4 ordinal = va_arg(count, int); 2473 maxchar = MAX_MAXCHAR(maxchar, ordinal); 2474 n++; 2475 break; 2476 } 2477 case '%': 2478 n++; 2479 break; 2480 case 'i': 2481 case 'd': 2482 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2483 width, precision, *f); 2484 if (longflag) 2485 numprinted = sprintf(numberresult, fmt, 2486 va_arg(count, long)); 2487#ifdef HAVE_LONG_LONG 2488 else if (longlongflag) 2489 numprinted = sprintf(numberresult, fmt, 2490 va_arg(count, PY_LONG_LONG)); 2491#endif 2492 else if (size_tflag) 2493 numprinted = sprintf(numberresult, fmt, 2494 va_arg(count, Py_ssize_t)); 2495 else 2496 numprinted = sprintf(numberresult, fmt, 2497 va_arg(count, int)); 2498 n += numprinted; 2499 /* advance by +1 to skip over the '\0' */ 2500 numberresult += (numprinted + 1); 2501 assert(*(numberresult - 1) == '\0'); 2502 assert(*(numberresult - 2) != '\0'); 2503 assert(numprinted >= 0); 2504 assert(numberresult <= numberresults + numbersize); 2505 break; 2506 case 'u': 2507 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2508 width, precision, 'u'); 2509 if (longflag) 2510 numprinted = sprintf(numberresult, fmt, 2511 va_arg(count, unsigned long)); 2512#ifdef HAVE_LONG_LONG 2513 else if (longlongflag) 2514 numprinted = sprintf(numberresult, fmt, 2515 va_arg(count, unsigned PY_LONG_LONG)); 2516#endif 2517 else if (size_tflag) 2518 numprinted = sprintf(numberresult, fmt, 2519 va_arg(count, size_t)); 2520 else 2521 numprinted = sprintf(numberresult, fmt, 2522 va_arg(count, unsigned int)); 2523 n += numprinted; 2524 numberresult += (numprinted + 1); 2525 assert(*(numberresult - 1) == '\0'); 2526 assert(*(numberresult - 2) != '\0'); 2527 assert(numprinted >= 0); 2528 assert(numberresult <= numberresults + numbersize); 2529 break; 
2530 case 'x': 2531 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2532 numprinted = sprintf(numberresult, fmt, va_arg(count, int)); 2533 n += numprinted; 2534 numberresult += (numprinted + 1); 2535 assert(*(numberresult - 1) == '\0'); 2536 assert(*(numberresult - 2) != '\0'); 2537 assert(numprinted >= 0); 2538 assert(numberresult <= numberresults + numbersize); 2539 break; 2540 case 'p': 2541 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2542 /* %p is ill-defined: ensure leading 0x. */ 2543 if (numberresult[1] == 'X') 2544 numberresult[1] = 'x'; 2545 else if (numberresult[1] != 'x') { 2546 memmove(numberresult + 2, numberresult, 2547 strlen(numberresult) + 1); 2548 numberresult[0] = '0'; 2549 numberresult[1] = 'x'; 2550 numprinted += 2; 2551 } 2552 n += numprinted; 2553 numberresult += (numprinted + 1); 2554 assert(*(numberresult - 1) == '\0'); 2555 assert(*(numberresult - 2) != '\0'); 2556 assert(numprinted >= 0); 2557 assert(numberresult <= numberresults + numbersize); 2558 break; 2559 case 's': 2560 { 2561 /* UTF-8 */ 2562 const char *s = va_arg(count, const char*); 2563 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); 2564 if (!str) 2565 goto fail; 2566 /* since PyUnicode_DecodeUTF8 returns already flexible 2567 unicode objects, there is no need to call ready on them */ 2568 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2569 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2570 n += PyUnicode_GET_LENGTH(str); 2571 /* Remember the str and switch to the next slot */ 2572 *callresult++ = str; 2573 break; 2574 } 2575 case 'U': 2576 { 2577 PyObject *obj = va_arg(count, PyObject *); 2578 assert(obj && _PyUnicode_CHECK(obj)); 2579 if (PyUnicode_READY(obj) == -1) 2580 goto fail; 2581 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2582 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2583 n += PyUnicode_GET_LENGTH(obj); 2584 break; 2585 } 2586 case 'V': 2587 { 2588 PyObject *obj = va_arg(count, PyObject *); 2589 const char *str = 
va_arg(count, const char *); 2590 PyObject *str_obj; 2591 assert(obj || str); 2592 assert(!obj || _PyUnicode_CHECK(obj)); 2593 if (obj) { 2594 if (PyUnicode_READY(obj) == -1) 2595 goto fail; 2596 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2597 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2598 n += PyUnicode_GET_LENGTH(obj); 2599 *callresult++ = NULL; 2600 } 2601 else { 2602 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); 2603 if (!str_obj) 2604 goto fail; 2605 if (PyUnicode_READY(str_obj) == -1) { 2606 Py_DECREF(str_obj); 2607 goto fail; 2608 } 2609 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2610 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2611 n += PyUnicode_GET_LENGTH(str_obj); 2612 *callresult++ = str_obj; 2613 } 2614 break; 2615 } 2616 case 'S': 2617 { 2618 PyObject *obj = va_arg(count, PyObject *); 2619 PyObject *str; 2620 assert(obj); 2621 str = PyObject_Str(obj); 2622 if (!str) 2623 goto fail; 2624 if (PyUnicode_READY(str) == -1) { 2625 Py_DECREF(str); 2626 goto fail; 2627 } 2628 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2629 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2630 n += PyUnicode_GET_LENGTH(str); 2631 /* Remember the str and switch to the next slot */ 2632 *callresult++ = str; 2633 break; 2634 } 2635 case 'R': 2636 { 2637 PyObject *obj = va_arg(count, PyObject *); 2638 PyObject *repr; 2639 assert(obj); 2640 repr = PyObject_Repr(obj); 2641 if (!repr) 2642 goto fail; 2643 if (PyUnicode_READY(repr) == -1) { 2644 Py_DECREF(repr); 2645 goto fail; 2646 } 2647 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2648 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2649 n += PyUnicode_GET_LENGTH(repr); 2650 /* Remember the repr and switch to the next slot */ 2651 *callresult++ = repr; 2652 break; 2653 } 2654 case 'A': 2655 { 2656 PyObject *obj = va_arg(count, PyObject *); 2657 PyObject *ascii; 2658 assert(obj); 2659 ascii = PyObject_ASCII(obj); 2660 if (!ascii) 2661 goto fail; 2662 if (PyUnicode_READY(ascii) == -1) { 2663 
Py_DECREF(ascii); 2664 goto fail; 2665 } 2666 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2667 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2668 n += PyUnicode_GET_LENGTH(ascii); 2669 /* Remember the repr and switch to the next slot */ 2670 *callresult++ = ascii; 2671 break; 2672 } 2673 default: 2674 /* if we stumble upon an unknown 2675 formatting code, copy the rest of 2676 the format string to the output 2677 string. (we cannot just skip the 2678 code, since there's no way to know 2679 what's in the argument list) */ 2680 n += strlen(p); 2681 goto expand; 2682 } 2683 } else 2684 n++; 2685 } 2686 expand: 2687 /* step 4: fill the buffer */ 2688 /* Since we've analyzed how much space we need, 2689 we don't have to resize the string. 2690 There can be no errors beyond this point. */ 2691 string = PyUnicode_New(n, maxchar); 2692 if (!string) 2693 goto fail; 2694 kind = PyUnicode_KIND(string); 2695 data = PyUnicode_DATA(string); 2696 callresult = callresults; 2697 numberresult = numberresults; 2698 2699 for (i = 0, f = format; *f; f++) { 2700 if (*f == '%') { 2701 const char* p; 2702 2703 p = f; 2704 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2705 /* checking for == because the last argument could be a empty 2706 string, which causes i to point to end, the assert at the end of 2707 the loop */ 2708 assert(i <= PyUnicode_GET_LENGTH(string)); 2709 2710 switch (*f) { 2711 case 'c': 2712 { 2713 const int ordinal = va_arg(vargs, int); 2714 PyUnicode_WRITE(kind, data, i++, ordinal); 2715 break; 2716 } 2717 case 'i': 2718 case 'd': 2719 case 'u': 2720 case 'x': 2721 case 'p': 2722 { 2723 Py_ssize_t written; 2724 /* unused, since we already have the result */ 2725 if (*f == 'p') 2726 (void) va_arg(vargs, void *); 2727 else 2728 (void) va_arg(vargs, int); 2729 /* extract the result from numberresults and append. 
*/ 2730 written = unicode_write_cstr(string, i, numberresult); 2731 /* skip over the separating '\0' */ 2732 i += written; 2733 numberresult += written; 2734 assert(*numberresult == '\0'); 2735 numberresult++; 2736 assert(numberresult <= numberresults + numbersize); 2737 break; 2738 } 2739 case 's': 2740 { 2741 /* unused, since we already have the result */ 2742 Py_ssize_t size; 2743 (void) va_arg(vargs, char *); 2744 size = PyUnicode_GET_LENGTH(*callresult); 2745 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2746 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); 2747 i += size; 2748 /* We're done with the unicode()/repr() => forget it */ 2749 Py_DECREF(*callresult); 2750 /* switch to next unicode()/repr() result */ 2751 ++callresult; 2752 break; 2753 } 2754 case 'U': 2755 { 2756 PyObject *obj = va_arg(vargs, PyObject *); 2757 Py_ssize_t size; 2758 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2759 size = PyUnicode_GET_LENGTH(obj); 2760 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size); 2761 i += size; 2762 break; 2763 } 2764 case 'V': 2765 { 2766 Py_ssize_t size; 2767 PyObject *obj = va_arg(vargs, PyObject *); 2768 va_arg(vargs, const char *); 2769 if (obj) { 2770 size = PyUnicode_GET_LENGTH(obj); 2771 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2772 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size); 2773 i += size; 2774 } else { 2775 size = PyUnicode_GET_LENGTH(*callresult); 2776 assert(PyUnicode_KIND(*callresult) <= 2777 PyUnicode_KIND(string)); 2778 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); 2779 i += size; 2780 Py_DECREF(*callresult); 2781 } 2782 ++callresult; 2783 break; 2784 } 2785 case 'S': 2786 case 'R': 2787 case 'A': 2788 { 2789 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); 2790 /* unused, since we already have the result */ 2791 (void) va_arg(vargs, PyObject *); 2792 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2793 
_PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); 2794 i += size; 2795 /* We're done with the unicode()/repr() => forget it */ 2796 Py_DECREF(*callresult); 2797 /* switch to next unicode()/repr() result */ 2798 ++callresult; 2799 break; 2800 } 2801 case '%': 2802 PyUnicode_WRITE(kind, data, i++, '%'); 2803 break; 2804 default: 2805 i += unicode_write_cstr(string, i, p); 2806 assert(i == PyUnicode_GET_LENGTH(string)); 2807 goto end; 2808 } 2809 } 2810 else { 2811 assert(i < PyUnicode_GET_LENGTH(string)); 2812 PyUnicode_WRITE(kind, data, i++, *f); 2813 } 2814 } 2815 assert(i == PyUnicode_GET_LENGTH(string)); 2816 2817 end: 2818 if (callresults) 2819 PyObject_Free(callresults); 2820 if (numberresults) 2821 PyObject_Free(numberresults); 2822 return unicode_result(string); 2823 fail: 2824 if (callresults) { 2825 PyObject **callresult2 = callresults; 2826 while (callresult2 < callresult) { 2827 Py_XDECREF(*callresult2); 2828 ++callresult2; 2829 } 2830 PyObject_Free(callresults); 2831 } 2832 if (numberresults) 2833 PyObject_Free(numberresults); 2834 return NULL; 2835} 2836 2837PyObject * 2838PyUnicode_FromFormat(const char *format, ...) 2839{ 2840 PyObject* ret; 2841 va_list vargs; 2842 2843#ifdef HAVE_STDARG_PROTOTYPES 2844 va_start(vargs, format); 2845#else 2846 va_start(vargs); 2847#endif 2848 ret = PyUnicode_FromFormatV(format, vargs); 2849 va_end(vargs); 2850 return ret; 2851} 2852 2853#ifdef HAVE_WCHAR_H 2854 2855/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2856 convert a Unicode object to a wide character string. 2857 2858 - If w is NULL: return the number of wide characters (including the null 2859 character) required to convert the unicode object. Ignore size argument. 2860 2861 - Otherwise: return the number of wide characters (excluding the null 2862 character) written into w. Write at most size wide characters (including 2863 the null character). 
*/ 2864static Py_ssize_t 2865unicode_aswidechar(PyObject *unicode, 2866 wchar_t *w, 2867 Py_ssize_t size) 2868{ 2869 Py_ssize_t res; 2870 const wchar_t *wstr; 2871 2872 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2873 if (wstr == NULL) 2874 return -1; 2875 2876 if (w != NULL) { 2877 if (size > res) 2878 size = res + 1; 2879 else 2880 res = size; 2881 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2882 return res; 2883 } 2884 else 2885 return res + 1; 2886} 2887 2888Py_ssize_t 2889PyUnicode_AsWideChar(PyObject *unicode, 2890 wchar_t *w, 2891 Py_ssize_t size) 2892{ 2893 if (unicode == NULL) { 2894 PyErr_BadInternalCall(); 2895 return -1; 2896 } 2897 return unicode_aswidechar(unicode, w, size); 2898} 2899 2900wchar_t* 2901PyUnicode_AsWideCharString(PyObject *unicode, 2902 Py_ssize_t *size) 2903{ 2904 wchar_t* buffer; 2905 Py_ssize_t buflen; 2906 2907 if (unicode == NULL) { 2908 PyErr_BadInternalCall(); 2909 return NULL; 2910 } 2911 2912 buflen = unicode_aswidechar(unicode, NULL, 0); 2913 if (buflen == -1) 2914 return NULL; 2915 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2916 PyErr_NoMemory(); 2917 return NULL; 2918 } 2919 2920 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2921 if (buffer == NULL) { 2922 PyErr_NoMemory(); 2923 return NULL; 2924 } 2925 buflen = unicode_aswidechar(unicode, buffer, buflen); 2926 if (buflen == -1) 2927 return NULL; 2928 if (size != NULL) 2929 *size = buflen; 2930 return buffer; 2931} 2932 2933#endif /* HAVE_WCHAR_H */ 2934 2935PyObject * 2936PyUnicode_FromOrdinal(int ordinal) 2937{ 2938 PyObject *v; 2939 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2940 PyErr_SetString(PyExc_ValueError, 2941 "chr() arg not in range(0x110000)"); 2942 return NULL; 2943 } 2944 2945 if (ordinal < 256) 2946 return get_latin1_char(ordinal); 2947 2948 v = PyUnicode_New(1, ordinal); 2949 if (v == NULL) 2950 return NULL; 2951 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2952 assert(_PyUnicode_CheckConsistency(v, 1)); 2953 return v; 
2954} 2955 2956PyObject * 2957PyUnicode_FromObject(register PyObject *obj) 2958{ 2959 /* XXX Perhaps we should make this API an alias of 2960 PyObject_Str() instead ?! */ 2961 if (PyUnicode_CheckExact(obj)) { 2962 if (PyUnicode_READY(obj) == -1) 2963 return NULL; 2964 Py_INCREF(obj); 2965 return obj; 2966 } 2967 if (PyUnicode_Check(obj)) { 2968 /* For a Unicode subtype that's not a Unicode object, 2969 return a true Unicode object with the same data. */ 2970 return _PyUnicode_Copy(obj); 2971 } 2972 PyErr_Format(PyExc_TypeError, 2973 "Can't convert '%.100s' object to str implicitly", 2974 Py_TYPE(obj)->tp_name); 2975 return NULL; 2976} 2977 2978PyObject * 2979PyUnicode_FromEncodedObject(register PyObject *obj, 2980 const char *encoding, 2981 const char *errors) 2982{ 2983 Py_buffer buffer; 2984 PyObject *v; 2985 2986 if (obj == NULL) { 2987 PyErr_BadInternalCall(); 2988 return NULL; 2989 } 2990 2991 /* Decoding bytes objects is the most common case and should be fast */ 2992 if (PyBytes_Check(obj)) { 2993 if (PyBytes_GET_SIZE(obj) == 0) { 2994 Py_INCREF(unicode_empty); 2995 v = unicode_empty; 2996 } 2997 else { 2998 v = PyUnicode_Decode( 2999 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 3000 encoding, errors); 3001 } 3002 return v; 3003 } 3004 3005 if (PyUnicode_Check(obj)) { 3006 PyErr_SetString(PyExc_TypeError, 3007 "decoding str is not supported"); 3008 return NULL; 3009 } 3010 3011 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 3012 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 3013 PyErr_Format(PyExc_TypeError, 3014 "coercing to str: need bytes, bytearray " 3015 "or buffer-like object, %.80s found", 3016 Py_TYPE(obj)->tp_name); 3017 return NULL; 3018 } 3019 3020 if (buffer.len == 0) { 3021 Py_INCREF(unicode_empty); 3022 v = unicode_empty; 3023 } 3024 else 3025 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 3026 3027 PyBuffer_Release(&buffer); 3028 return v; 3029} 3030 3031/* Convert encoding to 
lower case and replace '_' with '-' in order to 3032 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 3033 1 on success. */ 3034static int 3035normalize_encoding(const char *encoding, 3036 char *lower, 3037 size_t lower_len) 3038{ 3039 const char *e; 3040 char *l; 3041 char *l_end; 3042 3043 if (encoding == NULL) { 3044 strcpy(lower, "utf-8"); 3045 return 1; 3046 } 3047 e = encoding; 3048 l = lower; 3049 l_end = &lower[lower_len - 1]; 3050 while (*e) { 3051 if (l == l_end) 3052 return 0; 3053 if (Py_ISUPPER(*e)) { 3054 *l++ = Py_TOLOWER(*e++); 3055 } 3056 else if (*e == '_') { 3057 *l++ = '-'; 3058 e++; 3059 } 3060 else { 3061 *l++ = *e++; 3062 } 3063 } 3064 *l = '\0'; 3065 return 1; 3066} 3067 3068PyObject * 3069PyUnicode_Decode(const char *s, 3070 Py_ssize_t size, 3071 const char *encoding, 3072 const char *errors) 3073{ 3074 PyObject *buffer = NULL, *unicode; 3075 Py_buffer info; 3076 char lower[11]; /* Enough for any encoding shortcut */ 3077 3078 /* Shortcuts for common default encodings */ 3079 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3080 if ((strcmp(lower, "utf-8") == 0) || 3081 (strcmp(lower, "utf8") == 0)) 3082 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3083 else if ((strcmp(lower, "latin-1") == 0) || 3084 (strcmp(lower, "latin1") == 0) || 3085 (strcmp(lower, "iso-8859-1") == 0)) 3086 return PyUnicode_DecodeLatin1(s, size, errors); 3087#ifdef HAVE_MBCS 3088 else if (strcmp(lower, "mbcs") == 0) 3089 return PyUnicode_DecodeMBCS(s, size, errors); 3090#endif 3091 else if (strcmp(lower, "ascii") == 0) 3092 return PyUnicode_DecodeASCII(s, size, errors); 3093 else if (strcmp(lower, "utf-16") == 0) 3094 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3095 else if (strcmp(lower, "utf-32") == 0) 3096 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3097 } 3098 3099 /* Decode via the codec registry */ 3100 buffer = NULL; 3101 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3102 
goto onError; 3103 buffer = PyMemoryView_FromBuffer(&info); 3104 if (buffer == NULL) 3105 goto onError; 3106 unicode = PyCodec_Decode(buffer, encoding, errors); 3107 if (unicode == NULL) 3108 goto onError; 3109 if (!PyUnicode_Check(unicode)) { 3110 PyErr_Format(PyExc_TypeError, 3111 "decoder did not return a str object (type=%.400s)", 3112 Py_TYPE(unicode)->tp_name); 3113 Py_DECREF(unicode); 3114 goto onError; 3115 } 3116 Py_DECREF(buffer); 3117 return unicode_result(unicode); 3118 3119 onError: 3120 Py_XDECREF(buffer); 3121 return NULL; 3122} 3123 3124PyObject * 3125PyUnicode_AsDecodedObject(PyObject *unicode, 3126 const char *encoding, 3127 const char *errors) 3128{ 3129 PyObject *v; 3130 3131 if (!PyUnicode_Check(unicode)) { 3132 PyErr_BadArgument(); 3133 goto onError; 3134 } 3135 3136 if (encoding == NULL) 3137 encoding = PyUnicode_GetDefaultEncoding(); 3138 3139 /* Decode via the codec registry */ 3140 v = PyCodec_Decode(unicode, encoding, errors); 3141 if (v == NULL) 3142 goto onError; 3143 return unicode_result(v); 3144 3145 onError: 3146 return NULL; 3147} 3148 3149PyObject * 3150PyUnicode_AsDecodedUnicode(PyObject *unicode, 3151 const char *encoding, 3152 const char *errors) 3153{ 3154 PyObject *v; 3155 3156 if (!PyUnicode_Check(unicode)) { 3157 PyErr_BadArgument(); 3158 goto onError; 3159 } 3160 3161 if (encoding == NULL) 3162 encoding = PyUnicode_GetDefaultEncoding(); 3163 3164 /* Decode via the codec registry */ 3165 v = PyCodec_Decode(unicode, encoding, errors); 3166 if (v == NULL) 3167 goto onError; 3168 if (!PyUnicode_Check(v)) { 3169 PyErr_Format(PyExc_TypeError, 3170 "decoder did not return a str object (type=%.400s)", 3171 Py_TYPE(v)->tp_name); 3172 Py_DECREF(v); 3173 goto onError; 3174 } 3175 return unicode_result(v); 3176 3177 onError: 3178 return NULL; 3179} 3180 3181PyObject * 3182PyUnicode_Encode(const Py_UNICODE *s, 3183 Py_ssize_t size, 3184 const char *encoding, 3185 const char *errors) 3186{ 3187 PyObject *v, *unicode; 3188 3189 unicode = 
PyUnicode_FromUnicode(s, size); 3190 if (unicode == NULL) 3191 return NULL; 3192 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3193 Py_DECREF(unicode); 3194 return v; 3195} 3196 3197PyObject * 3198PyUnicode_AsEncodedObject(PyObject *unicode, 3199 const char *encoding, 3200 const char *errors) 3201{ 3202 PyObject *v; 3203 3204 if (!PyUnicode_Check(unicode)) { 3205 PyErr_BadArgument(); 3206 goto onError; 3207 } 3208 3209 if (encoding == NULL) 3210 encoding = PyUnicode_GetDefaultEncoding(); 3211 3212 /* Encode via the codec registry */ 3213 v = PyCodec_Encode(unicode, encoding, errors); 3214 if (v == NULL) 3215 goto onError; 3216 return v; 3217 3218 onError: 3219 return NULL; 3220} 3221 3222static size_t 3223wcstombs_errorpos(const wchar_t *wstr) 3224{ 3225 size_t len; 3226#if SIZEOF_WCHAR_T == 2 3227 wchar_t buf[3]; 3228#else 3229 wchar_t buf[2]; 3230#endif 3231 char outbuf[MB_LEN_MAX]; 3232 const wchar_t *start, *previous; 3233 3234#if SIZEOF_WCHAR_T == 2 3235 buf[2] = 0; 3236#else 3237 buf[1] = 0; 3238#endif 3239 start = wstr; 3240 while (*wstr != L'\0') 3241 { 3242 previous = wstr; 3243#if SIZEOF_WCHAR_T == 2 3244 if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) 3245 && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) 3246 { 3247 buf[0] = wstr[0]; 3248 buf[1] = wstr[1]; 3249 wstr += 2; 3250 } 3251 else { 3252 buf[0] = *wstr; 3253 buf[1] = 0; 3254 wstr++; 3255 } 3256#else 3257 buf[0] = *wstr; 3258 wstr++; 3259#endif 3260 len = wcstombs(outbuf, buf, sizeof(outbuf)); 3261 if (len == (size_t)-1) 3262 return previous - start; 3263 } 3264 3265 /* failed to find the unencodable character */ 3266 return 0; 3267} 3268 3269static int 3270locale_error_handler(const char *errors, int *surrogateescape) 3271{ 3272 if (errors == NULL) { 3273 *surrogateescape = 0; 3274 return 0; 3275 } 3276 3277 if (strcmp(errors, "strict") == 0) { 3278 *surrogateescape = 0; 3279 return 0; 3280 } 3281 if (strcmp(errors, "surrogateescape") == 0) { 3282 *surrogateescape = 1; 3283 return 0; 3284 } 3285 
PyErr_Format(PyExc_ValueError, 3286 "only 'strict' and 'surrogateescape' error handlers " 3287 "are supported, not '%s'", 3288 errors); 3289 return -1; 3290} 3291 3292PyObject * 3293PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3294{ 3295 Py_ssize_t wlen, wlen2; 3296 wchar_t *wstr; 3297 PyObject *bytes = NULL; 3298 char *errmsg; 3299 PyObject *reason; 3300 PyObject *exc; 3301 size_t error_pos; 3302 int surrogateescape; 3303 3304 if (locale_error_handler(errors, &surrogateescape) < 0) 3305 return NULL; 3306 3307 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3308 if (wstr == NULL) 3309 return NULL; 3310 3311 wlen2 = wcslen(wstr); 3312 if (wlen2 != wlen) { 3313 PyMem_Free(wstr); 3314 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3315 return NULL; 3316 } 3317 3318 if (surrogateescape) { 3319 /* locale encoding with surrogateescape */ 3320 char *str; 3321 3322 str = _Py_wchar2char(wstr, &error_pos); 3323 if (str == NULL) { 3324 if (error_pos == (size_t)-1) { 3325 PyErr_NoMemory(); 3326 PyMem_Free(wstr); 3327 return NULL; 3328 } 3329 else { 3330 goto encode_error; 3331 } 3332 } 3333 PyMem_Free(wstr); 3334 3335 bytes = PyBytes_FromString(str); 3336 PyMem_Free(str); 3337 } 3338 else { 3339 size_t len, len2; 3340 3341 len = wcstombs(NULL, wstr, 0); 3342 if (len == (size_t)-1) { 3343 error_pos = (size_t)-1; 3344 goto encode_error; 3345 } 3346 3347 bytes = PyBytes_FromStringAndSize(NULL, len); 3348 if (bytes == NULL) { 3349 PyMem_Free(wstr); 3350 return NULL; 3351 } 3352 3353 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3354 if (len2 == (size_t)-1 || len2 > len) { 3355 error_pos = (size_t)-1; 3356 goto encode_error; 3357 } 3358 PyMem_Free(wstr); 3359 } 3360 return bytes; 3361 3362encode_error: 3363 errmsg = strerror(errno); 3364 assert(errmsg != NULL); 3365 3366 if (error_pos == (size_t)-1) 3367 error_pos = wcstombs_errorpos(wstr); 3368 3369 PyMem_Free(wstr); 3370 Py_XDECREF(bytes); 3371 3372 if (errmsg != NULL) { 3373 size_t 
errlen; 3374 wstr = _Py_char2wchar(errmsg, &errlen); 3375 if (wstr != NULL) { 3376 reason = PyUnicode_FromWideChar(wstr, errlen); 3377 PyMem_Free(wstr); 3378 } else 3379 errmsg = NULL; 3380 } 3381 if (errmsg == NULL) 3382 reason = PyUnicode_FromString( 3383 "wcstombs() encountered an unencodable " 3384 "wide character"); 3385 if (reason == NULL) 3386 return NULL; 3387 3388 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3389 "locale", unicode, 3390 (Py_ssize_t)error_pos, 3391 (Py_ssize_t)(error_pos+1), 3392 reason); 3393 Py_DECREF(reason); 3394 if (exc != NULL) { 3395 PyCodec_StrictErrors(exc); 3396 Py_XDECREF(exc); 3397 } 3398 return NULL; 3399} 3400 3401PyObject * 3402PyUnicode_EncodeFSDefault(PyObject *unicode) 3403{ 3404#ifdef HAVE_MBCS 3405 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 3406#elif defined(__APPLE__) 3407 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3408#else 3409 PyInterpreterState *interp = PyThreadState_GET()->interp; 3410 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3411 cannot use it to encode and decode filenames before it is loaded. Load 3412 the Python codec requires to encode at least its own filename. Use the C 3413 version of the locale codec until the codec registry is initialized and 3414 the Python codec is loaded. 3415 3416 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3417 cannot only rely on it: check also interp->fscodec_initialized for 3418 subinterpreters. 
*/ 3419 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3420 return PyUnicode_AsEncodedString(unicode, 3421 Py_FileSystemDefaultEncoding, 3422 "surrogateescape"); 3423 } 3424 else { 3425 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3426 } 3427#endif 3428} 3429 3430PyObject * 3431PyUnicode_AsEncodedString(PyObject *unicode, 3432 const char *encoding, 3433 const char *errors) 3434{ 3435 PyObject *v; 3436 char lower[11]; /* Enough for any encoding shortcut */ 3437 3438 if (!PyUnicode_Check(unicode)) { 3439 PyErr_BadArgument(); 3440 return NULL; 3441 } 3442 3443 /* Shortcuts for common default encodings */ 3444 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3445 if ((strcmp(lower, "utf-8") == 0) || 3446 (strcmp(lower, "utf8") == 0)) 3447 { 3448 if (errors == NULL || strcmp(errors, "strict") == 0) 3449 return _PyUnicode_AsUTF8String(unicode, NULL); 3450 else 3451 return _PyUnicode_AsUTF8String(unicode, errors); 3452 } 3453 else if ((strcmp(lower, "latin-1") == 0) || 3454 (strcmp(lower, "latin1") == 0) || 3455 (strcmp(lower, "iso-8859-1") == 0)) 3456 return _PyUnicode_AsLatin1String(unicode, errors); 3457#ifdef HAVE_MBCS 3458 else if (strcmp(lower, "mbcs") == 0) 3459 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3460#endif 3461 else if (strcmp(lower, "ascii") == 0) 3462 return _PyUnicode_AsASCIIString(unicode, errors); 3463 } 3464 3465 /* Encode via the codec registry */ 3466 v = PyCodec_Encode(unicode, encoding, errors); 3467 if (v == NULL) 3468 return NULL; 3469 3470 /* The normal path */ 3471 if (PyBytes_Check(v)) 3472 return v; 3473 3474 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3475 if (PyByteArray_Check(v)) { 3476 int error; 3477 PyObject *b; 3478 3479 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3480 "encoder %s returned bytearray instead of bytes", 3481 encoding); 3482 if (error) { 3483 Py_DECREF(v); 3484 return NULL; 3485 } 3486 3487 b = 
/* Locate the first byte sequence of str (len bytes) that mbrtowc() cannot
   convert in the current locale.

   Returns the byte offset of that sequence.  Returns 0 when no such
   sequence is found, when an embedded terminator is reached first, or
   when mbrtowc() is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    for (; remaining != 0; ) {
        size_t nbytes = mbrtowc(&decoded, (char*)cursor, remaining, &state);

        if (nbytes == 0) {
            /* Reached end of string */
            break;
        }
        if (nbytes == (size_t)-1 || nbytes == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += nbytes;
        remaining -= nbytes;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
error_pos; 3573 char *errmsg; 3574 PyObject *reason, *exc; 3575 3576 if (locale_error_handler(errors, &surrogateescape) < 0) 3577 return NULL; 3578 3579 if (str[len] != '\0' || len != strlen(str)) { 3580 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3581 return NULL; 3582 } 3583 3584 if (surrogateescape) 3585 { 3586 wstr = _Py_char2wchar(str, &wlen); 3587 if (wstr == NULL) { 3588 if (wlen == (size_t)-1) 3589 PyErr_NoMemory(); 3590 else 3591 PyErr_SetFromErrno(PyExc_OSError); 3592 return NULL; 3593 } 3594 3595 unicode = PyUnicode_FromWideChar(wstr, wlen); 3596 PyMem_Free(wstr); 3597 } 3598 else { 3599#ifndef HAVE_BROKEN_MBSTOWCS 3600 wlen = mbstowcs(NULL, str, 0); 3601#else 3602 wlen = len; 3603#endif 3604 if (wlen == (size_t)-1) 3605 goto decode_error; 3606 if (wlen+1 <= smallbuf_len) { 3607 wstr = smallbuf; 3608 } 3609 else { 3610 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) 3611 return PyErr_NoMemory(); 3612 3613 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); 3614 if (!wstr) 3615 return PyErr_NoMemory(); 3616 } 3617 3618 /* This shouldn't fail now */ 3619 wlen2 = mbstowcs(wstr, str, wlen+1); 3620 if (wlen2 == (size_t)-1) { 3621 if (wstr != smallbuf) 3622 PyMem_Free(wstr); 3623 goto decode_error; 3624 } 3625#ifdef HAVE_BROKEN_MBSTOWCS 3626 assert(wlen2 == wlen); 3627#endif 3628 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3629 if (wstr != smallbuf) 3630 PyMem_Free(wstr); 3631 } 3632 return unicode; 3633 3634decode_error: 3635 errmsg = strerror(errno); 3636 assert(errmsg != NULL); 3637 3638 error_pos = mbstowcs_errorpos(str, len); 3639 if (errmsg != NULL) { 3640 size_t errlen; 3641 wstr = _Py_char2wchar(errmsg, &errlen); 3642 if (wstr != NULL) { 3643 reason = PyUnicode_FromWideChar(wstr, errlen); 3644 PyMem_Free(wstr); 3645 } else 3646 errmsg = NULL; 3647 } 3648 if (errmsg == NULL) 3649 reason = PyUnicode_FromString( 3650 "mbstowcs() encountered an invalid multibyte sequence"); 3651 if (reason == NULL) 3652 return NULL; 3653 3654 exc = 
PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3655 "locale", str, len, 3656 (Py_ssize_t)error_pos, 3657 (Py_ssize_t)(error_pos+1), 3658 reason); 3659 Py_DECREF(reason); 3660 if (exc != NULL) { 3661 PyCodec_StrictErrors(exc); 3662 Py_XDECREF(exc); 3663 } 3664 return NULL; 3665} 3666 3667PyObject* 3668PyUnicode_DecodeLocale(const char *str, const char *errors) 3669{ 3670 Py_ssize_t size = (Py_ssize_t)strlen(str); 3671 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3672} 3673 3674 3675PyObject* 3676PyUnicode_DecodeFSDefault(const char *s) { 3677 Py_ssize_t size = (Py_ssize_t)strlen(s); 3678 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3679} 3680 3681PyObject* 3682PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3683{ 3684#ifdef HAVE_MBCS 3685 return PyUnicode_DecodeMBCS(s, size, NULL); 3686#elif defined(__APPLE__) 3687 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3688#else 3689 PyInterpreterState *interp = PyThreadState_GET()->interp; 3690 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3691 cannot use it to encode and decode filenames before it is loaded. Load 3692 the Python codec requires to encode at least its own filename. Use the C 3693 version of the locale codec until the codec registry is initialized and 3694 the Python codec is loaded. 3695 3696 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3697 cannot only rely on it: check also interp->fscodec_initialized for 3698 subinterpreters. 
*/ 3699 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3700 return PyUnicode_Decode(s, size, 3701 Py_FileSystemDefaultEncoding, 3702 "surrogateescape"); 3703 } 3704 else { 3705 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3706 } 3707#endif 3708} 3709 3710 3711int 3712_PyUnicode_HasNULChars(PyObject* s) 3713{ 3714 static PyObject *nul = NULL; 3715 3716 if (nul == NULL) 3717 nul = PyUnicode_FromStringAndSize("\0", 1); 3718 if (nul == NULL) 3719 return -1; 3720 return PyUnicode_Contains(s, nul); 3721} 3722 3723 3724int 3725PyUnicode_FSConverter(PyObject* arg, void* addr) 3726{ 3727 PyObject *output = NULL; 3728 Py_ssize_t size; 3729 void *data; 3730 if (arg == NULL) { 3731 Py_DECREF(*(PyObject**)addr); 3732 return 1; 3733 } 3734 if (PyBytes_Check(arg)) { 3735 output = arg; 3736 Py_INCREF(output); 3737 } 3738 else { 3739 arg = PyUnicode_FromObject(arg); 3740 if (!arg) 3741 return 0; 3742 output = PyUnicode_EncodeFSDefault(arg); 3743 Py_DECREF(arg); 3744 if (!output) 3745 return 0; 3746 if (!PyBytes_Check(output)) { 3747 Py_DECREF(output); 3748 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3749 return 0; 3750 } 3751 } 3752 size = PyBytes_GET_SIZE(output); 3753 data = PyBytes_AS_STRING(output); 3754 if (size != strlen(data)) { 3755 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3756 Py_DECREF(output); 3757 return 0; 3758 } 3759 *(PyObject**)addr = output; 3760 return Py_CLEANUP_SUPPORTED; 3761} 3762 3763 3764int 3765PyUnicode_FSDecoder(PyObject* arg, void* addr) 3766{ 3767 PyObject *output = NULL; 3768 if (arg == NULL) { 3769 Py_DECREF(*(PyObject**)addr); 3770 return 1; 3771 } 3772 if (PyUnicode_Check(arg)) { 3773 if (PyUnicode_READY(arg) == -1) 3774 return 0; 3775 output = arg; 3776 Py_INCREF(output); 3777 } 3778 else { 3779 arg = PyBytes_FromObject(arg); 3780 if (!arg) 3781 return 0; 3782 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3783 PyBytes_GET_SIZE(arg)); 3784 
Py_DECREF(arg); 3785 if (!output) 3786 return 0; 3787 if (!PyUnicode_Check(output)) { 3788 Py_DECREF(output); 3789 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3790 return 0; 3791 } 3792 } 3793 if (PyUnicode_READY(output) == -1) { 3794 Py_DECREF(output); 3795 return 0; 3796 } 3797 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3798 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3799 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3800 Py_DECREF(output); 3801 return 0; 3802 } 3803 *(PyObject**)addr = output; 3804 return Py_CLEANUP_SUPPORTED; 3805} 3806 3807 3808char* 3809PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3810{ 3811 PyObject *bytes; 3812 3813 if (!PyUnicode_Check(unicode)) { 3814 PyErr_BadArgument(); 3815 return NULL; 3816 } 3817 if (PyUnicode_READY(unicode) == -1) 3818 return NULL; 3819 3820 if (PyUnicode_UTF8(unicode) == NULL) { 3821 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3822 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3823 if (bytes == NULL) 3824 return NULL; 3825 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3826 if (_PyUnicode_UTF8(unicode) == NULL) { 3827 Py_DECREF(bytes); 3828 return NULL; 3829 } 3830 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3831 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3832 PyBytes_AS_STRING(bytes), 3833 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3834 Py_DECREF(bytes); 3835 } 3836 3837 if (psize) 3838 *psize = PyUnicode_UTF8_LENGTH(unicode); 3839 return PyUnicode_UTF8(unicode); 3840} 3841 3842char* 3843PyUnicode_AsUTF8(PyObject *unicode) 3844{ 3845 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3846} 3847 3848#ifdef Py_DEBUG 3849static int unicode_as_unicode_calls = 0; 3850#endif 3851 3852 3853Py_UNICODE * 3854PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3855{ 3856 const unsigned char *one_byte; 3857#if SIZEOF_WCHAR_T == 4 3858 const Py_UCS2 *two_bytes; 3859#else 3860 const Py_UCS4 *four_bytes; 3861 
const Py_UCS4 *ucs4_end; 3862 Py_ssize_t num_surrogates; 3863#endif 3864 wchar_t *w; 3865 wchar_t *wchar_end; 3866 3867 if (!PyUnicode_Check(unicode)) { 3868 PyErr_BadArgument(); 3869 return NULL; 3870 } 3871 if (_PyUnicode_WSTR(unicode) == NULL) { 3872 /* Non-ASCII compact unicode object */ 3873 assert(_PyUnicode_KIND(unicode) != 0); 3874 assert(PyUnicode_IS_READY(unicode)); 3875 3876#ifdef Py_DEBUG 3877 ++unicode_as_unicode_calls; 3878#endif 3879 3880 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3881#if SIZEOF_WCHAR_T == 2 3882 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3883 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3884 num_surrogates = 0; 3885 3886 for (; four_bytes < ucs4_end; ++four_bytes) { 3887 if (*four_bytes > 0xFFFF) 3888 ++num_surrogates; 3889 } 3890 3891 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3892 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3893 if (!_PyUnicode_WSTR(unicode)) { 3894 PyErr_NoMemory(); 3895 return NULL; 3896 } 3897 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3898 3899 w = _PyUnicode_WSTR(unicode); 3900 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3901 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3902 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3903 if (*four_bytes > 0xFFFF) { 3904 assert(*four_bytes <= MAX_UNICODE); 3905 /* encode surrogate pair in this case */ 3906 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3907 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3908 } 3909 else 3910 *w = *four_bytes; 3911 3912 if (w > wchar_end) { 3913 assert(0 && "Miscalculated string end"); 3914 } 3915 } 3916 *w = 0; 3917#else 3918 /* sizeof(wchar_t) == 4 */ 3919 Py_FatalError("Impossible unicode object state, wstr and str " 3920 "should share memory already."); 3921 return NULL; 3922#endif 3923 } 3924 else { 3925 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3926 (_PyUnicode_LENGTH(unicode) + 1)); 3927 if 
(!_PyUnicode_WSTR(unicode)) { 3928 PyErr_NoMemory(); 3929 return NULL; 3930 } 3931 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3932 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3933 w = _PyUnicode_WSTR(unicode); 3934 wchar_end = w + _PyUnicode_LENGTH(unicode); 3935 3936 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3937 one_byte = PyUnicode_1BYTE_DATA(unicode); 3938 for (; w < wchar_end; ++one_byte, ++w) 3939 *w = *one_byte; 3940 /* null-terminate the wstr */ 3941 *w = 0; 3942 } 3943 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3944#if SIZEOF_WCHAR_T == 4 3945 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3946 for (; w < wchar_end; ++two_bytes, ++w) 3947 *w = *two_bytes; 3948 /* null-terminate the wstr */ 3949 *w = 0; 3950#else 3951 /* sizeof(wchar_t) == 2 */ 3952 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3953 _PyUnicode_WSTR(unicode) = NULL; 3954 Py_FatalError("Impossible unicode object state, wstr " 3955 "and str should share memory already."); 3956 return NULL; 3957#endif 3958 } 3959 else { 3960 assert(0 && "This should never happen."); 3961 } 3962 } 3963 } 3964 if (size != NULL) 3965 *size = PyUnicode_WSTR_LENGTH(unicode); 3966 return _PyUnicode_WSTR(unicode); 3967} 3968 3969Py_UNICODE * 3970PyUnicode_AsUnicode(PyObject *unicode) 3971{ 3972 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3973} 3974 3975 3976Py_ssize_t 3977PyUnicode_GetSize(PyObject *unicode) 3978{ 3979 if (!PyUnicode_Check(unicode)) { 3980 PyErr_BadArgument(); 3981 goto onError; 3982 } 3983 return PyUnicode_GET_SIZE(unicode); 3984 3985 onError: 3986 return -1; 3987} 3988 3989Py_ssize_t 3990PyUnicode_GetLength(PyObject *unicode) 3991{ 3992 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3993 PyErr_BadArgument(); 3994 return -1; 3995 } 3996 3997 return PyUnicode_GET_LENGTH(unicode); 3998} 3999 4000Py_UCS4 4001PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 4002{ 4003 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 
4004 PyErr_BadArgument(); 4005 return (Py_UCS4)-1; 4006 } 4007 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4008 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4009 return (Py_UCS4)-1; 4010 } 4011 return PyUnicode_READ_CHAR(unicode, index); 4012} 4013 4014int 4015PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 4016{ 4017 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 4018 PyErr_BadArgument(); 4019 return -1; 4020 } 4021 assert(PyUnicode_IS_READY(unicode)); 4022 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4023 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4024 return -1; 4025 } 4026 if (unicode_check_modifiable(unicode)) 4027 return -1; 4028 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 4029 PyErr_SetString(PyExc_ValueError, "character out of range"); 4030 return -1; 4031 } 4032 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 4033 index, ch); 4034 return 0; 4035} 4036 4037const char * 4038PyUnicode_GetDefaultEncoding(void) 4039{ 4040 return "utf-8"; 4041} 4042 4043/* create or adjust a UnicodeDecodeError */ 4044static void 4045make_decode_exception(PyObject **exceptionObject, 4046 const char *encoding, 4047 const char *input, Py_ssize_t length, 4048 Py_ssize_t startpos, Py_ssize_t endpos, 4049 const char *reason) 4050{ 4051 if (*exceptionObject == NULL) { 4052 *exceptionObject = PyUnicodeDecodeError_Create( 4053 encoding, input, length, startpos, endpos, reason); 4054 } 4055 else { 4056 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 4057 goto onError; 4058 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 4059 goto onError; 4060 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4061 goto onError; 4062 } 4063 return; 4064 4065onError: 4066 Py_DECREF(*exceptionObject); 4067 *exceptionObject = NULL; 4068} 4069 4070/* error handling callback helper: 4071 build arguments, call the callback and check the arguments, 
4072 if no exception occurred, copy the replacement to the output 4073 and adjust various state variables. 4074 return 0 on success, -1 on error 4075*/ 4076 4077static int 4078unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 4079 const char *encoding, const char *reason, 4080 const char **input, const char **inend, Py_ssize_t *startinpos, 4081 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4082 PyObject **output, Py_ssize_t *outpos) 4083{ 4084 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4085 4086 PyObject *restuple = NULL; 4087 PyObject *repunicode = NULL; 4088 Py_ssize_t outsize; 4089 Py_ssize_t insize; 4090 Py_ssize_t requiredsize; 4091 Py_ssize_t newpos; 4092 PyObject *inputobj = NULL; 4093 int res = -1; 4094 4095 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) 4096 outsize = PyUnicode_GET_LENGTH(*output); 4097 else 4098 outsize = _PyUnicode_WSTR_LENGTH(*output); 4099 4100 if (*errorHandler == NULL) { 4101 *errorHandler = PyCodec_LookupError(errors); 4102 if (*errorHandler == NULL) 4103 goto onError; 4104 } 4105 4106 make_decode_exception(exceptionObject, 4107 encoding, 4108 *input, *inend - *input, 4109 *startinpos, *endinpos, 4110 reason); 4111 if (*exceptionObject == NULL) 4112 goto onError; 4113 4114 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4115 if (restuple == NULL) 4116 goto onError; 4117 if (!PyTuple_Check(restuple)) { 4118 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4119 goto onError; 4120 } 4121 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4122 goto onError; 4123 if (PyUnicode_READY(repunicode) == -1) 4124 goto onError; 4125 4126 /* Copy back the bytes variables, which might have been modified by the 4127 callback */ 4128 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4129 if (!inputobj) 4130 goto onError; 4131 if (!PyBytes_Check(inputobj)) { 4132 
PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4133 } 4134 *input = PyBytes_AS_STRING(inputobj); 4135 insize = PyBytes_GET_SIZE(inputobj); 4136 *inend = *input + insize; 4137 /* we can DECREF safely, as the exception has another reference, 4138 so the object won't go away. */ 4139 Py_DECREF(inputobj); 4140 4141 if (newpos<0) 4142 newpos = insize+newpos; 4143 if (newpos<0 || newpos>insize) { 4144 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4145 goto onError; 4146 } 4147 4148 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) { 4149 /* need more space? (at least enough for what we 4150 have+the replacement+the rest of the string (starting 4151 at the new input position), so we won't have to check space 4152 when there are no errors in the rest of the string) */ 4153 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode); 4154 requiredsize = *outpos + replen + insize-newpos; 4155 if (requiredsize > outsize) { 4156 if (requiredsize<2*outsize) 4157 requiredsize = 2*outsize; 4158 if (unicode_resize(output, requiredsize) < 0) 4159 goto onError; 4160 } 4161 if (unicode_widen(output, *outpos, 4162 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0) 4163 goto onError; 4164 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen); 4165 *outpos += replen; 4166 } 4167 else { 4168 wchar_t *repwstr; 4169 Py_ssize_t repwlen; 4170 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4171 if (repwstr == NULL) 4172 goto onError; 4173 /* need more space? 
(at least enough for what we 4174 have+the replacement+the rest of the string (starting 4175 at the new input position), so we won't have to check space 4176 when there are no errors in the rest of the string) */ 4177 requiredsize = *outpos + repwlen + insize-newpos; 4178 if (requiredsize > outsize) { 4179 if (requiredsize < 2*outsize) 4180 requiredsize = 2*outsize; 4181 if (unicode_resize(output, requiredsize) < 0) 4182 goto onError; 4183 } 4184 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4185 *outpos += repwlen; 4186 } 4187 *endinpos = newpos; 4188 *inptr = *input + newpos; 4189 4190 /* we made it! */ 4191 res = 0; 4192 4193 onError: 4194 Py_XDECREF(restuple); 4195 return res; 4196} 4197 4198/* --- UTF-7 Codec -------------------------------------------------------- */ 4199 4200/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4201 4202/* Three simple macros defining base-64. */ 4203 4204/* Is c a base-64 character? */ 4205 4206#define IS_BASE64(c) \ 4207 (((c) >= 'A' && (c) <= 'Z') || \ 4208 ((c) >= 'a' && (c) <= 'z') || \ 4209 ((c) >= '0' && (c) <= '9') || \ 4210 (c) == '+' || (c) == '/') 4211 4212/* given that c is a base-64 character, what is its base-64 value? */ 4213 4214#define FROM_BASE64(c) \ 4215 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4216 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4217 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4218 (c) == '+' ? 62 : 63) 4219 4220/* What is the base-64 character of the bottom 6 bits of n? */ 4221 4222#define TO_BASE64(n) \ 4223 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4224 4225/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4226 * decoded as itself. We are permissive on decoding; the only ASCII 4227 * byte not decoding to itself is the + which begins a base64 4228 * string. 
*/ 4229 4230#define DECODE_DIRECT(c) \ 4231 ((c) <= 127 && (c) != '+') 4232 4233/* The UTF-7 encoder treats ASCII characters differently according to 4234 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4235 * the above). See RFC2152. This array identifies these different 4236 * sets: 4237 * 0 : "Set D" 4238 * alphanumeric and '(),-./:? 4239 * 1 : "Set O" 4240 * !"#$%&*;<=>@[]^_`{|} 4241 * 2 : "whitespace" 4242 * ht nl cr sp 4243 * 3 : special (must be base64 encoded) 4244 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 4245 */ 4246 4247static 4248char utf7_category[128] = { 4249/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 4250 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4251/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 4252 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4253/* sp ! " # $ % & ' ( ) * + , - . / */ 4254 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 4255/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 4256 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4257/* @ A B C D E F G H I J K L M N O */ 4258 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4259/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4260 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4261/* ` a b c d e f g h i j k l m n o */ 4262 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4263/* p q r s t u v w x y z { | } ~ del */ 4264 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4265}; 4266 4267/* ENCODE_DIRECT: this character should be encoded as itself. The 4268 * answer depends on whether we are encoding set O as itself, and also 4269 * on whether we are encoding whitespace as itself. RFC2152 makes it 4270 * clear that the answers to these questions vary between 4271 * applications, so this code needs to be flexible. 
*/ 4272 4273#define ENCODE_DIRECT(c, directO, directWS) \ 4274 ((c) < 128 && (c) > 0 && \ 4275 ((utf7_category[(c)] == 0) || \ 4276 (directWS && (utf7_category[(c)] == 2)) || \ 4277 (directO && (utf7_category[(c)] == 1)))) 4278 4279PyObject * 4280PyUnicode_DecodeUTF7(const char *s, 4281 Py_ssize_t size, 4282 const char *errors) 4283{ 4284 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 4285} 4286 4287/* The decoder. The only state we preserve is our read position, 4288 * i.e. how many characters we have consumed. So if we end in the 4289 * middle of a shift sequence we have to back off the read position 4290 * and the output to the beginning of the sequence, otherwise we lose 4291 * all the shift state (seen bits, number of bits seen, high 4292 * surrogate). */ 4293 4294PyObject * 4295PyUnicode_DecodeUTF7Stateful(const char *s, 4296 Py_ssize_t size, 4297 const char *errors, 4298 Py_ssize_t *consumed) 4299{ 4300 const char *starts = s; 4301 Py_ssize_t startinpos; 4302 Py_ssize_t endinpos; 4303 Py_ssize_t outpos; 4304 const char *e; 4305 PyObject *unicode; 4306 const char *errmsg = ""; 4307 int inShift = 0; 4308 Py_ssize_t shiftOutStart; 4309 unsigned int base64bits = 0; 4310 unsigned long base64buffer = 0; 4311 Py_UCS4 surrogate = 0; 4312 PyObject *errorHandler = NULL; 4313 PyObject *exc = NULL; 4314 4315 /* Start off assuming it's all ASCII. Widen later as necessary. 
*/ 4316 unicode = PyUnicode_New(size, 127); 4317 if (!unicode) 4318 return NULL; 4319 if (size == 0) { 4320 if (consumed) 4321 *consumed = 0; 4322 return unicode; 4323 } 4324 4325 shiftOutStart = outpos = 0; 4326 e = s + size; 4327 4328 while (s < e) { 4329 Py_UCS4 ch; 4330 restart: 4331 ch = (unsigned char) *s; 4332 4333 if (inShift) { /* in a base-64 section */ 4334 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4335 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4336 base64bits += 6; 4337 s++; 4338 if (base64bits >= 16) { 4339 /* we have enough bits for a UTF-16 value */ 4340 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4341 base64bits -= 16; 4342 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4343 if (surrogate) { 4344 /* expecting a second surrogate */ 4345 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4346 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4347 if (unicode_putchar(&unicode, &outpos, ch2) < 0) 4348 goto onError; 4349 surrogate = 0; 4350 continue; 4351 } 4352 else { 4353 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4354 goto onError; 4355 surrogate = 0; 4356 } 4357 } 4358 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4359 /* first surrogate */ 4360 surrogate = outCh; 4361 } 4362 else { 4363 if (unicode_putchar(&unicode, &outpos, outCh) < 0) 4364 goto onError; 4365 } 4366 } 4367 } 4368 else { /* now leaving a base-64 section */ 4369 inShift = 0; 4370 s++; 4371 if (surrogate) { 4372 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4373 goto onError; 4374 surrogate = 0; 4375 } 4376 if (base64bits > 0) { /* left-over bits */ 4377 if (base64bits >= 6) { 4378 /* We've seen at least one base-64 character */ 4379 errmsg = "partial character in shift sequence"; 4380 goto utf7Error; 4381 } 4382 else { 4383 /* Some bits remain; they should be zero */ 4384 if (base64buffer != 0) { 4385 errmsg = "non-zero padding bits in shift sequence"; 4386 goto utf7Error; 4387 } 4388 } 4389 } 4390 if (ch != 
'-') { 4391 /* '-' is absorbed; other terminating 4392 characters are preserved */ 4393 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4394 goto onError; 4395 } 4396 } 4397 } 4398 else if ( ch == '+' ) { 4399 startinpos = s-starts; 4400 s++; /* consume '+' */ 4401 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4402 s++; 4403 if (unicode_putchar(&unicode, &outpos, '+') < 0) 4404 goto onError; 4405 } 4406 else { /* begin base64-encoded section */ 4407 inShift = 1; 4408 shiftOutStart = outpos; 4409 base64bits = 0; 4410 } 4411 } 4412 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4413 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4414 goto onError; 4415 s++; 4416 } 4417 else { 4418 startinpos = s-starts; 4419 s++; 4420 errmsg = "unexpected special character"; 4421 goto utf7Error; 4422 } 4423 continue; 4424utf7Error: 4425 endinpos = s-starts; 4426 if (unicode_decode_call_errorhandler( 4427 errors, &errorHandler, 4428 "utf7", errmsg, 4429 &starts, &e, &startinpos, &endinpos, &exc, &s, 4430 &unicode, &outpos)) 4431 goto onError; 4432 } 4433 4434 /* end of string */ 4435 4436 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4437 /* if we're in an inconsistent state, that's an error */ 4438 if (surrogate || 4439 (base64bits >= 6) || 4440 (base64bits > 0 && base64buffer != 0)) { 4441 endinpos = size; 4442 if (unicode_decode_call_errorhandler( 4443 errors, &errorHandler, 4444 "utf7", "unterminated shift sequence", 4445 &starts, &e, &startinpos, &endinpos, &exc, &s, 4446 &unicode, &outpos)) 4447 goto onError; 4448 if (s < e) 4449 goto restart; 4450 } 4451 } 4452 4453 /* return state */ 4454 if (consumed) { 4455 if (inShift) { 4456 outpos = shiftOutStart; /* back off output */ 4457 *consumed = startinpos; 4458 } 4459 else { 4460 *consumed = s-starts; 4461 } 4462 } 4463 4464 if (unicode_resize(&unicode, outpos) < 0) 4465 goto onError; 4466 4467 Py_XDECREF(errorHandler); 4468 Py_XDECREF(exc); 4469 return unicode_result(unicode); 4470 
4471 onError: 4472 Py_XDECREF(errorHandler); 4473 Py_XDECREF(exc); 4474 Py_DECREF(unicode); 4475 return NULL; 4476} 4477 4478 4479PyObject * 4480_PyUnicode_EncodeUTF7(PyObject *str, 4481 int base64SetO, 4482 int base64WhiteSpace, 4483 const char *errors) 4484{ 4485 int kind; 4486 void *data; 4487 Py_ssize_t len; 4488 PyObject *v; 4489 Py_ssize_t allocated; 4490 int inShift = 0; 4491 Py_ssize_t i; 4492 unsigned int base64bits = 0; 4493 unsigned long base64buffer = 0; 4494 char * out; 4495 char * start; 4496 4497 if (PyUnicode_READY(str) == -1) 4498 return NULL; 4499 kind = PyUnicode_KIND(str); 4500 data = PyUnicode_DATA(str); 4501 len = PyUnicode_GET_LENGTH(str); 4502 4503 if (len == 0) 4504 return PyBytes_FromStringAndSize(NULL, 0); 4505 4506 /* It might be possible to tighten this worst case */ 4507 allocated = 8 * len; 4508 if (allocated / 8 != len) 4509 return PyErr_NoMemory(); 4510 4511 v = PyBytes_FromStringAndSize(NULL, allocated); 4512 if (v == NULL) 4513 return NULL; 4514 4515 start = out = PyBytes_AS_STRING(v); 4516 for (i = 0; i < len; ++i) { 4517 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4518 4519 if (inShift) { 4520 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4521 /* shifting out */ 4522 if (base64bits) { /* output remaining bits */ 4523 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4524 base64buffer = 0; 4525 base64bits = 0; 4526 } 4527 inShift = 0; 4528 /* Characters not in the BASE64 set implicitly unshift the sequence 4529 so no '-' is required, except if the character is itself a '-' */ 4530 if (IS_BASE64(ch) || ch == '-') { 4531 *out++ = '-'; 4532 } 4533 *out++ = (char) ch; 4534 } 4535 else { 4536 goto encode_char; 4537 } 4538 } 4539 else { /* not in a shift sequence */ 4540 if (ch == '+') { 4541 *out++ = '+'; 4542 *out++ = '-'; 4543 } 4544 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4545 *out++ = (char) ch; 4546 } 4547 else { 4548 *out++ = '+'; 4549 inShift = 1; 4550 goto encode_char; 4551 } 4552 } 4553 
continue; 4554encode_char: 4555 if (ch >= 0x10000) { 4556 assert(ch <= MAX_UNICODE); 4557 4558 /* code first surrogate */ 4559 base64bits += 16; 4560 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4561 while (base64bits >= 6) { 4562 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4563 base64bits -= 6; 4564 } 4565 /* prepare second surrogate */ 4566 ch = Py_UNICODE_LOW_SURROGATE(ch); 4567 } 4568 base64bits += 16; 4569 base64buffer = (base64buffer << 16) | ch; 4570 while (base64bits >= 6) { 4571 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4572 base64bits -= 6; 4573 } 4574 } 4575 if (base64bits) 4576 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4577 if (inShift) 4578 *out++ = '-'; 4579 if (_PyBytes_Resize(&v, out - start) < 0) 4580 return NULL; 4581 return v; 4582} 4583PyObject * 4584PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4585 Py_ssize_t size, 4586 int base64SetO, 4587 int base64WhiteSpace, 4588 const char *errors) 4589{ 4590 PyObject *result; 4591 PyObject *tmp = PyUnicode_FromUnicode(s, size); 4592 if (tmp == NULL) 4593 return NULL; 4594 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4595 base64WhiteSpace, errors); 4596 Py_DECREF(tmp); 4597 return result; 4598} 4599 4600#undef IS_BASE64 4601#undef FROM_BASE64 4602#undef TO_BASE64 4603#undef DECODE_DIRECT 4604#undef ENCODE_DIRECT 4605 4606/* --- UTF-8 Codec -------------------------------------------------------- */ 4607 4608PyObject * 4609PyUnicode_DecodeUTF8(const char *s, 4610 Py_ssize_t size, 4611 const char *errors) 4612{ 4613 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4614} 4615 4616#include "stringlib/asciilib.h" 4617#include "stringlib/codecs.h" 4618#include "stringlib/undef.h" 4619 4620#include "stringlib/ucs1lib.h" 4621#include "stringlib/codecs.h" 4622#include "stringlib/undef.h" 4623 4624#include "stringlib/ucs2lib.h" 4625#include "stringlib/codecs.h" 4626#include "stringlib/undef.h" 4627 4628#include "stringlib/ucs4lib.h" 4629#include 
/* Mask to check or force alignment of a pointer to C 'long' boundaries */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080L
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080L
#else
# error C 'long' size should be either 4 or 8!
#endif

/* Copy the leading run of ASCII bytes of [start, end) into `dest`.

   Scanning stops at the first byte that has its high bit set.  Returns the
   number of bytes copied, which is also the offset of that first non-ASCII
   byte (or end - start when the whole input is ASCII).  `dest` must have
   room for end - start bytes. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* `end` rounded down to a multiple of sizeof(long) */
    const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);

#if SIZEOF_LONG <= SIZEOF_VOID_P
    /* the word-wise stores below require an aligned destination */
    assert(!((size_t) dest & LONG_PTR_MASK));
    if (!((size_t) p & LONG_PTR_MASK)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help register allocation */
        register const char *_p = p;
        register Py_UCS1 * q = dest;
        /* copy one 'long' at a time while every byte in it is ASCII */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte: the unaligned tail, or the word that
           contains the first non-ASCII byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
    /* Generic path: only locate the end of the ASCII run here; the bytes
       are copied with a single memcpy() afterwards. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (!((size_t) p & LONG_PTR_MASK)) {
            /* Help register allocation */
            register const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}

/* Decode `size` bytes of UTF-8 starting at `s` into a new Unicode object.

   Parameters:
     s, size  - input buffer.
     errors   - error handler name, passed to
                unicode_decode_call_errorhandler().
     consumed - if non-NULL, an incomplete sequence at the end of the input
                is not reported as an error: decoding stops there and
                *consumed receives the number of input bytes decoded.

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    PyObject *unicode;
    const char *starts = s;
    const char *end = s + size;
    Py_ssize_t outpos;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    /* Start with an ASCII-only result of `size` characters; it is trimmed
       to the decoded length at End. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;

    /* copy the leading ASCII run verbatim, then decode the rest */
    outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
    s += outpos;
    while (s < end) {
        Py_UCS4 ch;
        /* re-read the kind on every pass: `unicode` is handed by address
           to unicode_putchar() and the error handler below */
        int kind = PyUnicode_KIND(unicode);
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(unicode))
                ch = asciilib_utf8_decode(&s, end,
                        PyUnicode_1BYTE_DATA(unicode), &outpos);
            else
                ch = ucs1lib_utf8_decode(&s, end,
                        PyUnicode_1BYTE_DATA(unicode), &outpos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end,
                    PyUnicode_2BYTE_DATA(unicode), &outpos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end,
                    PyUnicode_4BYTE_DATA(unicode), &outpos);
        }

        /* 0, 1 and 2 are status codes from the stringlib decoder; any
           other value is a character that is stored with
           unicode_putchar(). */
        switch (ch) {
        case 0:
            /* input exhausted, or (stateful mode) an incomplete sequence
               left for the next call */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            /* extend the error range over the continuation bytes */
            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
                endinpos++;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
                endinpos++;
            break;
        default:
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

End:
    /* trim the result to the number of characters actually written */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(unicode);
    return NULL;
}

#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Returns a NUL-terminated wchar_t buffer allocated with PyMem_Malloc(),
   or NULL on failure.  Bytes that cannot be decoded are stored as
   0xDC00 + byte value. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* NOTE(review): this failure path returns NULL without setting an
       exception, unlike the overflow check above -- confirm what callers
       expect. */
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
        /* use the stringlib decoder whose unit width matches wchar_t */
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            assert(Py_UNICODE_IS_SURROGATE(ch));
            /*  compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* a zero result with the input exhausted means decoding is
               complete */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Else allocate the 4870 maximum possible needed (4 result bytes per Unicode character), and return 4871 the excess memory at the end. 4872*/ 4873PyObject * 4874_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4875{ 4876 enum PyUnicode_Kind kind; 4877 void *data; 4878 Py_ssize_t size; 4879 4880 if (!PyUnicode_Check(unicode)) { 4881 PyErr_BadArgument(); 4882 return NULL; 4883 } 4884 4885 if (PyUnicode_READY(unicode) == -1) 4886 return NULL; 4887 4888 if (PyUnicode_UTF8(unicode)) 4889 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4890 PyUnicode_UTF8_LENGTH(unicode)); 4891 4892 kind = PyUnicode_KIND(unicode); 4893 data = PyUnicode_DATA(unicode); 4894 size = PyUnicode_GET_LENGTH(unicode); 4895 4896 switch (kind) { 4897 default: 4898 assert(0); 4899 case PyUnicode_1BYTE_KIND: 4900 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4901 assert(!PyUnicode_IS_ASCII(unicode)); 4902 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4903 case PyUnicode_2BYTE_KIND: 4904 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4905 case PyUnicode_4BYTE_KIND: 4906 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4907 } 4908} 4909 4910PyObject * 4911PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4912 Py_ssize_t size, 4913 const char *errors) 4914{ 4915 PyObject *v, *unicode; 4916 4917 unicode = PyUnicode_FromUnicode(s, size); 4918 if (unicode == NULL) 4919 return NULL; 4920 v = _PyUnicode_AsUTF8String(unicode, errors); 4921 Py_DECREF(unicode); 4922 return v; 4923} 4924 4925PyObject * 4926PyUnicode_AsUTF8String(PyObject *unicode) 4927{ 4928 return _PyUnicode_AsUTF8String(unicode, NULL); 4929} 4930 4931/* --- UTF-32 Codec ------------------------------------------------------- */ 4932 4933PyObject * 4934PyUnicode_DecodeUTF32(const char *s, 4935 Py_ssize_t size, 4936 const char *errors, 4937 int *byteorder) 4938{ 4939 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4940} 4941 4942PyObject * 
4943PyUnicode_DecodeUTF32Stateful(const char *s, 4944 Py_ssize_t size, 4945 const char *errors, 4946 int *byteorder, 4947 Py_ssize_t *consumed) 4948{ 4949 const char *starts = s; 4950 Py_ssize_t startinpos; 4951 Py_ssize_t endinpos; 4952 Py_ssize_t outpos; 4953 PyObject *unicode; 4954 const unsigned char *q, *e; 4955 int bo = 0; /* assume native ordering by default */ 4956 const char *errmsg = ""; 4957 /* Offsets from q for retrieving bytes in the right order. */ 4958#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4959 int iorder[] = {0, 1, 2, 3}; 4960#else 4961 int iorder[] = {3, 2, 1, 0}; 4962#endif 4963 PyObject *errorHandler = NULL; 4964 PyObject *exc = NULL; 4965 4966 q = (unsigned char *)s; 4967 e = q + size; 4968 4969 if (byteorder) 4970 bo = *byteorder; 4971 4972 /* Check for BOM marks (U+FEFF) in the input and adjust current 4973 byte order setting accordingly. In native mode, the leading BOM 4974 mark is skipped, in all other modes, it is copied to the output 4975 stream as-is (giving a ZWNBSP character). 
*/ 4976 if (bo == 0) { 4977 if (size >= 4) { 4978 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4979 (q[iorder[1]] << 8) | q[iorder[0]]; 4980#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4981 if (bom == 0x0000FEFF) { 4982 q += 4; 4983 bo = -1; 4984 } 4985 else if (bom == 0xFFFE0000) { 4986 q += 4; 4987 bo = 1; 4988 } 4989#else 4990 if (bom == 0x0000FEFF) { 4991 q += 4; 4992 bo = 1; 4993 } 4994 else if (bom == 0xFFFE0000) { 4995 q += 4; 4996 bo = -1; 4997 } 4998#endif 4999 } 5000 } 5001 5002 if (bo == -1) { 5003 /* force LE */ 5004 iorder[0] = 0; 5005 iorder[1] = 1; 5006 iorder[2] = 2; 5007 iorder[3] = 3; 5008 } 5009 else if (bo == 1) { 5010 /* force BE */ 5011 iorder[0] = 3; 5012 iorder[1] = 2; 5013 iorder[2] = 1; 5014 iorder[3] = 0; 5015 } 5016 5017 /* This might be one to much, because of a BOM */ 5018 unicode = PyUnicode_New((size+3)/4, 127); 5019 if (!unicode) 5020 return NULL; 5021 if (size == 0) 5022 return unicode; 5023 outpos = 0; 5024 5025 while (q < e) { 5026 Py_UCS4 ch; 5027 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5028 if (e-q<4) { 5029 if (consumed) 5030 break; 5031 errmsg = "truncated data"; 5032 startinpos = ((const char *)q)-starts; 5033 endinpos = ((const char *)e)-starts; 5034 goto utf32Error; 5035 /* The remaining input chars are ignored if the callback 5036 chooses to skip the input */ 5037 } 5038 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 5039 (q[iorder[1]] << 8) | q[iorder[0]]; 5040 5041 if (ch >= 0x110000) 5042 { 5043 errmsg = "codepoint not in range(0x110000)"; 5044 startinpos = ((const char *)q)-starts; 5045 endinpos = startinpos+4; 5046 goto utf32Error; 5047 } 5048 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5049 goto onError; 5050 q += 4; 5051 continue; 5052 utf32Error: 5053 if (unicode_decode_call_errorhandler( 5054 errors, &errorHandler, 5055 "utf32", errmsg, 5056 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5057 &unicode, &outpos)) 5058 goto onError; 5059 } 5060 5061 if (byteorder) 5062 *byteorder = bo; 5063 5064 if (consumed) 5065 *consumed = (const char *)q-starts; 5066 5067 /* Adjust length */ 5068 if (unicode_resize(&unicode, outpos) < 0) 5069 goto onError; 5070 5071 Py_XDECREF(errorHandler); 5072 Py_XDECREF(exc); 5073 return unicode_result(unicode); 5074 5075 onError: 5076 Py_DECREF(unicode); 5077 Py_XDECREF(errorHandler); 5078 Py_XDECREF(exc); 5079 return NULL; 5080} 5081 5082PyObject * 5083_PyUnicode_EncodeUTF32(PyObject *str, 5084 const char *errors, 5085 int byteorder) 5086{ 5087 int kind; 5088 void *data; 5089 Py_ssize_t len; 5090 PyObject *v; 5091 unsigned char *p; 5092 Py_ssize_t nsize, bytesize, i; 5093 /* Offsets from p for storing byte pairs in the right order. 
*/ 5094#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5095 int iorder[] = {0, 1, 2, 3}; 5096#else 5097 int iorder[] = {3, 2, 1, 0}; 5098#endif 5099 5100#define STORECHAR(CH) \ 5101 do { \ 5102 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5103 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5104 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5105 p[iorder[0]] = (CH) & 0xff; \ 5106 p += 4; \ 5107 } while(0) 5108 5109 if (!PyUnicode_Check(str)) { 5110 PyErr_BadArgument(); 5111 return NULL; 5112 } 5113 if (PyUnicode_READY(str) == -1) 5114 return NULL; 5115 kind = PyUnicode_KIND(str); 5116 data = PyUnicode_DATA(str); 5117 len = PyUnicode_GET_LENGTH(str); 5118 5119 nsize = len + (byteorder == 0); 5120 bytesize = nsize * 4; 5121 if (bytesize / 4 != nsize) 5122 return PyErr_NoMemory(); 5123 v = PyBytes_FromStringAndSize(NULL, bytesize); 5124 if (v == NULL) 5125 return NULL; 5126 5127 p = (unsigned char *)PyBytes_AS_STRING(v); 5128 if (byteorder == 0) 5129 STORECHAR(0xFEFF); 5130 if (len == 0) 5131 goto done; 5132 5133 if (byteorder == -1) { 5134 /* force LE */ 5135 iorder[0] = 0; 5136 iorder[1] = 1; 5137 iorder[2] = 2; 5138 iorder[3] = 3; 5139 } 5140 else if (byteorder == 1) { 5141 /* force BE */ 5142 iorder[0] = 3; 5143 iorder[1] = 2; 5144 iorder[2] = 1; 5145 iorder[3] = 0; 5146 } 5147 5148 for (i = 0; i < len; i++) 5149 STORECHAR(PyUnicode_READ(kind, data, i)); 5150 5151 done: 5152 return v; 5153#undef STORECHAR 5154} 5155 5156PyObject * 5157PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5158 Py_ssize_t size, 5159 const char *errors, 5160 int byteorder) 5161{ 5162 PyObject *result; 5163 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5164 if (tmp == NULL) 5165 return NULL; 5166 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5167 Py_DECREF(tmp); 5168 return result; 5169} 5170 5171PyObject * 5172PyUnicode_AsUTF32String(PyObject *unicode) 5173{ 5174 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5175} 5176 5177/* --- UTF-16 Codec ------------------------------------------------------- */ 5178 5179PyObject 
* 5180PyUnicode_DecodeUTF16(const char *s, 5181 Py_ssize_t size, 5182 const char *errors, 5183 int *byteorder) 5184{ 5185 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5186} 5187 5188PyObject * 5189PyUnicode_DecodeUTF16Stateful(const char *s, 5190 Py_ssize_t size, 5191 const char *errors, 5192 int *byteorder, 5193 Py_ssize_t *consumed) 5194{ 5195 const char *starts = s; 5196 Py_ssize_t startinpos; 5197 Py_ssize_t endinpos; 5198 Py_ssize_t outpos; 5199 PyObject *unicode; 5200 const unsigned char *q, *e; 5201 int bo = 0; /* assume native ordering by default */ 5202 int native_ordering; 5203 const char *errmsg = ""; 5204 PyObject *errorHandler = NULL; 5205 PyObject *exc = NULL; 5206 5207 q = (unsigned char *)s; 5208 e = q + size; 5209 5210 if (byteorder) 5211 bo = *byteorder; 5212 5213 /* Check for BOM marks (U+FEFF) in the input and adjust current 5214 byte order setting accordingly. In native mode, the leading BOM 5215 mark is skipped, in all other modes, it is copied to the output 5216 stream as-is (giving a ZWNBSP character). 
*/ 5217 if (bo == 0 && size >= 2) { 5218 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5219 if (bom == 0xFEFF) { 5220 q += 2; 5221 bo = -1; 5222 } 5223 else if (bom == 0xFFFE) { 5224 q += 2; 5225 bo = 1; 5226 } 5227 if (byteorder) 5228 *byteorder = bo; 5229 } 5230 5231 if (q == e) { 5232 if (consumed) 5233 *consumed = size; 5234 Py_INCREF(unicode_empty); 5235 return unicode_empty; 5236 } 5237 5238#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5239 native_ordering = bo <= 0; 5240#else 5241 native_ordering = bo >= 0; 5242#endif 5243 5244 /* Note: size will always be longer than the resulting Unicode 5245 character count */ 5246 unicode = PyUnicode_New((e - q + 1) / 2, 127); 5247 if (!unicode) 5248 return NULL; 5249 5250 outpos = 0; 5251 while (1) { 5252 Py_UCS4 ch = 0; 5253 if (e - q >= 2) { 5254 int kind = PyUnicode_KIND(unicode); 5255 if (kind == PyUnicode_1BYTE_KIND) { 5256 if (PyUnicode_IS_ASCII(unicode)) 5257 ch = asciilib_utf16_decode(&q, e, 5258 PyUnicode_1BYTE_DATA(unicode), &outpos, 5259 native_ordering); 5260 else 5261 ch = ucs1lib_utf16_decode(&q, e, 5262 PyUnicode_1BYTE_DATA(unicode), &outpos, 5263 native_ordering); 5264 } else if (kind == PyUnicode_2BYTE_KIND) { 5265 ch = ucs2lib_utf16_decode(&q, e, 5266 PyUnicode_2BYTE_DATA(unicode), &outpos, 5267 native_ordering); 5268 } else { 5269 assert(kind == PyUnicode_4BYTE_KIND); 5270 ch = ucs4lib_utf16_decode(&q, e, 5271 PyUnicode_4BYTE_DATA(unicode), &outpos, 5272 native_ordering); 5273 } 5274 } 5275 5276 switch (ch) 5277 { 5278 case 0: 5279 /* remaining byte at the end? 
(size should be even) */ 5280 if (q == e || consumed) 5281 goto End; 5282 errmsg = "truncated data"; 5283 startinpos = ((const char *)q) - starts; 5284 endinpos = ((const char *)e) - starts; 5285 break; 5286 /* The remaining input chars are ignored if the callback 5287 chooses to skip the input */ 5288 case 1: 5289 errmsg = "unexpected end of data"; 5290 startinpos = ((const char *)q) - 2 - starts; 5291 endinpos = ((const char *)e) - starts; 5292 break; 5293 case 2: 5294 errmsg = "illegal encoding"; 5295 startinpos = ((const char *)q) - 2 - starts; 5296 endinpos = startinpos + 2; 5297 break; 5298 case 3: 5299 errmsg = "illegal UTF-16 surrogate"; 5300 startinpos = ((const char *)q) - 4 - starts; 5301 endinpos = startinpos + 2; 5302 break; 5303 default: 5304 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5305 goto onError; 5306 continue; 5307 } 5308 5309 if (unicode_decode_call_errorhandler( 5310 errors, 5311 &errorHandler, 5312 "utf16", errmsg, 5313 &starts, 5314 (const char **)&e, 5315 &startinpos, 5316 &endinpos, 5317 &exc, 5318 (const char **)&q, 5319 &unicode, 5320 &outpos)) 5321 goto onError; 5322 } 5323 5324End: 5325 if (consumed) 5326 *consumed = (const char *)q-starts; 5327 5328 /* Adjust length */ 5329 if (unicode_resize(&unicode, outpos) < 0) 5330 goto onError; 5331 5332 Py_XDECREF(errorHandler); 5333 Py_XDECREF(exc); 5334 return unicode_result(unicode); 5335 5336 onError: 5337 Py_DECREF(unicode); 5338 Py_XDECREF(errorHandler); 5339 Py_XDECREF(exc); 5340 return NULL; 5341} 5342 5343PyObject * 5344_PyUnicode_EncodeUTF16(PyObject *str, 5345 const char *errors, 5346 int byteorder) 5347{ 5348 enum PyUnicode_Kind kind; 5349 const void *data; 5350 Py_ssize_t len; 5351 PyObject *v; 5352 unsigned short *out; 5353 Py_ssize_t bytesize; 5354 Py_ssize_t pairs; 5355#ifdef WORDS_BIGENDIAN 5356 int native_ordering = byteorder >= 0; 5357#else 5358 int native_ordering = byteorder <= 0; 5359#endif 5360 5361 if (!PyUnicode_Check(str)) { 5362 PyErr_BadArgument(); 5363 return 
NULL; 5364 } 5365 if (PyUnicode_READY(str) == -1) 5366 return NULL; 5367 kind = PyUnicode_KIND(str); 5368 data = PyUnicode_DATA(str); 5369 len = PyUnicode_GET_LENGTH(str); 5370 5371 pairs = 0; 5372 if (kind == PyUnicode_4BYTE_KIND) { 5373 const Py_UCS4 *in = (const Py_UCS4 *)data; 5374 const Py_UCS4 *end = in + len; 5375 while (in < end) 5376 if (*in++ >= 0x10000) 5377 pairs++; 5378 } 5379 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5380 return PyErr_NoMemory(); 5381 bytesize = (len + pairs + (byteorder == 0)) * 2; 5382 v = PyBytes_FromStringAndSize(NULL, bytesize); 5383 if (v == NULL) 5384 return NULL; 5385 5386 /* output buffer is 2-bytes aligned */ 5387 assert(((Py_uintptr_t)PyBytes_AS_STRING(v) & 1) == 0); 5388 out = (unsigned short *)PyBytes_AS_STRING(v); 5389 if (byteorder == 0) 5390 *out++ = 0xFEFF; 5391 if (len == 0) 5392 goto done; 5393 5394 switch (kind) { 5395 case PyUnicode_1BYTE_KIND: { 5396 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); 5397 break; 5398 } 5399 case PyUnicode_2BYTE_KIND: { 5400 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); 5401 break; 5402 } 5403 case PyUnicode_4BYTE_KIND: { 5404 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); 5405 break; 5406 } 5407 default: 5408 assert(0); 5409 } 5410 5411 done: 5412 return v; 5413} 5414 5415PyObject * 5416PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5417 Py_ssize_t size, 5418 const char *errors, 5419 int byteorder) 5420{ 5421 PyObject *result; 5422 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5423 if (tmp == NULL) 5424 return NULL; 5425 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5426 Py_DECREF(tmp); 5427 return result; 5428} 5429 5430PyObject * 5431PyUnicode_AsUTF16String(PyObject *unicode) 5432{ 5433 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5434} 5435 5436/* --- Unicode Escape Codec ----------------------------------------------- */ 5437 5438/* Helper function for 
PyUnicode_DecodeUnicodeEscape, determines 5439 if all the escapes in the string make it still a valid ASCII string. 5440 Returns -1 if any escapes were found which cause the string to 5441 pop out of ASCII range. Otherwise returns the length of the 5442 required buffer to hold the string. 5443 */ 5444static Py_ssize_t 5445length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5446{ 5447 const unsigned char *p = (const unsigned char *)s; 5448 const unsigned char *end = p + size; 5449 Py_ssize_t length = 0; 5450 5451 if (size < 0) 5452 return -1; 5453 5454 for (; p < end; ++p) { 5455 if (*p > 127) { 5456 /* Non-ASCII */ 5457 return -1; 5458 } 5459 else if (*p != '\\') { 5460 /* Normal character */ 5461 ++length; 5462 } 5463 else { 5464 /* Backslash-escape, check next char */ 5465 ++p; 5466 /* Escape sequence reaches till end of string or 5467 non-ASCII follow-up. */ 5468 if (p >= end || *p > 127) 5469 return -1; 5470 switch (*p) { 5471 case '\n': 5472 /* backslash + \n result in zero characters */ 5473 break; 5474 case '\\': case '\'': case '\"': 5475 case 'b': case 'f': case 't': 5476 case 'n': case 'r': case 'v': case 'a': 5477 ++length; 5478 break; 5479 case '0': case '1': case '2': case '3': 5480 case '4': case '5': case '6': case '7': 5481 case 'x': case 'u': case 'U': case 'N': 5482 /* these do not guarantee ASCII characters */ 5483 return -1; 5484 default: 5485 /* count the backslash + the other character */ 5486 length += 2; 5487 } 5488 } 5489 } 5490 return length; 5491} 5492 5493static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5494 5495PyObject * 5496PyUnicode_DecodeUnicodeEscape(const char *s, 5497 Py_ssize_t size, 5498 const char *errors) 5499{ 5500 const char *starts = s; 5501 Py_ssize_t startinpos; 5502 Py_ssize_t endinpos; 5503 int j; 5504 PyObject *v; 5505 const char *end; 5506 char* message; 5507 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5508 PyObject *errorHandler = NULL; 5509 PyObject *exc = NULL; 5510 Py_ssize_t len; 5511 
Py_ssize_t i; 5512 5513 len = length_of_escaped_ascii_string(s, size); 5514 5515 /* After length_of_escaped_ascii_string() there are two alternatives, 5516 either the string is pure ASCII with named escapes like \n, etc. 5517 and we determined it's exact size (common case) 5518 or it contains \x, \u, ... escape sequences. then we create a 5519 legacy wchar string and resize it at the end of this function. */ 5520 if (len >= 0) { 5521 v = PyUnicode_New(len, 127); 5522 if (!v) 5523 goto onError; 5524 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5525 } 5526 else { 5527 /* Escaped strings will always be longer than the resulting 5528 Unicode string, so we start with size here and then reduce the 5529 length after conversion to the true value. 5530 (but if the error callback returns a long replacement string 5531 we'll have to allocate more space) */ 5532 v = PyUnicode_New(size, 127); 5533 if (!v) 5534 goto onError; 5535 len = size; 5536 } 5537 5538 if (size == 0) 5539 return v; 5540 i = 0; 5541 end = s + size; 5542 5543 while (s < end) { 5544 unsigned char c; 5545 Py_UCS4 x; 5546 int digits; 5547 5548 /* The only case in which i == ascii_length is a backslash 5549 followed by a newline. */ 5550 assert(i <= len); 5551 5552 /* Non-escape characters are interpreted as Unicode ordinals */ 5553 if (*s != '\\') { 5554 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0) 5555 goto onError; 5556 continue; 5557 } 5558 5559 startinpos = s-starts; 5560 /* \ - Escapes */ 5561 s++; 5562 c = *s++; 5563 if (s > end) 5564 c = '\0'; /* Invalid after \ */ 5565 5566 /* The only case in which i == ascii_length is a backslash 5567 followed by a newline. 
*/ 5568 assert(i < len || (i == len && c == '\n')); 5569 5570 switch (c) { 5571 5572 /* \x escapes */ 5573#define WRITECHAR(ch) \ 5574 do { \ 5575 if (unicode_putchar(&v, &i, ch) < 0) \ 5576 goto onError; \ 5577 }while(0) 5578 5579 case '\n': break; 5580 case '\\': WRITECHAR('\\'); break; 5581 case '\'': WRITECHAR('\''); break; 5582 case '\"': WRITECHAR('\"'); break; 5583 case 'b': WRITECHAR('\b'); break; 5584 /* FF */ 5585 case 'f': WRITECHAR('\014'); break; 5586 case 't': WRITECHAR('\t'); break; 5587 case 'n': WRITECHAR('\n'); break; 5588 case 'r': WRITECHAR('\r'); break; 5589 /* VT */ 5590 case 'v': WRITECHAR('\013'); break; 5591 /* BEL, not classic C */ 5592 case 'a': WRITECHAR('\007'); break; 5593 5594 /* \OOO (octal) escapes */ 5595 case '0': case '1': case '2': case '3': 5596 case '4': case '5': case '6': case '7': 5597 x = s[-1] - '0'; 5598 if (s < end && '0' <= *s && *s <= '7') { 5599 x = (x<<3) + *s++ - '0'; 5600 if (s < end && '0' <= *s && *s <= '7') 5601 x = (x<<3) + *s++ - '0'; 5602 } 5603 WRITECHAR(x); 5604 break; 5605 5606 /* hex escapes */ 5607 /* \xXX */ 5608 case 'x': 5609 digits = 2; 5610 message = "truncated \\xXX escape"; 5611 goto hexescape; 5612 5613 /* \uXXXX */ 5614 case 'u': 5615 digits = 4; 5616 message = "truncated \\uXXXX escape"; 5617 goto hexescape; 5618 5619 /* \UXXXXXXXX */ 5620 case 'U': 5621 digits = 8; 5622 message = "truncated \\UXXXXXXXX escape"; 5623 hexescape: 5624 chr = 0; 5625 if (s+digits>end) { 5626 endinpos = size; 5627 if (unicode_decode_call_errorhandler( 5628 errors, &errorHandler, 5629 "unicodeescape", "end of string in escape sequence", 5630 &starts, &end, &startinpos, &endinpos, &exc, &s, 5631 &v, &i)) 5632 goto onError; 5633 goto nextByte; 5634 } 5635 for (j = 0; j < digits; ++j) { 5636 c = (unsigned char) s[j]; 5637 if (!Py_ISXDIGIT(c)) { 5638 endinpos = (s+j+1)-starts; 5639 if (unicode_decode_call_errorhandler( 5640 errors, &errorHandler, 5641 "unicodeescape", message, 5642 &starts, &end, &startinpos, &endinpos, 
&exc, &s, 5643 &v, &i)) 5644 goto onError; 5645 len = PyUnicode_GET_LENGTH(v); 5646 goto nextByte; 5647 } 5648 chr = (chr<<4) & ~0xF; 5649 if (c >= '0' && c <= '9') 5650 chr += c - '0'; 5651 else if (c >= 'a' && c <= 'f') 5652 chr += 10 + c - 'a'; 5653 else 5654 chr += 10 + c - 'A'; 5655 } 5656 s += j; 5657 if (chr == 0xffffffff && PyErr_Occurred()) 5658 /* _decoding_error will have already written into the 5659 target buffer. */ 5660 break; 5661 store: 5662 /* when we get here, chr is a 32-bit unicode character */ 5663 if (chr <= MAX_UNICODE) { 5664 WRITECHAR(chr); 5665 } else { 5666 endinpos = s-starts; 5667 if (unicode_decode_call_errorhandler( 5668 errors, &errorHandler, 5669 "unicodeescape", "illegal Unicode character", 5670 &starts, &end, &startinpos, &endinpos, &exc, &s, 5671 &v, &i)) 5672 goto onError; 5673 } 5674 break; 5675 5676 /* \N{name} */ 5677 case 'N': 5678 message = "malformed \\N character escape"; 5679 if (ucnhash_CAPI == NULL) { 5680 /* load the unicode data module */ 5681 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5682 PyUnicodeData_CAPSULE_NAME, 1); 5683 if (ucnhash_CAPI == NULL) 5684 goto ucnhashError; 5685 } 5686 if (*s == '{') { 5687 const char *start = s+1; 5688 /* look for the closing brace */ 5689 while (*s != '}' && s < end) 5690 s++; 5691 if (s > start && s < end && *s == '}') { 5692 /* found a name. 
look it up in the unicode database */ 5693 message = "unknown Unicode character name"; 5694 s++; 5695 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5696 &chr, 0)) 5697 goto store; 5698 } 5699 } 5700 endinpos = s-starts; 5701 if (unicode_decode_call_errorhandler( 5702 errors, &errorHandler, 5703 "unicodeescape", message, 5704 &starts, &end, &startinpos, &endinpos, &exc, &s, 5705 &v, &i)) 5706 goto onError; 5707 break; 5708 5709 default: 5710 if (s > end) { 5711 message = "\\ at end of string"; 5712 s--; 5713 endinpos = s-starts; 5714 if (unicode_decode_call_errorhandler( 5715 errors, &errorHandler, 5716 "unicodeescape", message, 5717 &starts, &end, &startinpos, &endinpos, &exc, &s, 5718 &v, &i)) 5719 goto onError; 5720 } 5721 else { 5722 WRITECHAR('\\'); 5723 WRITECHAR(s[-1]); 5724 } 5725 break; 5726 } 5727 nextByte: 5728 ; 5729 } 5730#undef WRITECHAR 5731 5732 if (unicode_resize(&v, i) < 0) 5733 goto onError; 5734 Py_XDECREF(errorHandler); 5735 Py_XDECREF(exc); 5736 return unicode_result(v); 5737 5738 ucnhashError: 5739 PyErr_SetString( 5740 PyExc_UnicodeError, 5741 "\\N escapes not supported (can't load unicodedata module)" 5742 ); 5743 Py_XDECREF(v); 5744 Py_XDECREF(errorHandler); 5745 Py_XDECREF(exc); 5746 return NULL; 5747 5748 onError: 5749 Py_XDECREF(v); 5750 Py_XDECREF(errorHandler); 5751 Py_XDECREF(exc); 5752 return NULL; 5753} 5754 5755/* Return a Unicode-Escape string version of the Unicode object. 5756 5757 If quotes is true, the string is enclosed in u"" or u'' quotes as 5758 appropriate. 5759 5760*/ 5761 5762PyObject * 5763PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5764{ 5765 Py_ssize_t i, len; 5766 PyObject *repr; 5767 char *p; 5768 int kind; 5769 void *data; 5770 Py_ssize_t expandsize = 0; 5771 5772 /* Initial allocation is based on the longest-possible unichr 5773 escape. 5774 5775 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5776 unichr, so in this case it's the longest unichr escape. 
In 5777 narrow (UTF-16) builds this is five chars per source unichr 5778 since there are two unichrs in the surrogate pair, so in narrow 5779 (UTF-16) builds it's not the longest unichr escape. 5780 5781 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5782 so in the narrow (UTF-16) build case it's the longest unichr 5783 escape. 5784 */ 5785 5786 if (!PyUnicode_Check(unicode)) { 5787 PyErr_BadArgument(); 5788 return NULL; 5789 } 5790 if (PyUnicode_READY(unicode) == -1) 5791 return NULL; 5792 len = PyUnicode_GET_LENGTH(unicode); 5793 kind = PyUnicode_KIND(unicode); 5794 data = PyUnicode_DATA(unicode); 5795 switch (kind) { 5796 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5797 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5798 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5799 } 5800 5801 if (len == 0) 5802 return PyBytes_FromStringAndSize(NULL, 0); 5803 5804 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5805 return PyErr_NoMemory(); 5806 5807 repr = PyBytes_FromStringAndSize(NULL, 5808 2 5809 + expandsize*len 5810 + 1); 5811 if (repr == NULL) 5812 return NULL; 5813 5814 p = PyBytes_AS_STRING(repr); 5815 5816 for (i = 0; i < len; i++) { 5817 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5818 5819 /* Escape backslashes */ 5820 if (ch == '\\') { 5821 *p++ = '\\'; 5822 *p++ = (char) ch; 5823 continue; 5824 } 5825 5826 /* Map 21-bit characters to '\U00xxxxxx' */ 5827 else if (ch >= 0x10000) { 5828 assert(ch <= MAX_UNICODE); 5829 *p++ = '\\'; 5830 *p++ = 'U'; 5831 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5832 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5833 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5834 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5835 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5836 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5837 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5838 *p++ = Py_hexdigits[ch & 0x0000000F]; 5839 continue; 5840 } 5841 5842 /* Map 16-bit characters to '\uxxxx' */ 5843 if (ch >= 256) { 
5844 *p++ = '\\'; 5845 *p++ = 'u'; 5846 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5847 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5848 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5849 *p++ = Py_hexdigits[ch & 0x000F]; 5850 } 5851 5852 /* Map special whitespace to '\t', \n', '\r' */ 5853 else if (ch == '\t') { 5854 *p++ = '\\'; 5855 *p++ = 't'; 5856 } 5857 else if (ch == '\n') { 5858 *p++ = '\\'; 5859 *p++ = 'n'; 5860 } 5861 else if (ch == '\r') { 5862 *p++ = '\\'; 5863 *p++ = 'r'; 5864 } 5865 5866 /* Map non-printable US ASCII to '\xhh' */ 5867 else if (ch < ' ' || ch >= 0x7F) { 5868 *p++ = '\\'; 5869 *p++ = 'x'; 5870 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5871 *p++ = Py_hexdigits[ch & 0x000F]; 5872 } 5873 5874 /* Copy everything else as-is */ 5875 else 5876 *p++ = (char) ch; 5877 } 5878 5879 assert(p - PyBytes_AS_STRING(repr) > 0); 5880 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5881 return NULL; 5882 return repr; 5883} 5884 5885PyObject * 5886PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5887 Py_ssize_t size) 5888{ 5889 PyObject *result; 5890 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5891 if (tmp == NULL) 5892 return NULL; 5893 result = PyUnicode_AsUnicodeEscapeString(tmp); 5894 Py_DECREF(tmp); 5895 return result; 5896} 5897 5898/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5899 5900PyObject * 5901PyUnicode_DecodeRawUnicodeEscape(const char *s, 5902 Py_ssize_t size, 5903 const char *errors) 5904{ 5905 const char *starts = s; 5906 Py_ssize_t startinpos; 5907 Py_ssize_t endinpos; 5908 Py_ssize_t outpos; 5909 PyObject *v; 5910 const char *end; 5911 const char *bs; 5912 PyObject *errorHandler = NULL; 5913 PyObject *exc = NULL; 5914 5915 /* Escaped strings will always be longer than the resulting 5916 Unicode string, so we start with size here and then reduce the 5917 length after conversion to the true value. 
(But decoding error 5918 handler might have to resize the string) */ 5919 v = PyUnicode_New(size, 127); 5920 if (v == NULL) 5921 goto onError; 5922 if (size == 0) 5923 return v; 5924 outpos = 0; 5925 end = s + size; 5926 while (s < end) { 5927 unsigned char c; 5928 Py_UCS4 x; 5929 int i; 5930 int count; 5931 5932 /* Non-escape characters are interpreted as Unicode ordinals */ 5933 if (*s != '\\') { 5934 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 5935 goto onError; 5936 continue; 5937 } 5938 startinpos = s-starts; 5939 5940 /* \u-escapes are only interpreted iff the number of leading 5941 backslashes if odd */ 5942 bs = s; 5943 for (;s < end;) { 5944 if (*s != '\\') 5945 break; 5946 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 5947 goto onError; 5948 } 5949 if (((s - bs) & 1) == 0 || 5950 s >= end || 5951 (*s != 'u' && *s != 'U')) { 5952 continue; 5953 } 5954 outpos--; 5955 count = *s=='u' ? 4 : 8; 5956 s++; 5957 5958 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5959 for (x = 0, i = 0; i < count; ++i, ++s) { 5960 c = (unsigned char)*s; 5961 if (!Py_ISXDIGIT(c)) { 5962 endinpos = s-starts; 5963 if (unicode_decode_call_errorhandler( 5964 errors, &errorHandler, 5965 "rawunicodeescape", "truncated \\uXXXX", 5966 &starts, &end, &startinpos, &endinpos, &exc, &s, 5967 &v, &outpos)) 5968 goto onError; 5969 goto nextByte; 5970 } 5971 x = (x<<4) & ~0xF; 5972 if (c >= '0' && c <= '9') 5973 x += c - '0'; 5974 else if (c >= 'a' && c <= 'f') 5975 x += 10 + c - 'a'; 5976 else 5977 x += 10 + c - 'A'; 5978 } 5979 if (x <= MAX_UNICODE) { 5980 if (unicode_putchar(&v, &outpos, x) < 0) 5981 goto onError; 5982 } else { 5983 endinpos = s-starts; 5984 if (unicode_decode_call_errorhandler( 5985 errors, &errorHandler, 5986 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5987 &starts, &end, &startinpos, &endinpos, &exc, &s, 5988 &v, &outpos)) 5989 goto onError; 5990 } 5991 nextByte: 5992 ; 5993 } 5994 if (unicode_resize(&v, outpos) < 0) 5995 goto onError; 
5996 Py_XDECREF(errorHandler); 5997 Py_XDECREF(exc); 5998 return unicode_result(v); 5999 6000 onError: 6001 Py_XDECREF(v); 6002 Py_XDECREF(errorHandler); 6003 Py_XDECREF(exc); 6004 return NULL; 6005} 6006 6007 6008PyObject * 6009PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6010{ 6011 PyObject *repr; 6012 char *p; 6013 char *q; 6014 Py_ssize_t expandsize, pos; 6015 int kind; 6016 void *data; 6017 Py_ssize_t len; 6018 6019 if (!PyUnicode_Check(unicode)) { 6020 PyErr_BadArgument(); 6021 return NULL; 6022 } 6023 if (PyUnicode_READY(unicode) == -1) 6024 return NULL; 6025 kind = PyUnicode_KIND(unicode); 6026 data = PyUnicode_DATA(unicode); 6027 len = PyUnicode_GET_LENGTH(unicode); 6028 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6029 bytes, and 1 byte characters 4. */ 6030 expandsize = kind * 2 + 2; 6031 6032 if (len > PY_SSIZE_T_MAX / expandsize) 6033 return PyErr_NoMemory(); 6034 6035 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6036 if (repr == NULL) 6037 return NULL; 6038 if (len == 0) 6039 return repr; 6040 6041 p = q = PyBytes_AS_STRING(repr); 6042 for (pos = 0; pos < len; pos++) { 6043 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6044 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6045 if (ch >= 0x10000) { 6046 assert(ch <= MAX_UNICODE); 6047 *p++ = '\\'; 6048 *p++ = 'U'; 6049 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6050 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6051 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6052 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6053 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6054 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6055 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6056 *p++ = Py_hexdigits[ch & 15]; 6057 } 6058 /* Map 16-bit characters to '\uxxxx' */ 6059 else if (ch >= 256) { 6060 *p++ = '\\'; 6061 *p++ = 'u'; 6062 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6063 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6064 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6065 *p++ = Py_hexdigits[ch & 15]; 6066 } 6067 /* Copy everything 
else as-is */ 6068 else 6069 *p++ = (char) ch; 6070 } 6071 6072 assert(p > q); 6073 if (_PyBytes_Resize(&repr, p - q) < 0) 6074 return NULL; 6075 return repr; 6076} 6077 6078PyObject * 6079PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6080 Py_ssize_t size) 6081{ 6082 PyObject *result; 6083 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6084 if (tmp == NULL) 6085 return NULL; 6086 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6087 Py_DECREF(tmp); 6088 return result; 6089} 6090 6091/* --- Unicode Internal Codec ------------------------------------------- */ 6092 6093PyObject * 6094_PyUnicode_DecodeUnicodeInternal(const char *s, 6095 Py_ssize_t size, 6096 const char *errors) 6097{ 6098 const char *starts = s; 6099 Py_ssize_t startinpos; 6100 Py_ssize_t endinpos; 6101 Py_ssize_t outpos; 6102 PyObject *v; 6103 const char *end; 6104 const char *reason; 6105 PyObject *errorHandler = NULL; 6106 PyObject *exc = NULL; 6107 6108 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6109 "unicode_internal codec has been deprecated", 6110 1)) 6111 return NULL; 6112 6113 /* XXX overflow detection missing */ 6114 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127); 6115 if (v == NULL) 6116 goto onError; 6117 if (PyUnicode_GET_LENGTH(v) == 0) 6118 return v; 6119 outpos = 0; 6120 end = s + size; 6121 6122 while (s < end) { 6123 Py_UNICODE uch; 6124 Py_UCS4 ch; 6125 /* We copy the raw representation one byte at a time because the 6126 pointer may be unaligned (see test_codeccallbacks). */ 6127 ((char *) &uch)[0] = s[0]; 6128 ((char *) &uch)[1] = s[1]; 6129#ifdef Py_UNICODE_WIDE 6130 ((char *) &uch)[2] = s[2]; 6131 ((char *) &uch)[3] = s[3]; 6132#endif 6133 ch = uch; 6134 6135 /* We have to sanity check the raw data, otherwise doom looms for 6136 some malformed UCS-4 data. 
*/ 6137 if ( 6138#ifdef Py_UNICODE_WIDE 6139 ch > 0x10ffff || 6140#endif 6141 end-s < Py_UNICODE_SIZE 6142 ) 6143 { 6144 startinpos = s - starts; 6145 if (end-s < Py_UNICODE_SIZE) { 6146 endinpos = end-starts; 6147 reason = "truncated input"; 6148 } 6149 else { 6150 endinpos = s - starts + Py_UNICODE_SIZE; 6151 reason = "illegal code point (> 0x10FFFF)"; 6152 } 6153 if (unicode_decode_call_errorhandler( 6154 errors, &errorHandler, 6155 "unicode_internal", reason, 6156 &starts, &end, &startinpos, &endinpos, &exc, &s, 6157 &v, &outpos)) 6158 goto onError; 6159 continue; 6160 } 6161 6162 s += Py_UNICODE_SIZE; 6163#ifndef Py_UNICODE_WIDE 6164 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end) 6165 { 6166 Py_UNICODE uch2; 6167 ((char *) &uch2)[0] = s[0]; 6168 ((char *) &uch2)[1] = s[1]; 6169 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6170 { 6171 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6172 s += Py_UNICODE_SIZE; 6173 } 6174 } 6175#endif 6176 6177 if (unicode_putchar(&v, &outpos, ch) < 0) 6178 goto onError; 6179 } 6180 6181 if (unicode_resize(&v, outpos) < 0) 6182 goto onError; 6183 Py_XDECREF(errorHandler); 6184 Py_XDECREF(exc); 6185 return unicode_result(v); 6186 6187 onError: 6188 Py_XDECREF(v); 6189 Py_XDECREF(errorHandler); 6190 Py_XDECREF(exc); 6191 return NULL; 6192} 6193 6194/* --- Latin-1 Codec ------------------------------------------------------ */ 6195 6196PyObject * 6197PyUnicode_DecodeLatin1(const char *s, 6198 Py_ssize_t size, 6199 const char *errors) 6200{ 6201 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6202 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6203} 6204 6205/* create or adjust a UnicodeEncodeError */ 6206static void 6207make_encode_exception(PyObject **exceptionObject, 6208 const char *encoding, 6209 PyObject *unicode, 6210 Py_ssize_t startpos, Py_ssize_t endpos, 6211 const char *reason) 6212{ 6213 if (*exceptionObject == NULL) { 6214 *exceptionObject = PyObject_CallFunction( 6215 PyExc_UnicodeEncodeError, "sOnns", 6216 encoding, unicode, startpos, endpos, reason); 6217 } 6218 else { 6219 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6220 goto onError; 6221 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6222 goto onError; 6223 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6224 goto onError; 6225 return; 6226 onError: 6227 Py_DECREF(*exceptionObject); 6228 *exceptionObject = NULL; 6229 } 6230} 6231 6232/* raises a UnicodeEncodeError */ 6233static void 6234raise_encode_exception(PyObject **exceptionObject, 6235 const char *encoding, 6236 PyObject *unicode, 6237 Py_ssize_t startpos, Py_ssize_t endpos, 6238 const char *reason) 6239{ 6240 make_encode_exception(exceptionObject, 6241 encoding, unicode, startpos, endpos, reason); 6242 if (*exceptionObject != NULL) 6243 PyCodec_StrictErrors(*exceptionObject); 6244} 6245 6246/* error handling callback helper: 6247 build arguments, call the callback and check the arguments, 6248 put the result into newpos and return the replacement string, which 6249 has to be freed by the caller */ 6250static PyObject * 6251unicode_encode_call_errorhandler(const char *errors, 6252 PyObject **errorHandler, 6253 const char *encoding, const char *reason, 6254 PyObject *unicode, PyObject **exceptionObject, 6255 Py_ssize_t startpos, Py_ssize_t endpos, 6256 Py_ssize_t *newpos) 6257{ 6258 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6259 Py_ssize_t len; 6260 PyObject *restuple; 6261 PyObject *resunicode; 6262 6263 if (*errorHandler == NULL) 
{ 6264 *errorHandler = PyCodec_LookupError(errors); 6265 if (*errorHandler == NULL) 6266 return NULL; 6267 } 6268 6269 if (PyUnicode_READY(unicode) == -1) 6270 return NULL; 6271 len = PyUnicode_GET_LENGTH(unicode); 6272 6273 make_encode_exception(exceptionObject, 6274 encoding, unicode, startpos, endpos, reason); 6275 if (*exceptionObject == NULL) 6276 return NULL; 6277 6278 restuple = PyObject_CallFunctionObjArgs( 6279 *errorHandler, *exceptionObject, NULL); 6280 if (restuple == NULL) 6281 return NULL; 6282 if (!PyTuple_Check(restuple)) { 6283 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6284 Py_DECREF(restuple); 6285 return NULL; 6286 } 6287 if (!PyArg_ParseTuple(restuple, argparse, 6288 &resunicode, newpos)) { 6289 Py_DECREF(restuple); 6290 return NULL; 6291 } 6292 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6293 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6294 Py_DECREF(restuple); 6295 return NULL; 6296 } 6297 if (*newpos<0) 6298 *newpos = len + *newpos; 6299 if (*newpos<0 || *newpos>len) { 6300 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6301 Py_DECREF(restuple); 6302 return NULL; 6303 } 6304 Py_INCREF(resunicode); 6305 Py_DECREF(restuple); 6306 return resunicode; 6307} 6308 6309static PyObject * 6310unicode_encode_ucs1(PyObject *unicode, 6311 const char *errors, 6312 unsigned int limit) 6313{ 6314 /* input state */ 6315 Py_ssize_t pos=0, size; 6316 int kind; 6317 void *data; 6318 /* output object */ 6319 PyObject *res; 6320 /* pointer into the output */ 6321 char *str; 6322 /* current output position */ 6323 Py_ssize_t ressize; 6324 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6325 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6326 PyObject *errorHandler = NULL; 6327 PyObject *exc = NULL; 6328 /* the following variable is used for caching string comparisons 6329 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6330 int known_errorHandler = -1; 6331 6332 if (PyUnicode_READY(unicode) == -1) 6333 return NULL; 6334 size = PyUnicode_GET_LENGTH(unicode); 6335 kind = PyUnicode_KIND(unicode); 6336 data = PyUnicode_DATA(unicode); 6337 /* allocate enough for a simple encoding without 6338 replacements, if we need more, we'll resize */ 6339 if (size == 0) 6340 return PyBytes_FromStringAndSize(NULL, 0); 6341 res = PyBytes_FromStringAndSize(NULL, size); 6342 if (res == NULL) 6343 return NULL; 6344 str = PyBytes_AS_STRING(res); 6345 ressize = size; 6346 6347 while (pos < size) { 6348 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6349 6350 /* can we encode this? */ 6351 if (c<limit) { 6352 /* no overflow check, because we know that the space is enough */ 6353 *str++ = (char)c; 6354 ++pos; 6355 } 6356 else { 6357 Py_ssize_t requiredsize; 6358 PyObject *repunicode; 6359 Py_ssize_t repsize, newpos, respos, i; 6360 /* startpos for collecting unencodable chars */ 6361 Py_ssize_t collstart = pos; 6362 Py_ssize_t collend = pos; 6363 /* find all unecodable characters */ 6364 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) 6365 ++collend; 6366 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6367 if (known_errorHandler==-1) { 6368 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6369 known_errorHandler = 1; 6370 else if (!strcmp(errors, "replace")) 6371 known_errorHandler = 2; 6372 else if (!strcmp(errors, "ignore")) 6373 known_errorHandler = 3; 6374 else if (!strcmp(errors, "xmlcharrefreplace")) 6375 known_errorHandler = 4; 6376 else 6377 known_errorHandler = 0; 6378 } 6379 switch (known_errorHandler) { 6380 case 1: /* strict */ 6381 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6382 goto onError; 6383 case 2: /* replace */ 6384 while (collstart++<collend) 6385 *str++ = '?'; /* fall through */ 6386 case 3: /* ignore */ 6387 pos = collend; 6388 break; 6389 case 4: /* xmlcharrefreplace */ 6390 respos = str - PyBytes_AS_STRING(res); 6391 /* determine replacement size */ 6392 for (i = collstart, repsize = 0; i < collend; ++i) { 6393 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6394 if (ch < 10) 6395 repsize += 2+1+1; 6396 else if (ch < 100) 6397 repsize += 2+2+1; 6398 else if (ch < 1000) 6399 repsize += 2+3+1; 6400 else if (ch < 10000) 6401 repsize += 2+4+1; 6402 else if (ch < 100000) 6403 repsize += 2+5+1; 6404 else if (ch < 1000000) 6405 repsize += 2+6+1; 6406 else { 6407 assert(ch <= MAX_UNICODE); 6408 repsize += 2+7+1; 6409 } 6410 } 6411 requiredsize = respos+repsize+(size-collend); 6412 if (requiredsize > ressize) { 6413 if (requiredsize<2*ressize) 6414 requiredsize = 2*ressize; 6415 if (_PyBytes_Resize(&res, requiredsize)) 6416 goto onError; 6417 str = PyBytes_AS_STRING(res) + respos; 6418 ressize = requiredsize; 6419 } 6420 /* generate replacement */ 6421 for (i = collstart; i < collend; ++i) { 6422 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6423 } 6424 pos = collend; 6425 break; 6426 default: 6427 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6428 encoding, reason, unicode, &exc, 6429 collstart, collend, &newpos); 6430 if (repunicode == NULL || 
(PyUnicode_Check(repunicode) && 6431 PyUnicode_READY(repunicode) == -1)) 6432 goto onError; 6433 if (PyBytes_Check(repunicode)) { 6434 /* Directly copy bytes result to output. */ 6435 repsize = PyBytes_Size(repunicode); 6436 if (repsize > 1) { 6437 /* Make room for all additional bytes. */ 6438 respos = str - PyBytes_AS_STRING(res); 6439 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6440 Py_DECREF(repunicode); 6441 goto onError; 6442 } 6443 str = PyBytes_AS_STRING(res) + respos; 6444 ressize += repsize-1; 6445 } 6446 memcpy(str, PyBytes_AsString(repunicode), repsize); 6447 str += repsize; 6448 pos = newpos; 6449 Py_DECREF(repunicode); 6450 break; 6451 } 6452 /* need more space? (at least enough for what we 6453 have+the replacement+the rest of the string, so 6454 we won't have to check space for encodable characters) */ 6455 respos = str - PyBytes_AS_STRING(res); 6456 repsize = PyUnicode_GET_LENGTH(repunicode); 6457 requiredsize = respos+repsize+(size-collend); 6458 if (requiredsize > ressize) { 6459 if (requiredsize<2*ressize) 6460 requiredsize = 2*ressize; 6461 if (_PyBytes_Resize(&res, requiredsize)) { 6462 Py_DECREF(repunicode); 6463 goto onError; 6464 } 6465 str = PyBytes_AS_STRING(res) + respos; 6466 ressize = requiredsize; 6467 } 6468 /* check if there is anything unencodable in the replacement 6469 and copy it to the output */ 6470 for (i = 0; repsize-->0; ++i, ++str) { 6471 c = PyUnicode_READ_CHAR(repunicode, i); 6472 if (c >= limit) { 6473 raise_encode_exception(&exc, encoding, unicode, 6474 pos, pos+1, reason); 6475 Py_DECREF(repunicode); 6476 goto onError; 6477 } 6478 *str = (char)c; 6479 } 6480 pos = newpos; 6481 Py_DECREF(repunicode); 6482 } 6483 } 6484 } 6485 /* Resize if we allocated to much */ 6486 size = str - PyBytes_AS_STRING(res); 6487 if (size < ressize) { /* If this falls res will be NULL */ 6488 assert(size >= 0); 6489 if (_PyBytes_Resize(&res, size) < 0) 6490 goto onError; 6491 } 6492 6493 Py_XDECREF(errorHandler); 6494 Py_XDECREF(exc); 
6495 return res; 6496 6497 onError: 6498 Py_XDECREF(res); 6499 Py_XDECREF(errorHandler); 6500 Py_XDECREF(exc); 6501 return NULL; 6502} 6503 6504/* Deprecated */ 6505PyObject * 6506PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6507 Py_ssize_t size, 6508 const char *errors) 6509{ 6510 PyObject *result; 6511 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6512 if (unicode == NULL) 6513 return NULL; 6514 result = unicode_encode_ucs1(unicode, errors, 256); 6515 Py_DECREF(unicode); 6516 return result; 6517} 6518 6519PyObject * 6520_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6521{ 6522 if (!PyUnicode_Check(unicode)) { 6523 PyErr_BadArgument(); 6524 return NULL; 6525 } 6526 if (PyUnicode_READY(unicode) == -1) 6527 return NULL; 6528 /* Fast path: if it is a one-byte string, construct 6529 bytes object directly. */ 6530 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6531 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6532 PyUnicode_GET_LENGTH(unicode)); 6533 /* Non-Latin-1 characters present. Defer to above function to 6534 raise the exception. */ 6535 return unicode_encode_ucs1(unicode, errors, 256); 6536} 6537 6538PyObject* 6539PyUnicode_AsLatin1String(PyObject *unicode) 6540{ 6541 return _PyUnicode_AsLatin1String(unicode, NULL); 6542} 6543 6544/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6545 6546PyObject * 6547PyUnicode_DecodeASCII(const char *s, 6548 Py_ssize_t size, 6549 const char *errors) 6550{ 6551 const char *starts = s; 6552 PyObject *unicode; 6553 int kind; 6554 void *data; 6555 Py_ssize_t startinpos; 6556 Py_ssize_t endinpos; 6557 Py_ssize_t outpos; 6558 const char *e; 6559 PyObject *errorHandler = NULL; 6560 PyObject *exc = NULL; 6561 6562 if (size == 0) { 6563 Py_INCREF(unicode_empty); 6564 return unicode_empty; 6565 } 6566 6567 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 6568 if (size == 1 && (unsigned char)s[0] < 128) 6569 return get_latin1_char((unsigned char)s[0]); 6570 6571 unicode = PyUnicode_New(size, 127); 6572 if (unicode == NULL) 6573 goto onError; 6574 6575 e = s + size; 6576 data = PyUnicode_1BYTE_DATA(unicode); 6577 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6578 if (outpos == size) 6579 return unicode; 6580 6581 s += outpos; 6582 kind = PyUnicode_1BYTE_KIND; 6583 while (s < e) { 6584 register unsigned char c = (unsigned char)*s; 6585 if (c < 128) { 6586 PyUnicode_WRITE(kind, data, outpos++, c); 6587 ++s; 6588 } 6589 else { 6590 startinpos = s-starts; 6591 endinpos = startinpos + 1; 6592 if (unicode_decode_call_errorhandler( 6593 errors, &errorHandler, 6594 "ascii", "ordinal not in range(128)", 6595 &starts, &e, &startinpos, &endinpos, &exc, &s, 6596 &unicode, &outpos)) 6597 goto onError; 6598 kind = PyUnicode_KIND(unicode); 6599 data = PyUnicode_DATA(unicode); 6600 } 6601 } 6602 if (unicode_resize(&unicode, outpos) < 0) 6603 goto onError; 6604 Py_XDECREF(errorHandler); 6605 Py_XDECREF(exc); 6606 assert(_PyUnicode_CheckConsistency(unicode, 1)); 6607 return unicode; 6608 6609 onError: 6610 Py_XDECREF(unicode); 6611 Py_XDECREF(errorHandler); 6612 Py_XDECREF(exc); 6613 return NULL; 6614} 6615 6616/* Deprecated */ 6617PyObject * 6618PyUnicode_EncodeASCII(const Py_UNICODE *p, 6619 Py_ssize_t size, 6620 const char *errors) 6621{ 6622 PyObject *result; 6623 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6624 if (unicode == NULL) 6625 return NULL; 6626 result = unicode_encode_ucs1(unicode, errors, 128); 6627 Py_DECREF(unicode); 6628 return result; 6629} 6630 6631PyObject * 6632_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6633{ 6634 if (!PyUnicode_Check(unicode)) { 6635 PyErr_BadArgument(); 6636 return NULL; 6637 } 6638 if (PyUnicode_READY(unicode) == -1) 6639 return NULL; 6640 /* Fast path: if it is an ASCII-only string, construct bytes object 6641 directly. 
Else defer to above function to raise the exception. */ 6642 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6643 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6644 PyUnicode_GET_LENGTH(unicode)); 6645 return unicode_encode_ucs1(unicode, errors, 128); 6646} 6647 6648PyObject * 6649PyUnicode_AsASCIIString(PyObject *unicode) 6650{ 6651 return _PyUnicode_AsASCIIString(unicode, NULL); 6652} 6653 6654#ifdef HAVE_MBCS 6655 6656/* --- MBCS codecs for Windows -------------------------------------------- */ 6657 6658#if SIZEOF_INT < SIZEOF_SIZE_T 6659#define NEED_RETRY 6660#endif 6661 6662#ifndef WC_ERR_INVALID_CHARS 6663# define WC_ERR_INVALID_CHARS 0x0080 6664#endif 6665 6666static char* 6667code_page_name(UINT code_page, PyObject **obj) 6668{ 6669 *obj = NULL; 6670 if (code_page == CP_ACP) 6671 return "mbcs"; 6672 if (code_page == CP_UTF7) 6673 return "CP_UTF7"; 6674 if (code_page == CP_UTF8) 6675 return "CP_UTF8"; 6676 6677 *obj = PyBytes_FromFormat("cp%u", code_page); 6678 if (*obj == NULL) 6679 return NULL; 6680 return PyBytes_AS_STRING(*obj); 6681} 6682 6683static int 6684is_dbcs_lead_byte(UINT code_page, const char *s, int offset) 6685{ 6686 const char *curr = s + offset; 6687 const char *prev; 6688 6689 if (!IsDBCSLeadByteEx(code_page, *curr)) 6690 return 0; 6691 6692 prev = CharPrevExA(code_page, s, curr, 0); 6693 if (prev == curr) 6694 return 1; 6695 /* FIXME: This code is limited to "true" double-byte encodings, 6696 as it assumes an incomplete character consists of a single 6697 byte. */ 6698 if (curr - prev == 2) 6699 return 1; 6700 if (!IsDBCSLeadByteEx(code_page, *prev)) 6701 return 1; 6702 return 0; 6703} 6704 6705static DWORD 6706decode_code_page_flags(UINT code_page) 6707{ 6708 if (code_page == CP_UTF7) { 6709 /* The CP_UTF7 decoder only supports flags=0 */ 6710 return 0; 6711 } 6712 else 6713 return MB_ERR_INVALID_CHARS; 6714} 6715 6716/* 6717 * Decode a byte string from a Windows code page into unicode object in strict 6718 * mode. 
6719 * 6720 * Returns consumed size if succeed, returns -2 on decode error, or raise a 6721 * WindowsError and returns -1 on other error. 6722 */ 6723static int 6724decode_code_page_strict(UINT code_page, 6725 PyObject **v, 6726 const char *in, 6727 int insize) 6728{ 6729 const DWORD flags = decode_code_page_flags(code_page); 6730 wchar_t *out; 6731 DWORD outsize; 6732 6733 /* First get the size of the result */ 6734 assert(insize > 0); 6735 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6736 if (outsize <= 0) 6737 goto error; 6738 6739 if (*v == NULL) { 6740 /* Create unicode object */ 6741 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6742 *v = (PyObject*)_PyUnicode_New(outsize); 6743 if (*v == NULL) 6744 return -1; 6745 out = PyUnicode_AS_UNICODE(*v); 6746 } 6747 else { 6748 /* Extend unicode object */ 6749 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6750 if (unicode_resize(v, n + outsize) < 0) 6751 return -1; 6752 out = PyUnicode_AS_UNICODE(*v) + n; 6753 } 6754 6755 /* Do the conversion */ 6756 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6757 if (outsize <= 0) 6758 goto error; 6759 return insize; 6760 6761error: 6762 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6763 return -2; 6764 PyErr_SetFromWindowsErr(0); 6765 return -1; 6766} 6767 6768/* 6769 * Decode a byte string from a code page into unicode object with an error 6770 * handler. 6771 * 6772 * Returns consumed size if succeed, or raise a WindowsError or 6773 * UnicodeDecodeError exception and returns -1 on error. 6774 */ 6775static int 6776decode_code_page_errors(UINT code_page, 6777 PyObject **v, 6778 const char *in, const int size, 6779 const char *errors) 6780{ 6781 const char *startin = in; 6782 const char *endin = in + size; 6783 const DWORD flags = decode_code_page_flags(code_page); 6784 /* Ideally, we should get reason from FormatMessage. This is the Windows 6785 2000 English version of the message. 
*/ 6786 const char *reason = "No mapping for the Unicode character exists " 6787 "in the target code page."; 6788 /* each step cannot decode more than 1 character, but a character can be 6789 represented as a surrogate pair */ 6790 wchar_t buffer[2], *startout, *out; 6791 int insize, outsize; 6792 PyObject *errorHandler = NULL; 6793 PyObject *exc = NULL; 6794 PyObject *encoding_obj = NULL; 6795 char *encoding; 6796 DWORD err; 6797 int ret = -1; 6798 6799 assert(size > 0); 6800 6801 encoding = code_page_name(code_page, &encoding_obj); 6802 if (encoding == NULL) 6803 return -1; 6804 6805 if (errors == NULL || strcmp(errors, "strict") == 0) { 6806 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6807 UnicodeDecodeError. */ 6808 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6809 if (exc != NULL) { 6810 PyCodec_StrictErrors(exc); 6811 Py_CLEAR(exc); 6812 } 6813 goto error; 6814 } 6815 6816 if (*v == NULL) { 6817 /* Create unicode object */ 6818 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6819 PyErr_NoMemory(); 6820 goto error; 6821 } 6822 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6823 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6824 if (*v == NULL) 6825 goto error; 6826 startout = PyUnicode_AS_UNICODE(*v); 6827 } 6828 else { 6829 /* Extend unicode object */ 6830 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6831 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6832 PyErr_NoMemory(); 6833 goto error; 6834 } 6835 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6836 goto error; 6837 startout = PyUnicode_AS_UNICODE(*v) + n; 6838 } 6839 6840 /* Decode the byte string character per character */ 6841 out = startout; 6842 while (in < endin) 6843 { 6844 /* Decode a character */ 6845 insize = 1; 6846 do 6847 { 6848 outsize = MultiByteToWideChar(code_page, flags, 6849 in, insize, 6850 buffer, Py_ARRAY_LENGTH(buffer)); 6851 if (outsize > 0) 
6852 break; 6853 err = GetLastError(); 6854 if (err != ERROR_NO_UNICODE_TRANSLATION 6855 && err != ERROR_INSUFFICIENT_BUFFER) 6856 { 6857 PyErr_SetFromWindowsErr(0); 6858 goto error; 6859 } 6860 insize++; 6861 } 6862 /* 4=maximum length of a UTF-8 sequence */ 6863 while (insize <= 4 && (in + insize) <= endin); 6864 6865 if (outsize <= 0) { 6866 Py_ssize_t startinpos, endinpos, outpos; 6867 6868 startinpos = in - startin; 6869 endinpos = startinpos + 1; 6870 outpos = out - PyUnicode_AS_UNICODE(*v); 6871 if (unicode_decode_call_errorhandler( 6872 errors, &errorHandler, 6873 encoding, reason, 6874 &startin, &endin, &startinpos, &endinpos, &exc, &in, 6875 v, &outpos)) 6876 { 6877 goto error; 6878 } 6879 out = PyUnicode_AS_UNICODE(*v) + outpos; 6880 } 6881 else { 6882 in += insize; 6883 memcpy(out, buffer, outsize * sizeof(wchar_t)); 6884 out += outsize; 6885 } 6886 } 6887 6888 /* write a NUL character at the end */ 6889 *out = 0; 6890 6891 /* Extend unicode object */ 6892 outsize = out - startout; 6893 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 6894 if (unicode_resize(v, outsize) < 0) 6895 goto error; 6896 ret = size; 6897 6898error: 6899 Py_XDECREF(encoding_obj); 6900 Py_XDECREF(errorHandler); 6901 Py_XDECREF(exc); 6902 return ret; 6903} 6904 6905static PyObject * 6906decode_code_page_stateful(int code_page, 6907 const char *s, Py_ssize_t size, 6908 const char *errors, Py_ssize_t *consumed) 6909{ 6910 PyObject *v = NULL; 6911 int chunk_size, final, converted, done; 6912 6913 if (code_page < 0) { 6914 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 6915 return NULL; 6916 } 6917 6918 if (consumed) 6919 *consumed = 0; 6920 6921 do 6922 { 6923#ifdef NEED_RETRY 6924 if (size > INT_MAX) { 6925 chunk_size = INT_MAX; 6926 final = 0; 6927 done = 0; 6928 } 6929 else 6930#endif 6931 { 6932 chunk_size = (int)size; 6933 final = (consumed == NULL); 6934 done = 1; 6935 } 6936 6937 /* Skip trailing lead-byte unless 'final' is set */ 6938 if (!final && 
is_dbcs_lead_byte(code_page, s, chunk_size - 1)) 6939 --chunk_size; 6940 6941 if (chunk_size == 0 && done) { 6942 if (v != NULL) 6943 break; 6944 Py_INCREF(unicode_empty); 6945 return unicode_empty; 6946 } 6947 6948 6949 converted = decode_code_page_strict(code_page, &v, 6950 s, chunk_size); 6951 if (converted == -2) 6952 converted = decode_code_page_errors(code_page, &v, 6953 s, chunk_size, 6954 errors); 6955 assert(converted != 0); 6956 6957 if (converted < 0) { 6958 Py_XDECREF(v); 6959 return NULL; 6960 } 6961 6962 if (consumed) 6963 *consumed += converted; 6964 6965 s += converted; 6966 size -= converted; 6967 } while (!done); 6968 6969 return unicode_result(v); 6970} 6971 6972PyObject * 6973PyUnicode_DecodeCodePageStateful(int code_page, 6974 const char *s, 6975 Py_ssize_t size, 6976 const char *errors, 6977 Py_ssize_t *consumed) 6978{ 6979 return decode_code_page_stateful(code_page, s, size, errors, consumed); 6980} 6981 6982PyObject * 6983PyUnicode_DecodeMBCSStateful(const char *s, 6984 Py_ssize_t size, 6985 const char *errors, 6986 Py_ssize_t *consumed) 6987{ 6988 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 6989} 6990 6991PyObject * 6992PyUnicode_DecodeMBCS(const char *s, 6993 Py_ssize_t size, 6994 const char *errors) 6995{ 6996 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 6997} 6998 6999static DWORD 7000encode_code_page_flags(UINT code_page, const char *errors) 7001{ 7002 if (code_page == CP_UTF8) { 7003 if (winver.dwMajorVersion >= 6) 7004 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 7005 and later */ 7006 return WC_ERR_INVALID_CHARS; 7007 else 7008 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 7009 return 0; 7010 } 7011 else if (code_page == CP_UTF7) { 7012 /* CP_UTF7 only supports flags=0 */ 7013 return 0; 7014 } 7015 else { 7016 if (errors != NULL && strcmp(errors, "replace") == 0) 7017 return 0; 7018 else 7019 return WC_NO_BEST_FIT_CHARS; 7020 } 7021} 7022 7023/* 7024 * Encode a 
Unicode string to a Windows code page into a byte string in strict 7025 * mode. 7026 * 7027 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7028 * a WindowsError and returns -1 on other error. 7029 */ 7030static int 7031encode_code_page_strict(UINT code_page, PyObject **outbytes, 7032 PyObject *unicode, Py_ssize_t offset, int len, 7033 const char* errors) 7034{ 7035 BOOL usedDefaultChar = FALSE; 7036 BOOL *pusedDefaultChar = &usedDefaultChar; 7037 int outsize; 7038 PyObject *exc = NULL; 7039 wchar_t *p; 7040 Py_ssize_t size; 7041 const DWORD flags = encode_code_page_flags(code_page, NULL); 7042 char *out; 7043 /* Create a substring so that we can get the UTF-16 representation 7044 of just the slice under consideration. */ 7045 PyObject *substring; 7046 7047 assert(len > 0); 7048 7049 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7050 pusedDefaultChar = &usedDefaultChar; 7051 else 7052 pusedDefaultChar = NULL; 7053 7054 substring = PyUnicode_Substring(unicode, offset, offset+len); 7055 if (substring == NULL) 7056 return -1; 7057 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7058 if (p == NULL) { 7059 Py_DECREF(substring); 7060 return -1; 7061 } 7062 7063 /* First get the size of the result */ 7064 outsize = WideCharToMultiByte(code_page, flags, 7065 p, size, 7066 NULL, 0, 7067 NULL, pusedDefaultChar); 7068 if (outsize <= 0) 7069 goto error; 7070 /* If we used a default char, then we failed! 
*/ 7071 if (pusedDefaultChar && *pusedDefaultChar) { 7072 Py_DECREF(substring); 7073 return -2; 7074 } 7075 7076 if (*outbytes == NULL) { 7077 /* Create string object */ 7078 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7079 if (*outbytes == NULL) { 7080 Py_DECREF(substring); 7081 return -1; 7082 } 7083 out = PyBytes_AS_STRING(*outbytes); 7084 } 7085 else { 7086 /* Extend string object */ 7087 const Py_ssize_t n = PyBytes_Size(*outbytes); 7088 if (outsize > PY_SSIZE_T_MAX - n) { 7089 PyErr_NoMemory(); 7090 Py_DECREF(substring); 7091 return -1; 7092 } 7093 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7094 Py_DECREF(substring); 7095 return -1; 7096 } 7097 out = PyBytes_AS_STRING(*outbytes) + n; 7098 } 7099 7100 /* Do the conversion */ 7101 outsize = WideCharToMultiByte(code_page, flags, 7102 p, size, 7103 out, outsize, 7104 NULL, pusedDefaultChar); 7105 Py_CLEAR(substring); 7106 if (outsize <= 0) 7107 goto error; 7108 if (pusedDefaultChar && *pusedDefaultChar) 7109 return -2; 7110 return 0; 7111 7112error: 7113 Py_XDECREF(substring); 7114 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7115 return -2; 7116 PyErr_SetFromWindowsErr(0); 7117 return -1; 7118} 7119 7120/* 7121 * Encode a Unicode string to a Windows code page into a byte string using a 7122 * error handler. 7123 * 7124 * Returns consumed characters if succeed, or raise a WindowsError and returns 7125 * -1 on other error. 7126 */ 7127static int 7128encode_code_page_errors(UINT code_page, PyObject **outbytes, 7129 PyObject *unicode, Py_ssize_t unicode_offset, 7130 Py_ssize_t insize, const char* errors) 7131{ 7132 const DWORD flags = encode_code_page_flags(code_page, errors); 7133 Py_ssize_t pos = unicode_offset; 7134 Py_ssize_t endin = unicode_offset + insize; 7135 /* Ideally, we should get reason from FormatMessage. This is the Windows 7136 2000 English version of the message. 
*/ 7137 const char *reason = "invalid character"; 7138 /* 4=maximum length of a UTF-8 sequence */ 7139 char buffer[4]; 7140 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7141 Py_ssize_t outsize; 7142 char *out; 7143 PyObject *errorHandler = NULL; 7144 PyObject *exc = NULL; 7145 PyObject *encoding_obj = NULL; 7146 char *encoding; 7147 Py_ssize_t newpos, newoutsize; 7148 PyObject *rep; 7149 int ret = -1; 7150 7151 assert(insize > 0); 7152 7153 encoding = code_page_name(code_page, &encoding_obj); 7154 if (encoding == NULL) 7155 return -1; 7156 7157 if (errors == NULL || strcmp(errors, "strict") == 0) { 7158 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7159 then we raise a UnicodeEncodeError. */ 7160 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7161 if (exc != NULL) { 7162 PyCodec_StrictErrors(exc); 7163 Py_DECREF(exc); 7164 } 7165 Py_XDECREF(encoding_obj); 7166 return -1; 7167 } 7168 7169 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7170 pusedDefaultChar = &usedDefaultChar; 7171 else 7172 pusedDefaultChar = NULL; 7173 7174 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7175 PyErr_NoMemory(); 7176 goto error; 7177 } 7178 outsize = insize * Py_ARRAY_LENGTH(buffer); 7179 7180 if (*outbytes == NULL) { 7181 /* Create string object */ 7182 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7183 if (*outbytes == NULL) 7184 goto error; 7185 out = PyBytes_AS_STRING(*outbytes); 7186 } 7187 else { 7188 /* Extend string object */ 7189 Py_ssize_t n = PyBytes_Size(*outbytes); 7190 if (n > PY_SSIZE_T_MAX - outsize) { 7191 PyErr_NoMemory(); 7192 goto error; 7193 } 7194 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7195 goto error; 7196 out = PyBytes_AS_STRING(*outbytes) + n; 7197 } 7198 7199 /* Encode the string character per character */ 7200 while (pos < endin) 7201 { 7202 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7203 wchar_t chars[2]; 7204 int charsize; 7205 if (ch < 0x10000) { 7206 chars[0] = (wchar_t)ch; 7207 charsize = 1; 
7208 } 7209 else { 7210 ch -= 0x10000; 7211 chars[0] = 0xd800 + (ch >> 10); 7212 chars[1] = 0xdc00 + (ch & 0x3ff); 7213 charsize = 2; 7214 } 7215 7216 outsize = WideCharToMultiByte(code_page, flags, 7217 chars, charsize, 7218 buffer, Py_ARRAY_LENGTH(buffer), 7219 NULL, pusedDefaultChar); 7220 if (outsize > 0) { 7221 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7222 { 7223 pos++; 7224 memcpy(out, buffer, outsize); 7225 out += outsize; 7226 continue; 7227 } 7228 } 7229 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7230 PyErr_SetFromWindowsErr(0); 7231 goto error; 7232 } 7233 7234 rep = unicode_encode_call_errorhandler( 7235 errors, &errorHandler, encoding, reason, 7236 unicode, &exc, 7237 pos, pos + 1, &newpos); 7238 if (rep == NULL) 7239 goto error; 7240 pos = newpos; 7241 7242 if (PyBytes_Check(rep)) { 7243 outsize = PyBytes_GET_SIZE(rep); 7244 if (outsize != 1) { 7245 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7246 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7247 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7248 Py_DECREF(rep); 7249 goto error; 7250 } 7251 out = PyBytes_AS_STRING(*outbytes) + offset; 7252 } 7253 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7254 out += outsize; 7255 } 7256 else { 7257 Py_ssize_t i; 7258 enum PyUnicode_Kind kind; 7259 void *data; 7260 7261 if (PyUnicode_READY(rep) == -1) { 7262 Py_DECREF(rep); 7263 goto error; 7264 } 7265 7266 outsize = PyUnicode_GET_LENGTH(rep); 7267 if (outsize != 1) { 7268 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7269 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7270 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7271 Py_DECREF(rep); 7272 goto error; 7273 } 7274 out = PyBytes_AS_STRING(*outbytes) + offset; 7275 } 7276 kind = PyUnicode_KIND(rep); 7277 data = PyUnicode_DATA(rep); 7278 for (i=0; i < outsize; i++) { 7279 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7280 if (ch > 127) { 7281 raise_encode_exception(&exc, 7282 encoding, 
unicode, 7283 pos, pos + 1, 7284 "unable to encode error handler result to ASCII"); 7285 Py_DECREF(rep); 7286 goto error; 7287 } 7288 *out = (unsigned char)ch; 7289 out++; 7290 } 7291 } 7292 Py_DECREF(rep); 7293 } 7294 /* write a NUL byte */ 7295 *out = 0; 7296 outsize = out - PyBytes_AS_STRING(*outbytes); 7297 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7298 if (_PyBytes_Resize(outbytes, outsize) < 0) 7299 goto error; 7300 ret = 0; 7301 7302error: 7303 Py_XDECREF(encoding_obj); 7304 Py_XDECREF(errorHandler); 7305 Py_XDECREF(exc); 7306 return ret; 7307} 7308 7309static PyObject * 7310encode_code_page(int code_page, 7311 PyObject *unicode, 7312 const char *errors) 7313{ 7314 Py_ssize_t len; 7315 PyObject *outbytes = NULL; 7316 Py_ssize_t offset; 7317 int chunk_len, ret, done; 7318 7319 if (PyUnicode_READY(unicode) == -1) 7320 return NULL; 7321 len = PyUnicode_GET_LENGTH(unicode); 7322 7323 if (code_page < 0) { 7324 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7325 return NULL; 7326 } 7327 7328 if (len == 0) 7329 return PyBytes_FromStringAndSize(NULL, 0); 7330 7331 offset = 0; 7332 do 7333 { 7334#ifdef NEED_RETRY 7335 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7336 chunks. 
*/ 7337 if (len > INT_MAX/2) { 7338 chunk_len = INT_MAX/2; 7339 done = 0; 7340 } 7341 else 7342#endif 7343 { 7344 chunk_len = (int)len; 7345 done = 1; 7346 } 7347 7348 ret = encode_code_page_strict(code_page, &outbytes, 7349 unicode, offset, chunk_len, 7350 errors); 7351 if (ret == -2) 7352 ret = encode_code_page_errors(code_page, &outbytes, 7353 unicode, offset, 7354 chunk_len, errors); 7355 if (ret < 0) { 7356 Py_XDECREF(outbytes); 7357 return NULL; 7358 } 7359 7360 offset += chunk_len; 7361 len -= chunk_len; 7362 } while (!done); 7363 7364 return outbytes; 7365} 7366 7367PyObject * 7368PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7369 Py_ssize_t size, 7370 const char *errors) 7371{ 7372 PyObject *unicode, *res; 7373 unicode = PyUnicode_FromUnicode(p, size); 7374 if (unicode == NULL) 7375 return NULL; 7376 res = encode_code_page(CP_ACP, unicode, errors); 7377 Py_DECREF(unicode); 7378 return res; 7379} 7380 7381PyObject * 7382PyUnicode_EncodeCodePage(int code_page, 7383 PyObject *unicode, 7384 const char *errors) 7385{ 7386 return encode_code_page(code_page, unicode, errors); 7387} 7388 7389PyObject * 7390PyUnicode_AsMBCSString(PyObject *unicode) 7391{ 7392 if (!PyUnicode_Check(unicode)) { 7393 PyErr_BadArgument(); 7394 return NULL; 7395 } 7396 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7397} 7398 7399#undef NEED_RETRY 7400 7401#endif /* HAVE_MBCS */ 7402 7403/* --- Character Mapping Codec -------------------------------------------- */ 7404 7405PyObject * 7406PyUnicode_DecodeCharmap(const char *s, 7407 Py_ssize_t size, 7408 PyObject *mapping, 7409 const char *errors) 7410{ 7411 const char *starts = s; 7412 Py_ssize_t startinpos; 7413 Py_ssize_t endinpos; 7414 Py_ssize_t outpos; 7415 const char *e; 7416 PyObject *v; 7417 Py_ssize_t extrachars = 0; 7418 PyObject *errorHandler = NULL; 7419 PyObject *exc = NULL; 7420 7421 /* Default to Latin-1 */ 7422 if (mapping == NULL) 7423 return PyUnicode_DecodeLatin1(s, size, errors); 7424 7425 v = PyUnicode_New(size, 
127); 7426 if (v == NULL) 7427 goto onError; 7428 if (size == 0) 7429 return v; 7430 outpos = 0; 7431 e = s + size; 7432 if (PyUnicode_CheckExact(mapping)) { 7433 Py_ssize_t maplen; 7434 enum PyUnicode_Kind kind; 7435 void *data; 7436 Py_UCS4 x; 7437 7438 if (PyUnicode_READY(mapping) == -1) 7439 return NULL; 7440 7441 maplen = PyUnicode_GET_LENGTH(mapping); 7442 data = PyUnicode_DATA(mapping); 7443 kind = PyUnicode_KIND(mapping); 7444 while (s < e) { 7445 unsigned char ch = *s; 7446 7447 if (ch < maplen) 7448 x = PyUnicode_READ(kind, data, ch); 7449 else 7450 x = 0xfffe; /* invalid value */ 7451 7452 if (x == 0xfffe) 7453 { 7454 /* undefined mapping */ 7455 startinpos = s-starts; 7456 endinpos = startinpos+1; 7457 if (unicode_decode_call_errorhandler( 7458 errors, &errorHandler, 7459 "charmap", "character maps to <undefined>", 7460 &starts, &e, &startinpos, &endinpos, &exc, &s, 7461 &v, &outpos)) { 7462 goto onError; 7463 } 7464 continue; 7465 } 7466 7467 if (unicode_putchar(&v, &outpos, x) < 0) 7468 goto onError; 7469 ++s; 7470 } 7471 } 7472 else { 7473 while (s < e) { 7474 unsigned char ch = *s; 7475 PyObject *w, *x; 7476 7477 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7478 w = PyLong_FromLong((long)ch); 7479 if (w == NULL) 7480 goto onError; 7481 x = PyObject_GetItem(mapping, w); 7482 Py_DECREF(w); 7483 if (x == NULL) { 7484 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7485 /* No mapping found means: mapping is undefined. 
*/ 7486 PyErr_Clear(); 7487 x = Py_None; 7488 Py_INCREF(x); 7489 } else 7490 goto onError; 7491 } 7492 7493 /* Apply mapping */ 7494 if (PyLong_Check(x)) { 7495 long value = PyLong_AS_LONG(x); 7496 if (value < 0 || value > 65535) { 7497 PyErr_SetString(PyExc_TypeError, 7498 "character mapping must be in range(65536)"); 7499 Py_DECREF(x); 7500 goto onError; 7501 } 7502 if (unicode_putchar(&v, &outpos, value) < 0) 7503 goto onError; 7504 } 7505 else if (x == Py_None) { 7506 /* undefined mapping */ 7507 startinpos = s-starts; 7508 endinpos = startinpos+1; 7509 if (unicode_decode_call_errorhandler( 7510 errors, &errorHandler, 7511 "charmap", "character maps to <undefined>", 7512 &starts, &e, &startinpos, &endinpos, &exc, &s, 7513 &v, &outpos)) { 7514 Py_DECREF(x); 7515 goto onError; 7516 } 7517 Py_DECREF(x); 7518 continue; 7519 } 7520 else if (PyUnicode_Check(x)) { 7521 Py_ssize_t targetsize; 7522 7523 if (PyUnicode_READY(x) == -1) 7524 goto onError; 7525 targetsize = PyUnicode_GET_LENGTH(x); 7526 7527 if (targetsize == 1) { 7528 /* 1-1 mapping */ 7529 if (unicode_putchar(&v, &outpos, 7530 PyUnicode_READ_CHAR(x, 0)) < 0) 7531 goto onError; 7532 } 7533 else if (targetsize > 1) { 7534 /* 1-n mapping */ 7535 if (targetsize > extrachars) { 7536 /* resize first */ 7537 Py_ssize_t needed = (targetsize - extrachars) + \ 7538 (targetsize << 2); 7539 extrachars += needed; 7540 /* XXX overflow detection missing */ 7541 if (unicode_resize(&v, 7542 PyUnicode_GET_LENGTH(v) + needed) < 0) 7543 { 7544 Py_DECREF(x); 7545 goto onError; 7546 } 7547 } 7548 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0) 7549 goto onError; 7550 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize); 7551 outpos += targetsize; 7552 extrachars -= targetsize; 7553 } 7554 /* 1-0 mapping: skip the character */ 7555 } 7556 else { 7557 /* wrong return value */ 7558 PyErr_SetString(PyExc_TypeError, 7559 "character mapping must return integer, None or str"); 7560 Py_DECREF(x); 7561 goto onError; 
7562 } 7563 Py_DECREF(x); 7564 ++s; 7565 } 7566 } 7567 if (unicode_resize(&v, outpos) < 0) 7568 goto onError; 7569 Py_XDECREF(errorHandler); 7570 Py_XDECREF(exc); 7571 return unicode_result(v); 7572 7573 onError: 7574 Py_XDECREF(errorHandler); 7575 Py_XDECREF(exc); 7576 Py_XDECREF(v); 7577 return NULL; 7578} 7579 7580/* Charmap encoding: the lookup table */ 7581 7582struct encoding_map { 7583 PyObject_HEAD 7584 unsigned char level1[32]; 7585 int count2, count3; 7586 unsigned char level23[1]; 7587}; 7588 7589static PyObject* 7590encoding_map_size(PyObject *obj, PyObject* args) 7591{ 7592 struct encoding_map *map = (struct encoding_map*)obj; 7593 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7594 128*map->count3); 7595} 7596 7597static PyMethodDef encoding_map_methods[] = { 7598 {"size", encoding_map_size, METH_NOARGS, 7599 PyDoc_STR("Return the size (in bytes) of this object") }, 7600 { 0 } 7601}; 7602 7603static void 7604encoding_map_dealloc(PyObject* o) 7605{ 7606 PyObject_FREE(o); 7607} 7608 7609static PyTypeObject EncodingMapType = { 7610 PyVarObject_HEAD_INIT(NULL, 0) 7611 "EncodingMap", /*tp_name*/ 7612 sizeof(struct encoding_map), /*tp_basicsize*/ 7613 0, /*tp_itemsize*/ 7614 /* methods */ 7615 encoding_map_dealloc, /*tp_dealloc*/ 7616 0, /*tp_print*/ 7617 0, /*tp_getattr*/ 7618 0, /*tp_setattr*/ 7619 0, /*tp_reserved*/ 7620 0, /*tp_repr*/ 7621 0, /*tp_as_number*/ 7622 0, /*tp_as_sequence*/ 7623 0, /*tp_as_mapping*/ 7624 0, /*tp_hash*/ 7625 0, /*tp_call*/ 7626 0, /*tp_str*/ 7627 0, /*tp_getattro*/ 7628 0, /*tp_setattro*/ 7629 0, /*tp_as_buffer*/ 7630 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7631 0, /*tp_doc*/ 7632 0, /*tp_traverse*/ 7633 0, /*tp_clear*/ 7634 0, /*tp_richcompare*/ 7635 0, /*tp_weaklistoffset*/ 7636 0, /*tp_iter*/ 7637 0, /*tp_iternext*/ 7638 encoding_map_methods, /*tp_methods*/ 7639 0, /*tp_members*/ 7640 0, /*tp_getset*/ 7641 0, /*tp_base*/ 7642 0, /*tp_dict*/ 7643 0, /*tp_descr_get*/ 7644 0, /*tp_descr_set*/ 7645 0, /*tp_dictoffset*/ 
7646 0, /*tp_init*/ 7647 0, /*tp_alloc*/ 7648 0, /*tp_new*/ 7649 0, /*tp_free*/ 7650 0, /*tp_is_gc*/ 7651}; 7652 7653PyObject* 7654PyUnicode_BuildEncodingMap(PyObject* string) 7655{ 7656 PyObject *result; 7657 struct encoding_map *mresult; 7658 int i; 7659 int need_dict = 0; 7660 unsigned char level1[32]; 7661 unsigned char level2[512]; 7662 unsigned char *mlevel1, *mlevel2, *mlevel3; 7663 int count2 = 0, count3 = 0; 7664 int kind; 7665 void *data; 7666 Py_UCS4 ch; 7667 7668 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7669 PyErr_BadArgument(); 7670 return NULL; 7671 } 7672 kind = PyUnicode_KIND(string); 7673 data = PyUnicode_DATA(string); 7674 memset(level1, 0xFF, sizeof level1); 7675 memset(level2, 0xFF, sizeof level2); 7676 7677 /* If there isn't a one-to-one mapping of NULL to \0, 7678 or if there are non-BMP characters, we need to use 7679 a mapping dictionary. */ 7680 if (PyUnicode_READ(kind, data, 0) != 0) 7681 need_dict = 1; 7682 for (i = 1; i < 256; i++) { 7683 int l1, l2; 7684 ch = PyUnicode_READ(kind, data, i); 7685 if (ch == 0 || ch > 0xFFFF) { 7686 need_dict = 1; 7687 break; 7688 } 7689 if (ch == 0xFFFE) 7690 /* unmapped character */ 7691 continue; 7692 l1 = ch >> 11; 7693 l2 = ch >> 7; 7694 if (level1[l1] == 0xFF) 7695 level1[l1] = count2++; 7696 if (level2[l2] == 0xFF) 7697 level2[l2] = count3++; 7698 } 7699 7700 if (count2 >= 0xFF || count3 >= 0xFF) 7701 need_dict = 1; 7702 7703 if (need_dict) { 7704 PyObject *result = PyDict_New(); 7705 PyObject *key, *value; 7706 if (!result) 7707 return NULL; 7708 for (i = 0; i < 256; i++) { 7709 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7710 value = PyLong_FromLong(i); 7711 if (!key || !value) 7712 goto failed1; 7713 if (PyDict_SetItem(result, key, value) == -1) 7714 goto failed1; 7715 Py_DECREF(key); 7716 Py_DECREF(value); 7717 } 7718 return result; 7719 failed1: 7720 Py_XDECREF(key); 7721 Py_XDECREF(value); 7722 Py_DECREF(result); 7723 return NULL; 7724 } 7725 7726 /* 
Create a three-level trie */ 7727 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7728 16*count2 + 128*count3 - 1); 7729 if (!result) 7730 return PyErr_NoMemory(); 7731 PyObject_Init(result, &EncodingMapType); 7732 mresult = (struct encoding_map*)result; 7733 mresult->count2 = count2; 7734 mresult->count3 = count3; 7735 mlevel1 = mresult->level1; 7736 mlevel2 = mresult->level23; 7737 mlevel3 = mresult->level23 + 16*count2; 7738 memcpy(mlevel1, level1, 32); 7739 memset(mlevel2, 0xFF, 16*count2); 7740 memset(mlevel3, 0, 128*count3); 7741 count3 = 0; 7742 for (i = 1; i < 256; i++) { 7743 int o1, o2, o3, i2, i3; 7744 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7745 /* unmapped character */ 7746 continue; 7747 o1 = PyUnicode_READ(kind, data, i)>>11; 7748 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7749 i2 = 16*mlevel1[o1] + o2; 7750 if (mlevel2[i2] == 0xFF) 7751 mlevel2[i2] = count3++; 7752 o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7753 i3 = 128*mlevel2[i2] + o3; 7754 mlevel3[i3] = i; 7755 } 7756 return result; 7757} 7758 7759static int 7760encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7761{ 7762 struct encoding_map *map = (struct encoding_map*)mapping; 7763 int l1 = c>>11; 7764 int l2 = (c>>7) & 0xF; 7765 int l3 = c & 0x7F; 7766 int i; 7767 7768 if (c > 0xFFFF) 7769 return -1; 7770 if (c == 0) 7771 return 0; 7772 /* level 1*/ 7773 i = map->level1[l1]; 7774 if (i == 0xFF) { 7775 return -1; 7776 } 7777 /* level 2*/ 7778 i = map->level23[16*i+l2]; 7779 if (i == 0xFF) { 7780 return -1; 7781 } 7782 /* level 3 */ 7783 i = map->level23[16*map->count2 + 128*i + l3]; 7784 if (i == 0) { 7785 return -1; 7786 } 7787 return i; 7788} 7789 7790/* Lookup the character ch in the mapping. If the character 7791 can't be found, Py_None is returned (or NULL, if another 7792 error occurred). 
*/ 7793static PyObject * 7794charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 7795{ 7796 PyObject *w = PyLong_FromLong((long)c); 7797 PyObject *x; 7798 7799 if (w == NULL) 7800 return NULL; 7801 x = PyObject_GetItem(mapping, w); 7802 Py_DECREF(w); 7803 if (x == NULL) { 7804 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7805 /* No mapping found means: mapping is undefined. */ 7806 PyErr_Clear(); 7807 x = Py_None; 7808 Py_INCREF(x); 7809 return x; 7810 } else 7811 return NULL; 7812 } 7813 else if (x == Py_None) 7814 return x; 7815 else if (PyLong_Check(x)) { 7816 long value = PyLong_AS_LONG(x); 7817 if (value < 0 || value > 255) { 7818 PyErr_SetString(PyExc_TypeError, 7819 "character mapping must be in range(256)"); 7820 Py_DECREF(x); 7821 return NULL; 7822 } 7823 return x; 7824 } 7825 else if (PyBytes_Check(x)) 7826 return x; 7827 else { 7828 /* wrong return value */ 7829 PyErr_Format(PyExc_TypeError, 7830 "character mapping must return integer, bytes or None, not %.400s", 7831 x->ob_type->tp_name); 7832 Py_DECREF(x); 7833 return NULL; 7834 } 7835} 7836 7837static int 7838charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7839{ 7840 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7841 /* exponentially overallocate to minimize reallocations */ 7842 if (requiredsize < 2*outsize) 7843 requiredsize = 2*outsize; 7844 if (_PyBytes_Resize(outobj, requiredsize)) 7845 return -1; 7846 return 0; 7847} 7848 7849typedef enum charmapencode_result { 7850 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7851} charmapencode_result; 7852/* lookup the character, put the result in the output string and adjust 7853 various state variables. Resize the output bytes object if not enough 7854 space is available. Return a new reference to the object that 7855 was put in the output buffer, or Py_None, if the mapping was undefined 7856 (in which case no character was written) or NULL, if a 7857 reallocation error occurred. 
The caller must decref the result */
/* NOTE: as implemented below, this function returns a charmapencode_result
   status rather than an object reference:
     enc_SUCCESS   - the encoded byte(s) were written at *outpos, which was
                     advanced accordingly;
     enc_FAILED    - the mapping is undefined for c, nothing was written;
     enc_EXCEPTION - the lookup or the resize failed. */
static charmapencode_result
charmapencode_output(Py_UCS4 c, PyObject *mapping,
                     PyObject **outobj, Py_ssize_t *outpos)
{
    PyObject *rep;
    char *outstart;
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);

    /* Fast path: EncodingMap objects are looked up directly and always
       produce exactly one byte. */
    if (Py_TYPE(mapping) == &EncodingMapType) {
        int res = encoding_map_lookup(c, mapping);
        Py_ssize_t requiredsize = *outpos+1;
        if (res == -1)
            return enc_FAILED;
        if (outsize<requiredsize)
            if (charmapencode_resize(outobj, outpos, requiredsize))
                return enc_EXCEPTION;
        outstart = PyBytes_AS_STRING(*outobj);
        outstart[(*outpos)++] = (char)res;
        return enc_SUCCESS;
    }

    /* Generic path: rep is a new reference to Py_None, an int or bytes. */
    rep = charmapencode_lookup(c, mapping);
    if (rep==NULL)
        return enc_EXCEPTION;
    else if (rep==Py_None) {
        Py_DECREF(rep);
        return enc_FAILED;
    } else {
        if (PyLong_Check(rep)) {
            /* a single byte value */
            Py_ssize_t requiredsize = *outpos+1;
            if (outsize<requiredsize)
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
                    Py_DECREF(rep);
                    return enc_EXCEPTION;
                }
            outstart = PyBytes_AS_STRING(*outobj);
            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
        }
        else {
            /* a bytes object: copy all of its contents */
            const char *repchars = PyBytes_AS_STRING(rep);
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
            Py_ssize_t requiredsize = *outpos+repsize;
            if (outsize<requiredsize)
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
                    Py_DECREF(rep);
                    return enc_EXCEPTION;
                }
            outstart = PyBytes_AS_STRING(*outobj);
            memcpy(outstart + *outpos, repchars, repsize);
            *outpos += repsize;
        }
    }
    Py_DECREF(rep);
    return enc_SUCCESS;
}

/* handle an error in PyUnicode_EncodeCharmap
   Return 0 on success, -1 on error

   On entry *inpos is the index of an unencodable character. The run of
   consecutive unencodable characters starting there is collected and
   handled according to errors; on success *inpos is moved past the handled
   input and any replacement bytes are appended to *res at *respos.
   *known_errorHandler caches which built-in handler errors names
   (-1 = not looked up yet). */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            /* first encodable character: the run ends here */
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*known_errorHandler==-1) {
        if ((errors==NULL) || (!strcmp(errors, "strict")))
            *known_errorHandler = 1;
        else if (!strcmp(errors, "replace"))
            *known_errorHandler = 2;
        else if (!strcmp(errors, "ignore"))
            *known_errorHandler = 3;
        else if (!strcmp(errors, "xmlcharrefreplace"))
            *known_errorHandler = 4;
        else
            *known_errorHandler = 0;
    }
    switch (*known_errorHandler) {
    case 1: /* strict */
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;
    case 2: /* replace */
        /* emit one '?' per collected character; '?' itself must be
           encodable through the mapping */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case 3: /* ignore */
        *inpos = collendpos;
        break;
    case 4: /* xmlcharrefreplace */
        /* generate replacement: "&#N;" for each collected character, with
           every character of that text encoded through the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;
    default:
        /* any other handler name: call the registered error callback */
        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: the handler returned a str, so each of its
           characters is encoded through the mapping */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}

/* Encode the str object unicode through mapping.

   mapping == NULL delegates to unicode_encode_ucs1(unicode, errors, 256).
   Characters the mapping cannot encode are passed to
   charmap_encoding_error() together with errors.
   Returns a new bytes object, or NULL on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* current input position */
    Py_ssize_t inpos = 0;
    Py_ssize_t size;
    /* current output position */
    Py_ssize_t respos = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    size = PyUnicode_GET_LENGTH(unicode);

    /* Default to Latin-1 */
    if (mapping == NULL)
        return unicode_encode_ucs1(unicode, errors, 256);

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;

    while (inpos<size) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
        /* try to encode it */
        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
        if (x==enc_EXCEPTION) /* error */
            goto onError;
        if (x==enc_FAILED) { /* unencodable character */
            /* the error helper advances inpos itself */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &known_errorHandler, &errorHandler, errors,
                                       &res, &respos)) {
                goto onError;
            }
        }
        else
            /* done with this character => adjust input position */
            ++inpos;
    }

    /* Resize if we allocated too much */
    if (respos<PyBytes_GET_SIZE(res))
        if (_PyBytes_Resize(&res, respos) < 0)
            goto onError;

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}

/* Deprecated */
/* Builds a temporary str object from the Py_UNICODE buffer and forwards to
   _PyUnicode_EncodeCharmap(). */
PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE *p,
                        Py_ssize_t size,
                        PyObject *mapping,
                        const char *errors)
{
    PyObject *result;
    PyObject *unicode = PyUnicode_FromUnicode(p, size);
    if (unicode == NULL)
        return NULL;
    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
    Py_DECREF(unicode);
    return result;
}

/* Encode unicode through mapping with errors == NULL.
   Raises via PyErr_BadArgument() if unicode is not a str or mapping is
   NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (!PyUnicode_Check(unicode) || mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }
    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
}

/* create or adjust a
UnicodeTranslateError */ 8166static void 8167make_translate_exception(PyObject **exceptionObject, 8168 PyObject *unicode, 8169 Py_ssize_t startpos, Py_ssize_t endpos, 8170 const char *reason) 8171{ 8172 if (*exceptionObject == NULL) { 8173 *exceptionObject = _PyUnicodeTranslateError_Create( 8174 unicode, startpos, endpos, reason); 8175 } 8176 else { 8177 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8178 goto onError; 8179 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8180 goto onError; 8181 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8182 goto onError; 8183 return; 8184 onError: 8185 Py_DECREF(*exceptionObject); 8186 *exceptionObject = NULL; 8187 } 8188} 8189 8190/* raises a UnicodeTranslateError */ 8191static void 8192raise_translate_exception(PyObject **exceptionObject, 8193 PyObject *unicode, 8194 Py_ssize_t startpos, Py_ssize_t endpos, 8195 const char *reason) 8196{ 8197 make_translate_exception(exceptionObject, 8198 unicode, startpos, endpos, reason); 8199 if (*exceptionObject != NULL) 8200 PyCodec_StrictErrors(*exceptionObject); 8201} 8202 8203/* error handling callback helper: 8204 build arguments, call the callback and check the arguments, 8205 put the result into newpos and return the replacement string, which 8206 has to be freed by the caller */ 8207static PyObject * 8208unicode_translate_call_errorhandler(const char *errors, 8209 PyObject **errorHandler, 8210 const char *reason, 8211 PyObject *unicode, PyObject **exceptionObject, 8212 Py_ssize_t startpos, Py_ssize_t endpos, 8213 Py_ssize_t *newpos) 8214{ 8215 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8216 8217 Py_ssize_t i_newpos; 8218 PyObject *restuple; 8219 PyObject *resunicode; 8220 8221 if (*errorHandler == NULL) { 8222 *errorHandler = PyCodec_LookupError(errors); 8223 if (*errorHandler == NULL) 8224 return NULL; 8225 } 8226 8227 make_translate_exception(exceptionObject, 8228 unicode, startpos, 
endpos, reason); 8229 if (*exceptionObject == NULL) 8230 return NULL; 8231 8232 restuple = PyObject_CallFunctionObjArgs( 8233 *errorHandler, *exceptionObject, NULL); 8234 if (restuple == NULL) 8235 return NULL; 8236 if (!PyTuple_Check(restuple)) { 8237 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8238 Py_DECREF(restuple); 8239 return NULL; 8240 } 8241 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8242 &resunicode, &i_newpos)) { 8243 Py_DECREF(restuple); 8244 return NULL; 8245 } 8246 if (i_newpos<0) 8247 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8248 else 8249 *newpos = i_newpos; 8250 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8251 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8252 Py_DECREF(restuple); 8253 return NULL; 8254 } 8255 Py_INCREF(resunicode); 8256 Py_DECREF(restuple); 8257 return resunicode; 8258} 8259 8260/* Lookup the character ch in the mapping and put the result in result, 8261 which must be decrefed by the caller. 8262 Return 0 on success, -1 on error */ 8263static int 8264charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8265{ 8266 PyObject *w = PyLong_FromLong((long)c); 8267 PyObject *x; 8268 8269 if (w == NULL) 8270 return -1; 8271 x = PyObject_GetItem(mapping, w); 8272 Py_DECREF(w); 8273 if (x == NULL) { 8274 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8275 /* No mapping found means: use 1:1 mapping. 
*/ 8276 PyErr_Clear(); 8277 *result = NULL; 8278 return 0; 8279 } else 8280 return -1; 8281 } 8282 else if (x == Py_None) { 8283 *result = x; 8284 return 0; 8285 } 8286 else if (PyLong_Check(x)) { 8287 long value = PyLong_AS_LONG(x); 8288 long max = PyUnicode_GetMax(); 8289 if (value < 0 || value > max) { 8290 PyErr_Format(PyExc_TypeError, 8291 "character mapping must be in range(0x%x)", max+1); 8292 Py_DECREF(x); 8293 return -1; 8294 } 8295 *result = x; 8296 return 0; 8297 } 8298 else if (PyUnicode_Check(x)) { 8299 *result = x; 8300 return 0; 8301 } 8302 else { 8303 /* wrong return value */ 8304 PyErr_SetString(PyExc_TypeError, 8305 "character mapping must return integer, None or str"); 8306 Py_DECREF(x); 8307 return -1; 8308 } 8309} 8310/* ensure that *outobj is at least requiredsize characters long, 8311 if not reallocate and adjust various state variables. 8312 Return 0 on success, -1 on error */ 8313static int 8314charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8315 Py_ssize_t requiredsize) 8316{ 8317 Py_ssize_t oldsize = *psize; 8318 Py_UCS4 *new_outobj; 8319 if (requiredsize > oldsize) { 8320 /* exponentially overallocate to minimize reallocations */ 8321 if (requiredsize < 2 * oldsize) 8322 requiredsize = 2 * oldsize; 8323 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8324 if (new_outobj == 0) 8325 return -1; 8326 *outobj = new_outobj; 8327 *psize = requiredsize; 8328 } 8329 return 0; 8330} 8331/* lookup the character, put the result in the output string and adjust 8332 various state variables. Return a new reference to the object that 8333 was put in the output buffer in *result, or Py_None, if the mapping was 8334 undefined (in which case no character was written). 8335 The called must decref result. 8336 Return 0 on success, -1 on error. 
*/ 8337static int 8338charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8339 PyObject *mapping, Py_UCS4 **output, 8340 Py_ssize_t *osize, Py_ssize_t *opos, 8341 PyObject **res) 8342{ 8343 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8344 if (charmaptranslate_lookup(curinp, mapping, res)) 8345 return -1; 8346 if (*res==NULL) { 8347 /* not found => default to 1:1 mapping */ 8348 (*output)[(*opos)++] = curinp; 8349 } 8350 else if (*res==Py_None) 8351 ; 8352 else if (PyLong_Check(*res)) { 8353 /* no overflow check, because we know that the space is enough */ 8354 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8355 } 8356 else if (PyUnicode_Check(*res)) { 8357 Py_ssize_t repsize; 8358 if (PyUnicode_READY(*res) == -1) 8359 return -1; 8360 repsize = PyUnicode_GET_LENGTH(*res); 8361 if (repsize==1) { 8362 /* no overflow check, because we know that the space is enough */ 8363 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8364 } 8365 else if (repsize!=0) { 8366 /* more than one character */ 8367 Py_ssize_t requiredsize = *opos + 8368 (PyUnicode_GET_LENGTH(input) - ipos) + 8369 repsize - 1; 8370 Py_ssize_t i; 8371 if (charmaptranslate_makespace(output, osize, requiredsize)) 8372 return -1; 8373 for(i = 0; i < repsize; i++) 8374 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8375 } 8376 } 8377 else 8378 return -1; 8379 return 0; 8380} 8381 8382PyObject * 8383_PyUnicode_TranslateCharmap(PyObject *input, 8384 PyObject *mapping, 8385 const char *errors) 8386{ 8387 /* input object */ 8388 char *idata; 8389 Py_ssize_t size, i; 8390 int kind; 8391 /* output buffer */ 8392 Py_UCS4 *output = NULL; 8393 Py_ssize_t osize; 8394 PyObject *res; 8395 /* current output position */ 8396 Py_ssize_t opos; 8397 char *reason = "character maps to <undefined>"; 8398 PyObject *errorHandler = NULL; 8399 PyObject *exc = NULL; 8400 /* the following variable is used for caching string comparisons 8401 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8402 * 
3=ignore, 4=xmlcharrefreplace */ 8403 int known_errorHandler = -1; 8404 8405 if (mapping == NULL) { 8406 PyErr_BadArgument(); 8407 return NULL; 8408 } 8409 8410 if (PyUnicode_READY(input) == -1) 8411 return NULL; 8412 idata = (char*)PyUnicode_DATA(input); 8413 kind = PyUnicode_KIND(input); 8414 size = PyUnicode_GET_LENGTH(input); 8415 i = 0; 8416 8417 if (size == 0) { 8418 Py_INCREF(input); 8419 return input; 8420 } 8421 8422 /* allocate enough for a simple 1:1 translation without 8423 replacements, if we need more, we'll resize */ 8424 osize = size; 8425 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8426 opos = 0; 8427 if (output == NULL) { 8428 PyErr_NoMemory(); 8429 goto onError; 8430 } 8431 8432 while (i<size) { 8433 /* try to encode it */ 8434 PyObject *x = NULL; 8435 if (charmaptranslate_output(input, i, mapping, 8436 &output, &osize, &opos, &x)) { 8437 Py_XDECREF(x); 8438 goto onError; 8439 } 8440 Py_XDECREF(x); 8441 if (x!=Py_None) /* it worked => adjust input pointer */ 8442 ++i; 8443 else { /* untranslatable character */ 8444 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8445 Py_ssize_t repsize; 8446 Py_ssize_t newpos; 8447 Py_ssize_t uni2; 8448 /* startpos for collecting untranslatable chars */ 8449 Py_ssize_t collstart = i; 8450 Py_ssize_t collend = i+1; 8451 Py_ssize_t coll; 8452 8453 /* find all untranslatable characters */ 8454 while (collend < size) { 8455 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8456 goto onError; 8457 Py_XDECREF(x); 8458 if (x!=Py_None) 8459 break; 8460 ++collend; 8461 } 8462 /* cache callback name lookup 8463 * (if not done yet, i.e. 
it's the first error) */ 8464 if (known_errorHandler==-1) { 8465 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8466 known_errorHandler = 1; 8467 else if (!strcmp(errors, "replace")) 8468 known_errorHandler = 2; 8469 else if (!strcmp(errors, "ignore")) 8470 known_errorHandler = 3; 8471 else if (!strcmp(errors, "xmlcharrefreplace")) 8472 known_errorHandler = 4; 8473 else 8474 known_errorHandler = 0; 8475 } 8476 switch (known_errorHandler) { 8477 case 1: /* strict */ 8478 raise_translate_exception(&exc, input, collstart, 8479 collend, reason); 8480 goto onError; 8481 case 2: /* replace */ 8482 /* No need to check for space, this is a 1:1 replacement */ 8483 for (coll = collstart; coll<collend; coll++) 8484 output[opos++] = '?'; 8485 /* fall through */ 8486 case 3: /* ignore */ 8487 i = collend; 8488 break; 8489 case 4: /* xmlcharrefreplace */ 8490 /* generate replacement (temporarily (mis)uses i) */ 8491 for (i = collstart; i < collend; ++i) { 8492 char buffer[2+29+1+1]; 8493 char *cp; 8494 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8495 if (charmaptranslate_makespace(&output, &osize, 8496 opos+strlen(buffer)+(size-collend))) 8497 goto onError; 8498 for (cp = buffer; *cp; ++cp) 8499 output[opos++] = *cp; 8500 } 8501 i = collend; 8502 break; 8503 default: 8504 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8505 reason, input, &exc, 8506 collstart, collend, &newpos); 8507 if (repunicode == NULL) 8508 goto onError; 8509 if (PyUnicode_READY(repunicode) == -1) { 8510 Py_DECREF(repunicode); 8511 goto onError; 8512 } 8513 /* generate replacement */ 8514 repsize = PyUnicode_GET_LENGTH(repunicode); 8515 if (charmaptranslate_makespace(&output, &osize, 8516 opos+repsize+(size-collend))) { 8517 Py_DECREF(repunicode); 8518 goto onError; 8519 } 8520 for (uni2 = 0; repsize-->0; ++uni2) 8521 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8522 i = newpos; 8523 Py_DECREF(repunicode); 8524 } 8525 } 8526 } 8527 res = 
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8528 if (!res) 8529 goto onError; 8530 PyMem_Free(output); 8531 Py_XDECREF(exc); 8532 Py_XDECREF(errorHandler); 8533 return res; 8534 8535 onError: 8536 PyMem_Free(output); 8537 Py_XDECREF(exc); 8538 Py_XDECREF(errorHandler); 8539 return NULL; 8540} 8541 8542/* Deprecated. Use PyUnicode_Translate instead. */ 8543PyObject * 8544PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8545 Py_ssize_t size, 8546 PyObject *mapping, 8547 const char *errors) 8548{ 8549 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8550 if (!unicode) 8551 return NULL; 8552 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8553} 8554 8555PyObject * 8556PyUnicode_Translate(PyObject *str, 8557 PyObject *mapping, 8558 const char *errors) 8559{ 8560 PyObject *result; 8561 8562 str = PyUnicode_FromObject(str); 8563 if (str == NULL) 8564 goto onError; 8565 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8566 Py_DECREF(str); 8567 return result; 8568 8569 onError: 8570 Py_XDECREF(str); 8571 return NULL; 8572} 8573 8574static Py_UCS4 8575fix_decimal_and_space_to_ascii(PyObject *self) 8576{ 8577 /* No need to call PyUnicode_READY(self) because this function is only 8578 called as a callback from fixup() which does it already. 
*/ 8579 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8580 const int kind = PyUnicode_KIND(self); 8581 void *data = PyUnicode_DATA(self); 8582 Py_UCS4 maxchar = 127, ch, fixed; 8583 int modified = 0; 8584 Py_ssize_t i; 8585 8586 for (i = 0; i < len; ++i) { 8587 ch = PyUnicode_READ(kind, data, i); 8588 fixed = 0; 8589 if (ch > 127) { 8590 if (Py_UNICODE_ISSPACE(ch)) 8591 fixed = ' '; 8592 else { 8593 const int decimal = Py_UNICODE_TODECIMAL(ch); 8594 if (decimal >= 0) 8595 fixed = '0' + decimal; 8596 } 8597 if (fixed != 0) { 8598 modified = 1; 8599 maxchar = MAX_MAXCHAR(maxchar, fixed); 8600 PyUnicode_WRITE(kind, data, i, fixed); 8601 } 8602 else 8603 maxchar = MAX_MAXCHAR(maxchar, ch); 8604 } 8605 } 8606 8607 return (modified) ? maxchar : 0; 8608} 8609 8610PyObject * 8611_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8612{ 8613 if (!PyUnicode_Check(unicode)) { 8614 PyErr_BadInternalCall(); 8615 return NULL; 8616 } 8617 if (PyUnicode_READY(unicode) == -1) 8618 return NULL; 8619 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8620 /* If the string is already ASCII, just return the same string */ 8621 Py_INCREF(unicode); 8622 return unicode; 8623 } 8624 return fixup(unicode, fix_decimal_and_space_to_ascii); 8625} 8626 8627PyObject * 8628PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8629 Py_ssize_t length) 8630{ 8631 PyObject *decimal; 8632 Py_ssize_t i; 8633 Py_UCS4 maxchar; 8634 enum PyUnicode_Kind kind; 8635 void *data; 8636 8637 maxchar = 127; 8638 for (i = 0; i < length; i++) { 8639 Py_UNICODE ch = s[i]; 8640 if (ch > 127) { 8641 int decimal = Py_UNICODE_TODECIMAL(ch); 8642 if (decimal >= 0) 8643 ch = '0' + decimal; 8644 maxchar = MAX_MAXCHAR(maxchar, ch); 8645 } 8646 } 8647 8648 /* Copy to a new string */ 8649 decimal = PyUnicode_New(length, maxchar); 8650 if (decimal == NULL) 8651 return decimal; 8652 kind = PyUnicode_KIND(decimal); 8653 data = PyUnicode_DATA(decimal); 8654 /* Iterate over code points */ 8655 for (i = 0; i < length; i++) { 
8656 Py_UNICODE ch = s[i]; 8657 if (ch > 127) { 8658 int decimal = Py_UNICODE_TODECIMAL(ch); 8659 if (decimal >= 0) 8660 ch = '0' + decimal; 8661 } 8662 PyUnicode_WRITE(kind, data, i, ch); 8663 } 8664 return unicode_result(decimal); 8665} 8666/* --- Decimal Encoder ---------------------------------------------------- */ 8667 8668int 8669PyUnicode_EncodeDecimal(Py_UNICODE *s, 8670 Py_ssize_t length, 8671 char *output, 8672 const char *errors) 8673{ 8674 PyObject *unicode; 8675 Py_ssize_t i; 8676 enum PyUnicode_Kind kind; 8677 void *data; 8678 8679 if (output == NULL) { 8680 PyErr_BadArgument(); 8681 return -1; 8682 } 8683 8684 unicode = PyUnicode_FromUnicode(s, length); 8685 if (unicode == NULL) 8686 return -1; 8687 8688 if (PyUnicode_READY(unicode) == -1) { 8689 Py_DECREF(unicode); 8690 return -1; 8691 } 8692 kind = PyUnicode_KIND(unicode); 8693 data = PyUnicode_DATA(unicode); 8694 8695 for (i=0; i < length; ) { 8696 PyObject *exc; 8697 Py_UCS4 ch; 8698 int decimal; 8699 Py_ssize_t startpos; 8700 8701 ch = PyUnicode_READ(kind, data, i); 8702 8703 if (Py_UNICODE_ISSPACE(ch)) { 8704 *output++ = ' '; 8705 i++; 8706 continue; 8707 } 8708 decimal = Py_UNICODE_TODECIMAL(ch); 8709 if (decimal >= 0) { 8710 *output++ = '0' + decimal; 8711 i++; 8712 continue; 8713 } 8714 if (0 < ch && ch < 256) { 8715 *output++ = (char)ch; 8716 i++; 8717 continue; 8718 } 8719 8720 startpos = i; 8721 exc = NULL; 8722 raise_encode_exception(&exc, "decimal", unicode, 8723 startpos, startpos+1, 8724 "invalid decimal Unicode string"); 8725 Py_XDECREF(exc); 8726 Py_DECREF(unicode); 8727 return -1; 8728 } 8729 /* 0-terminate the output string */ 8730 *output++ = '\0'; 8731 Py_DECREF(unicode); 8732 return 0; 8733} 8734 8735/* --- Helpers ------------------------------------------------------------ */ 8736 8737static Py_ssize_t 8738any_find_slice(int direction, PyObject* s1, PyObject* s2, 8739 Py_ssize_t start, 8740 Py_ssize_t end) 8741{ 8742 int kind1, kind2, kind; 8743 void *buf1, *buf2; 8744 
Py_ssize_t len1, len2, result; 8745 8746 kind1 = PyUnicode_KIND(s1); 8747 kind2 = PyUnicode_KIND(s2); 8748 kind = kind1 > kind2 ? kind1 : kind2; 8749 buf1 = PyUnicode_DATA(s1); 8750 buf2 = PyUnicode_DATA(s2); 8751 if (kind1 != kind) 8752 buf1 = _PyUnicode_AsKind(s1, kind); 8753 if (!buf1) 8754 return -2; 8755 if (kind2 != kind) 8756 buf2 = _PyUnicode_AsKind(s2, kind); 8757 if (!buf2) { 8758 if (kind1 != kind) PyMem_Free(buf1); 8759 return -2; 8760 } 8761 len1 = PyUnicode_GET_LENGTH(s1); 8762 len2 = PyUnicode_GET_LENGTH(s2); 8763 8764 if (direction > 0) { 8765 switch (kind) { 8766 case PyUnicode_1BYTE_KIND: 8767 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8768 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 8769 else 8770 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 8771 break; 8772 case PyUnicode_2BYTE_KIND: 8773 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 8774 break; 8775 case PyUnicode_4BYTE_KIND: 8776 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 8777 break; 8778 default: 8779 assert(0); result = -2; 8780 } 8781 } 8782 else { 8783 switch (kind) { 8784 case PyUnicode_1BYTE_KIND: 8785 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8786 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 8787 else 8788 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8789 break; 8790 case PyUnicode_2BYTE_KIND: 8791 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8792 break; 8793 case PyUnicode_4BYTE_KIND: 8794 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8795 break; 8796 default: 8797 assert(0); result = -2; 8798 } 8799 } 8800 8801 if (kind1 != kind) 8802 PyMem_Free(buf1); 8803 if (kind2 != kind) 8804 PyMem_Free(buf2); 8805 8806 return result; 8807} 8808 8809Py_ssize_t 8810_PyUnicode_InsertThousandsGrouping( 8811 PyObject *unicode, Py_ssize_t index, 8812 Py_ssize_t n_buffer, 8813 void *digits, Py_ssize_t n_digits, 8814 
Py_ssize_t min_width, 8815 const char *grouping, PyObject *thousands_sep, 8816 Py_UCS4 *maxchar) 8817{ 8818 unsigned int kind, thousands_sep_kind; 8819 char *data, *thousands_sep_data; 8820 Py_ssize_t thousands_sep_len; 8821 Py_ssize_t len; 8822 8823 if (unicode != NULL) { 8824 kind = PyUnicode_KIND(unicode); 8825 data = (char *) PyUnicode_DATA(unicode) + index * kind; 8826 } 8827 else { 8828 kind = PyUnicode_1BYTE_KIND; 8829 data = NULL; 8830 } 8831 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 8832 thousands_sep_data = PyUnicode_DATA(thousands_sep); 8833 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 8834 if (unicode != NULL && thousands_sep_kind != kind) { 8835 if (thousands_sep_kind < kind) { 8836 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 8837 if (!thousands_sep_data) 8838 return -1; 8839 } 8840 else { 8841 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 8842 if (!data) 8843 return -1; 8844 } 8845 } 8846 8847 switch (kind) { 8848 case PyUnicode_1BYTE_KIND: 8849 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 8850 len = asciilib_InsertThousandsGrouping( 8851 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 8852 min_width, grouping, 8853 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8854 else 8855 len = ucs1lib_InsertThousandsGrouping( 8856 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8857 min_width, grouping, 8858 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8859 break; 8860 case PyUnicode_2BYTE_KIND: 8861 len = ucs2lib_InsertThousandsGrouping( 8862 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 8863 min_width, grouping, 8864 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 8865 break; 8866 case PyUnicode_4BYTE_KIND: 8867 len = ucs4lib_InsertThousandsGrouping( 8868 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 8869 min_width, grouping, 8870 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 8871 break; 8872 default: 8873 assert(0); 8874 return -1; 8875 } 8876 
if (unicode != NULL && thousands_sep_kind != kind) { 8877 if (thousands_sep_kind < kind) 8878 PyMem_Free(thousands_sep_data); 8879 else 8880 PyMem_Free(data); 8881 } 8882 if (unicode == NULL) { 8883 *maxchar = 127; 8884 if (len != n_digits) { 8885 *maxchar = MAX_MAXCHAR(*maxchar, 8886 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 8887 } 8888 } 8889 return len; 8890} 8891 8892 8893/* helper macro to fixup start/end slice values */ 8894#define ADJUST_INDICES(start, end, len) \ 8895 if (end > len) \ 8896 end = len; \ 8897 else if (end < 0) { \ 8898 end += len; \ 8899 if (end < 0) \ 8900 end = 0; \ 8901 } \ 8902 if (start < 0) { \ 8903 start += len; \ 8904 if (start < 0) \ 8905 start = 0; \ 8906 } 8907 8908Py_ssize_t 8909PyUnicode_Count(PyObject *str, 8910 PyObject *substr, 8911 Py_ssize_t start, 8912 Py_ssize_t end) 8913{ 8914 Py_ssize_t result; 8915 PyObject* str_obj; 8916 PyObject* sub_obj; 8917 int kind1, kind2, kind; 8918 void *buf1 = NULL, *buf2 = NULL; 8919 Py_ssize_t len1, len2; 8920 8921 str_obj = PyUnicode_FromObject(str); 8922 if (!str_obj) 8923 return -1; 8924 sub_obj = PyUnicode_FromObject(substr); 8925 if (!sub_obj) { 8926 Py_DECREF(str_obj); 8927 return -1; 8928 } 8929 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 8930 Py_DECREF(sub_obj); 8931 Py_DECREF(str_obj); 8932 return -1; 8933 } 8934 8935 kind1 = PyUnicode_KIND(str_obj); 8936 kind2 = PyUnicode_KIND(sub_obj); 8937 kind = kind1; 8938 buf1 = PyUnicode_DATA(str_obj); 8939 buf2 = PyUnicode_DATA(sub_obj); 8940 if (kind2 != kind) { 8941 if (kind2 > kind) { 8942 Py_DECREF(sub_obj); 8943 Py_DECREF(str_obj); 8944 return 0; 8945 } 8946 buf2 = _PyUnicode_AsKind(sub_obj, kind); 8947 } 8948 if (!buf2) 8949 goto onError; 8950 len1 = PyUnicode_GET_LENGTH(str_obj); 8951 len2 = PyUnicode_GET_LENGTH(sub_obj); 8952 8953 ADJUST_INDICES(start, end, len1); 8954 switch (kind) { 8955 case PyUnicode_1BYTE_KIND: 8956 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 8957 result = 
asciilib_count( 8958 ((Py_UCS1*)buf1) + start, end - start, 8959 buf2, len2, PY_SSIZE_T_MAX 8960 ); 8961 else 8962 result = ucs1lib_count( 8963 ((Py_UCS1*)buf1) + start, end - start, 8964 buf2, len2, PY_SSIZE_T_MAX 8965 ); 8966 break; 8967 case PyUnicode_2BYTE_KIND: 8968 result = ucs2lib_count( 8969 ((Py_UCS2*)buf1) + start, end - start, 8970 buf2, len2, PY_SSIZE_T_MAX 8971 ); 8972 break; 8973 case PyUnicode_4BYTE_KIND: 8974 result = ucs4lib_count( 8975 ((Py_UCS4*)buf1) + start, end - start, 8976 buf2, len2, PY_SSIZE_T_MAX 8977 ); 8978 break; 8979 default: 8980 assert(0); result = 0; 8981 } 8982 8983 Py_DECREF(sub_obj); 8984 Py_DECREF(str_obj); 8985 8986 if (kind2 != kind) 8987 PyMem_Free(buf2); 8988 8989 return result; 8990 onError: 8991 Py_DECREF(sub_obj); 8992 Py_DECREF(str_obj); 8993 if (kind2 != kind && buf2) 8994 PyMem_Free(buf2); 8995 return -1; 8996} 8997 8998Py_ssize_t 8999PyUnicode_Find(PyObject *str, 9000 PyObject *sub, 9001 Py_ssize_t start, 9002 Py_ssize_t end, 9003 int direction) 9004{ 9005 Py_ssize_t result; 9006 9007 str = PyUnicode_FromObject(str); 9008 if (!str) 9009 return -2; 9010 sub = PyUnicode_FromObject(sub); 9011 if (!sub) { 9012 Py_DECREF(str); 9013 return -2; 9014 } 9015 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 9016 Py_DECREF(sub); 9017 Py_DECREF(str); 9018 return -2; 9019 } 9020 9021 result = any_find_slice(direction, 9022 str, sub, start, end 9023 ); 9024 9025 Py_DECREF(str); 9026 Py_DECREF(sub); 9027 9028 return result; 9029} 9030 9031Py_ssize_t 9032PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9033 Py_ssize_t start, Py_ssize_t end, 9034 int direction) 9035{ 9036 int kind; 9037 Py_ssize_t result; 9038 if (PyUnicode_READY(str) == -1) 9039 return -2; 9040 if (start < 0 || end < 0) { 9041 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9042 return -2; 9043 } 9044 if (end > PyUnicode_GET_LENGTH(str)) 9045 end = PyUnicode_GET_LENGTH(str); 9046 kind = PyUnicode_KIND(str); 9047 result = 
findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9048 kind, end-start, ch, direction); 9049 if (result == -1) 9050 return -1; 9051 else 9052 return start + result; 9053} 9054 9055static int 9056tailmatch(PyObject *self, 9057 PyObject *substring, 9058 Py_ssize_t start, 9059 Py_ssize_t end, 9060 int direction) 9061{ 9062 int kind_self; 9063 int kind_sub; 9064 void *data_self; 9065 void *data_sub; 9066 Py_ssize_t offset; 9067 Py_ssize_t i; 9068 Py_ssize_t end_sub; 9069 9070 if (PyUnicode_READY(self) == -1 || 9071 PyUnicode_READY(substring) == -1) 9072 return 0; 9073 9074 if (PyUnicode_GET_LENGTH(substring) == 0) 9075 return 1; 9076 9077 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9078 end -= PyUnicode_GET_LENGTH(substring); 9079 if (end < start) 9080 return 0; 9081 9082 kind_self = PyUnicode_KIND(self); 9083 data_self = PyUnicode_DATA(self); 9084 kind_sub = PyUnicode_KIND(substring); 9085 data_sub = PyUnicode_DATA(substring); 9086 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9087 9088 if (direction > 0) 9089 offset = end; 9090 else 9091 offset = start; 9092 9093 if (PyUnicode_READ(kind_self, data_self, offset) == 9094 PyUnicode_READ(kind_sub, data_sub, 0) && 9095 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9096 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9097 /* If both are of the same kind, memcmp is sufficient */ 9098 if (kind_self == kind_sub) { 9099 return ! memcmp((char *)data_self + 9100 (offset * PyUnicode_KIND(substring)), 9101 data_sub, 9102 PyUnicode_GET_LENGTH(substring) * 9103 PyUnicode_KIND(substring)); 9104 } 9105 /* otherwise we have to compare each character by first accesing it */ 9106 else { 9107 /* We do not need to compare 0 and len(substring)-1 because 9108 the if statement above ensured already that they are equal 9109 when we end up here. 
*/ 9110 // TODO: honor direction and do a forward or backwards search 9111 for (i = 1; i < end_sub; ++i) { 9112 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9113 PyUnicode_READ(kind_sub, data_sub, i)) 9114 return 0; 9115 } 9116 return 1; 9117 } 9118 } 9119 9120 return 0; 9121} 9122 9123Py_ssize_t 9124PyUnicode_Tailmatch(PyObject *str, 9125 PyObject *substr, 9126 Py_ssize_t start, 9127 Py_ssize_t end, 9128 int direction) 9129{ 9130 Py_ssize_t result; 9131 9132 str = PyUnicode_FromObject(str); 9133 if (str == NULL) 9134 return -1; 9135 substr = PyUnicode_FromObject(substr); 9136 if (substr == NULL) { 9137 Py_DECREF(str); 9138 return -1; 9139 } 9140 9141 result = tailmatch(str, substr, 9142 start, end, direction); 9143 Py_DECREF(str); 9144 Py_DECREF(substr); 9145 return result; 9146} 9147 9148/* Apply fixfct filter to the Unicode object self and return a 9149 reference to the modified object */ 9150 9151static PyObject * 9152fixup(PyObject *self, 9153 Py_UCS4 (*fixfct)(PyObject *s)) 9154{ 9155 PyObject *u; 9156 Py_UCS4 maxchar_old, maxchar_new = 0; 9157 PyObject *v; 9158 9159 u = _PyUnicode_Copy(self); 9160 if (u == NULL) 9161 return NULL; 9162 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9163 9164 /* fix functions return the new maximum character in a string, 9165 if the kind of the resulting unicode object does not change, 9166 everything is fine. Otherwise we need to change the string kind 9167 and re-run the fix function. */ 9168 maxchar_new = fixfct(u); 9169 9170 if (maxchar_new == 0) { 9171 /* no changes */; 9172 if (PyUnicode_CheckExact(self)) { 9173 Py_DECREF(u); 9174 Py_INCREF(self); 9175 return self; 9176 } 9177 else 9178 return u; 9179 } 9180 9181 maxchar_new = align_maxchar(maxchar_new); 9182 9183 if (maxchar_new == maxchar_old) 9184 return u; 9185 9186 /* In case the maximum character changed, we need to 9187 convert the string to the new category. 
*/ 9188 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9189 if (v == NULL) { 9190 Py_DECREF(u); 9191 return NULL; 9192 } 9193 if (maxchar_new > maxchar_old) { 9194 /* If the maxchar increased so that the kind changed, not all 9195 characters are representable anymore and we need to fix the 9196 string again. This only happens in very few cases. */ 9197 _PyUnicode_FastCopyCharacters(v, 0, 9198 self, 0, PyUnicode_GET_LENGTH(self)); 9199 maxchar_old = fixfct(v); 9200 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9201 } 9202 else { 9203 _PyUnicode_FastCopyCharacters(v, 0, 9204 u, 0, PyUnicode_GET_LENGTH(self)); 9205 } 9206 Py_DECREF(u); 9207 assert(_PyUnicode_CheckConsistency(v, 1)); 9208 return v; 9209} 9210 9211static PyObject * 9212ascii_upper_or_lower(PyObject *self, int lower) 9213{ 9214 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9215 char *resdata, *data = PyUnicode_DATA(self); 9216 PyObject *res; 9217 9218 res = PyUnicode_New(len, 127); 9219 if (res == NULL) 9220 return NULL; 9221 resdata = PyUnicode_DATA(res); 9222 if (lower) 9223 _Py_bytes_lower(resdata, data, len); 9224 else 9225 _Py_bytes_upper(resdata, data, len); 9226 return res; 9227} 9228 9229static Py_UCS4 9230handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9231{ 9232 Py_ssize_t j; 9233 int final_sigma; 9234 Py_UCS4 c; 9235 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9236 9237 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9238 9239 where ! is a negation and \p{xxx} is a character with property xxx. 
9240 */ 9241 for (j = i - 1; j >= 0; j--) { 9242 c = PyUnicode_READ(kind, data, j); 9243 if (!_PyUnicode_IsCaseIgnorable(c)) 9244 break; 9245 } 9246 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9247 if (final_sigma) { 9248 for (j = i + 1; j < length; j++) { 9249 c = PyUnicode_READ(kind, data, j); 9250 if (!_PyUnicode_IsCaseIgnorable(c)) 9251 break; 9252 } 9253 final_sigma = j == length || !_PyUnicode_IsCased(c); 9254 } 9255 return (final_sigma) ? 0x3C2 : 0x3C3; 9256} 9257 9258static int 9259lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9260 Py_UCS4 c, Py_UCS4 *mapped) 9261{ 9262 /* Obscure special case. */ 9263 if (c == 0x3A3) { 9264 mapped[0] = handle_capital_sigma(kind, data, length, i); 9265 return 1; 9266 } 9267 return _PyUnicode_ToLowerFull(c, mapped); 9268} 9269 9270static Py_ssize_t 9271do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9272{ 9273 Py_ssize_t i, k = 0; 9274 int n_res, j; 9275 Py_UCS4 c, mapped[3]; 9276 9277 c = PyUnicode_READ(kind, data, 0); 9278 n_res = _PyUnicode_ToUpperFull(c, mapped); 9279 for (j = 0; j < n_res; j++) { 9280 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9281 res[k++] = mapped[j]; 9282 } 9283 for (i = 1; i < length; i++) { 9284 c = PyUnicode_READ(kind, data, i); 9285 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9286 for (j = 0; j < n_res; j++) { 9287 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9288 res[k++] = mapped[j]; 9289 } 9290 } 9291 return k; 9292} 9293 9294static Py_ssize_t 9295do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9296 Py_ssize_t i, k = 0; 9297 9298 for (i = 0; i < length; i++) { 9299 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9300 int n_res, j; 9301 if (Py_UNICODE_ISUPPER(c)) { 9302 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9303 } 9304 else if (Py_UNICODE_ISLOWER(c)) { 9305 n_res = _PyUnicode_ToUpperFull(c, mapped); 9306 } 9307 else { 9308 n_res = 1; 9309 mapped[0] = c; 9310 } 
9311 for (j = 0; j < n_res; j++) { 9312 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9313 res[k++] = mapped[j]; 9314 } 9315 } 9316 return k; 9317} 9318 9319static Py_ssize_t 9320do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9321 Py_UCS4 *maxchar, int lower) 9322{ 9323 Py_ssize_t i, k = 0; 9324 9325 for (i = 0; i < length; i++) { 9326 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9327 int n_res, j; 9328 if (lower) 9329 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9330 else 9331 n_res = _PyUnicode_ToUpperFull(c, mapped); 9332 for (j = 0; j < n_res; j++) { 9333 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9334 res[k++] = mapped[j]; 9335 } 9336 } 9337 return k; 9338} 9339 9340static Py_ssize_t 9341do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9342{ 9343 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9344} 9345 9346static Py_ssize_t 9347do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9348{ 9349 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9350} 9351 9352static Py_ssize_t 9353do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9354{ 9355 Py_ssize_t i, k = 0; 9356 9357 for (i = 0; i < length; i++) { 9358 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9359 Py_UCS4 mapped[3]; 9360 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9361 for (j = 0; j < n_res; j++) { 9362 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9363 res[k++] = mapped[j]; 9364 } 9365 } 9366 return k; 9367} 9368 9369static Py_ssize_t 9370do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9371{ 9372 Py_ssize_t i, k = 0; 9373 int previous_is_cased; 9374 9375 previous_is_cased = 0; 9376 for (i = 0; i < length; i++) { 9377 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9378 Py_UCS4 mapped[3]; 9379 int n_res, j; 9380 9381 if (previous_is_cased) 9382 n_res = lower_ucs4(kind, data, length, i, c, mapped); 
9383 else 9384 n_res = _PyUnicode_ToTitleFull(c, mapped); 9385 9386 for (j = 0; j < n_res; j++) { 9387 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9388 res[k++] = mapped[j]; 9389 } 9390 9391 previous_is_cased = _PyUnicode_IsCased(c); 9392 } 9393 return k; 9394} 9395 9396static PyObject * 9397case_operation(PyObject *self, 9398 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9399{ 9400 PyObject *res = NULL; 9401 Py_ssize_t length, newlength = 0; 9402 int kind, outkind; 9403 void *data, *outdata; 9404 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9405 9406 assert(PyUnicode_IS_READY(self)); 9407 9408 kind = PyUnicode_KIND(self); 9409 data = PyUnicode_DATA(self); 9410 length = PyUnicode_GET_LENGTH(self); 9411 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9412 if (tmp == NULL) 9413 return PyErr_NoMemory(); 9414 newlength = perform(kind, data, length, tmp, &maxchar); 9415 res = PyUnicode_New(newlength, maxchar); 9416 if (res == NULL) 9417 goto leave; 9418 tmpend = tmp + newlength; 9419 outdata = PyUnicode_DATA(res); 9420 outkind = PyUnicode_KIND(res); 9421 switch (outkind) { 9422 case PyUnicode_1BYTE_KIND: 9423 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9424 break; 9425 case PyUnicode_2BYTE_KIND: 9426 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9427 break; 9428 case PyUnicode_4BYTE_KIND: 9429 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9430 break; 9431 default: 9432 assert(0); 9433 break; 9434 } 9435 leave: 9436 PyMem_FREE(tmp); 9437 return res; 9438} 9439 9440PyObject * 9441PyUnicode_Join(PyObject *separator, PyObject *seq) 9442{ 9443 PyObject *sep = NULL; 9444 Py_ssize_t seplen; 9445 PyObject *res = NULL; /* the result */ 9446 PyObject *fseq; /* PySequence_Fast(seq) */ 9447 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9448 PyObject **items; 9449 PyObject *item; 9450 Py_ssize_t sz, i, res_offset; 9451 Py_UCS4 maxchar; 9452 Py_UCS4 item_maxchar; 9453 int use_memcpy; 9454 unsigned 
char *res_data = NULL, *sep_data = NULL; 9455 PyObject *last_obj; 9456 unsigned int kind = 0; 9457 9458 fseq = PySequence_Fast(seq, ""); 9459 if (fseq == NULL) { 9460 return NULL; 9461 } 9462 9463 /* NOTE: the following code can't call back into Python code, 9464 * so we are sure that fseq won't be mutated. 9465 */ 9466 9467 seqlen = PySequence_Fast_GET_SIZE(fseq); 9468 /* If empty sequence, return u"". */ 9469 if (seqlen == 0) { 9470 Py_DECREF(fseq); 9471 Py_INCREF(unicode_empty); 9472 res = unicode_empty; 9473 return res; 9474 } 9475 9476 /* If singleton sequence with an exact Unicode, return that. */ 9477 last_obj = NULL; 9478 items = PySequence_Fast_ITEMS(fseq); 9479 if (seqlen == 1) { 9480 if (PyUnicode_CheckExact(items[0])) { 9481 res = items[0]; 9482 Py_INCREF(res); 9483 Py_DECREF(fseq); 9484 return res; 9485 } 9486 seplen = 0; 9487 maxchar = 0; 9488 } 9489 else { 9490 /* Set up sep and seplen */ 9491 if (separator == NULL) { 9492 /* fall back to a blank space separator */ 9493 sep = PyUnicode_FromOrdinal(' '); 9494 if (!sep) 9495 goto onError; 9496 seplen = 1; 9497 maxchar = 32; 9498 } 9499 else { 9500 if (!PyUnicode_Check(separator)) { 9501 PyErr_Format(PyExc_TypeError, 9502 "separator: expected str instance," 9503 " %.80s found", 9504 Py_TYPE(separator)->tp_name); 9505 goto onError; 9506 } 9507 if (PyUnicode_READY(separator)) 9508 goto onError; 9509 sep = separator; 9510 seplen = PyUnicode_GET_LENGTH(separator); 9511 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9512 /* inc refcount to keep this code path symmetric with the 9513 above case of a blank separator */ 9514 Py_INCREF(sep); 9515 } 9516 last_obj = sep; 9517 } 9518 9519 /* There are at least two things to join, or else we have a subclass 9520 * of str in the sequence. 9521 * Do a pre-pass to figure out the total amount of space we'll 9522 * need (sz), and see whether all argument are strings. 
9523 */ 9524 sz = 0; 9525#ifdef Py_DEBUG 9526 use_memcpy = 0; 9527#else 9528 use_memcpy = 1; 9529#endif 9530 for (i = 0; i < seqlen; i++) { 9531 const Py_ssize_t old_sz = sz; 9532 item = items[i]; 9533 if (!PyUnicode_Check(item)) { 9534 PyErr_Format(PyExc_TypeError, 9535 "sequence item %zd: expected str instance," 9536 " %.80s found", 9537 i, Py_TYPE(item)->tp_name); 9538 goto onError; 9539 } 9540 if (PyUnicode_READY(item) == -1) 9541 goto onError; 9542 sz += PyUnicode_GET_LENGTH(item); 9543 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9544 maxchar = MAX_MAXCHAR(maxchar, item_maxchar); 9545 if (i != 0) 9546 sz += seplen; 9547 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9548 PyErr_SetString(PyExc_OverflowError, 9549 "join() result is too long for a Python string"); 9550 goto onError; 9551 } 9552 if (use_memcpy && last_obj != NULL) { 9553 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9554 use_memcpy = 0; 9555 } 9556 last_obj = item; 9557 } 9558 9559 res = PyUnicode_New(sz, maxchar); 9560 if (res == NULL) 9561 goto onError; 9562 9563 /* Catenate everything. */ 9564#ifdef Py_DEBUG 9565 use_memcpy = 0; 9566#else 9567 if (use_memcpy) { 9568 res_data = PyUnicode_1BYTE_DATA(res); 9569 kind = PyUnicode_KIND(res); 9570 if (seplen != 0) 9571 sep_data = PyUnicode_1BYTE_DATA(sep); 9572 } 9573#endif 9574 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9575 Py_ssize_t itemlen; 9576 item = items[i]; 9577 /* Copy item, and maybe the separator. 
*/ 9578 if (i && seplen != 0) { 9579 if (use_memcpy) { 9580 Py_MEMCPY(res_data, 9581 sep_data, 9582 kind * seplen); 9583 res_data += kind * seplen; 9584 } 9585 else { 9586 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9587 res_offset += seplen; 9588 } 9589 } 9590 itemlen = PyUnicode_GET_LENGTH(item); 9591 if (itemlen != 0) { 9592 if (use_memcpy) { 9593 Py_MEMCPY(res_data, 9594 PyUnicode_DATA(item), 9595 kind * itemlen); 9596 res_data += kind * itemlen; 9597 } 9598 else { 9599 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9600 res_offset += itemlen; 9601 } 9602 } 9603 } 9604 if (use_memcpy) 9605 assert(res_data == PyUnicode_1BYTE_DATA(res) 9606 + kind * PyUnicode_GET_LENGTH(res)); 9607 else 9608 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9609 9610 Py_DECREF(fseq); 9611 Py_XDECREF(sep); 9612 assert(_PyUnicode_CheckConsistency(res, 1)); 9613 return res; 9614 9615 onError: 9616 Py_DECREF(fseq); 9617 Py_XDECREF(sep); 9618 Py_XDECREF(res); 9619 return NULL; 9620} 9621 9622#define FILL(kind, data, value, start, length) \ 9623 do { \ 9624 Py_ssize_t i_ = 0; \ 9625 assert(kind != PyUnicode_WCHAR_KIND); \ 9626 switch ((kind)) { \ 9627 case PyUnicode_1BYTE_KIND: { \ 9628 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9629 memset(to_, (unsigned char)value, (length)); \ 9630 break; \ 9631 } \ 9632 case PyUnicode_2BYTE_KIND: { \ 9633 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9634 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9635 break; \ 9636 } \ 9637 case PyUnicode_4BYTE_KIND: { \ 9638 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9639 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9640 break; \ 9641 default: assert(0); \ 9642 } \ 9643 } \ 9644 } while (0) 9645 9646void 9647_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9648 Py_UCS4 fill_char) 9649{ 9650 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9651 const void *data = PyUnicode_DATA(unicode); 9652 
assert(PyUnicode_IS_READY(unicode)); 9653 assert(unicode_modifiable(unicode)); 9654 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9655 assert(start >= 0); 9656 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9657 FILL(kind, data, fill_char, start, length); 9658} 9659 9660Py_ssize_t 9661PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9662 Py_UCS4 fill_char) 9663{ 9664 Py_ssize_t maxlen; 9665 9666 if (!PyUnicode_Check(unicode)) { 9667 PyErr_BadInternalCall(); 9668 return -1; 9669 } 9670 if (PyUnicode_READY(unicode) == -1) 9671 return -1; 9672 if (unicode_check_modifiable(unicode)) 9673 return -1; 9674 9675 if (start < 0) { 9676 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9677 return -1; 9678 } 9679 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9680 PyErr_SetString(PyExc_ValueError, 9681 "fill character is bigger than " 9682 "the string maximum character"); 9683 return -1; 9684 } 9685 9686 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9687 length = Py_MIN(maxlen, length); 9688 if (length <= 0) 9689 return 0; 9690 9691 _PyUnicode_FastFill(unicode, start, length, fill_char); 9692 return length; 9693} 9694 9695static PyObject * 9696pad(PyObject *self, 9697 Py_ssize_t left, 9698 Py_ssize_t right, 9699 Py_UCS4 fill) 9700{ 9701 PyObject *u; 9702 Py_UCS4 maxchar; 9703 int kind; 9704 void *data; 9705 9706 if (left < 0) 9707 left = 0; 9708 if (right < 0) 9709 right = 0; 9710 9711 if (left == 0 && right == 0) 9712 return unicode_result_unchanged(self); 9713 9714 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9715 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9716 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9717 return NULL; 9718 } 9719 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9720 maxchar = MAX_MAXCHAR(maxchar, fill); 9721 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9722 if (!u) 9723 return NULL; 9724 9725 kind = PyUnicode_KIND(u); 9726 
data = PyUnicode_DATA(u); 9727 if (left) 9728 FILL(kind, data, fill, 0, left); 9729 if (right) 9730 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9731 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9732 assert(_PyUnicode_CheckConsistency(u, 1)); 9733 return u; 9734} 9735 9736PyObject * 9737PyUnicode_Splitlines(PyObject *string, int keepends) 9738{ 9739 PyObject *list; 9740 9741 string = PyUnicode_FromObject(string); 9742 if (string == NULL) 9743 return NULL; 9744 if (PyUnicode_READY(string) == -1) { 9745 Py_DECREF(string); 9746 return NULL; 9747 } 9748 9749 switch (PyUnicode_KIND(string)) { 9750 case PyUnicode_1BYTE_KIND: 9751 if (PyUnicode_IS_ASCII(string)) 9752 list = asciilib_splitlines( 9753 string, PyUnicode_1BYTE_DATA(string), 9754 PyUnicode_GET_LENGTH(string), keepends); 9755 else 9756 list = ucs1lib_splitlines( 9757 string, PyUnicode_1BYTE_DATA(string), 9758 PyUnicode_GET_LENGTH(string), keepends); 9759 break; 9760 case PyUnicode_2BYTE_KIND: 9761 list = ucs2lib_splitlines( 9762 string, PyUnicode_2BYTE_DATA(string), 9763 PyUnicode_GET_LENGTH(string), keepends); 9764 break; 9765 case PyUnicode_4BYTE_KIND: 9766 list = ucs4lib_splitlines( 9767 string, PyUnicode_4BYTE_DATA(string), 9768 PyUnicode_GET_LENGTH(string), keepends); 9769 break; 9770 default: 9771 assert(0); 9772 list = 0; 9773 } 9774 Py_DECREF(string); 9775 return list; 9776} 9777 9778static PyObject * 9779split(PyObject *self, 9780 PyObject *substring, 9781 Py_ssize_t maxcount) 9782{ 9783 int kind1, kind2, kind; 9784 void *buf1, *buf2; 9785 Py_ssize_t len1, len2; 9786 PyObject* out; 9787 9788 if (maxcount < 0) 9789 maxcount = PY_SSIZE_T_MAX; 9790 9791 if (PyUnicode_READY(self) == -1) 9792 return NULL; 9793 9794 if (substring == NULL) 9795 switch (PyUnicode_KIND(self)) { 9796 case PyUnicode_1BYTE_KIND: 9797 if (PyUnicode_IS_ASCII(self)) 9798 return asciilib_split_whitespace( 9799 self, PyUnicode_1BYTE_DATA(self), 9800 PyUnicode_GET_LENGTH(self), maxcount 
9801 ); 9802 else 9803 return ucs1lib_split_whitespace( 9804 self, PyUnicode_1BYTE_DATA(self), 9805 PyUnicode_GET_LENGTH(self), maxcount 9806 ); 9807 case PyUnicode_2BYTE_KIND: 9808 return ucs2lib_split_whitespace( 9809 self, PyUnicode_2BYTE_DATA(self), 9810 PyUnicode_GET_LENGTH(self), maxcount 9811 ); 9812 case PyUnicode_4BYTE_KIND: 9813 return ucs4lib_split_whitespace( 9814 self, PyUnicode_4BYTE_DATA(self), 9815 PyUnicode_GET_LENGTH(self), maxcount 9816 ); 9817 default: 9818 assert(0); 9819 return NULL; 9820 } 9821 9822 if (PyUnicode_READY(substring) == -1) 9823 return NULL; 9824 9825 kind1 = PyUnicode_KIND(self); 9826 kind2 = PyUnicode_KIND(substring); 9827 kind = kind1 > kind2 ? kind1 : kind2; 9828 buf1 = PyUnicode_DATA(self); 9829 buf2 = PyUnicode_DATA(substring); 9830 if (kind1 != kind) 9831 buf1 = _PyUnicode_AsKind(self, kind); 9832 if (!buf1) 9833 return NULL; 9834 if (kind2 != kind) 9835 buf2 = _PyUnicode_AsKind(substring, kind); 9836 if (!buf2) { 9837 if (kind1 != kind) PyMem_Free(buf1); 9838 return NULL; 9839 } 9840 len1 = PyUnicode_GET_LENGTH(self); 9841 len2 = PyUnicode_GET_LENGTH(substring); 9842 9843 switch (kind) { 9844 case PyUnicode_1BYTE_KIND: 9845 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9846 out = asciilib_split( 9847 self, buf1, len1, buf2, len2, maxcount); 9848 else 9849 out = ucs1lib_split( 9850 self, buf1, len1, buf2, len2, maxcount); 9851 break; 9852 case PyUnicode_2BYTE_KIND: 9853 out = ucs2lib_split( 9854 self, buf1, len1, buf2, len2, maxcount); 9855 break; 9856 case PyUnicode_4BYTE_KIND: 9857 out = ucs4lib_split( 9858 self, buf1, len1, buf2, len2, maxcount); 9859 break; 9860 default: 9861 out = NULL; 9862 } 9863 if (kind1 != kind) 9864 PyMem_Free(buf1); 9865 if (kind2 != kind) 9866 PyMem_Free(buf2); 9867 return out; 9868} 9869 9870static PyObject * 9871rsplit(PyObject *self, 9872 PyObject *substring, 9873 Py_ssize_t maxcount) 9874{ 9875 int kind1, kind2, kind; 9876 void *buf1, *buf2; 9877 Py_ssize_t len1, len2; 
9878 PyObject* out; 9879 9880 if (maxcount < 0) 9881 maxcount = PY_SSIZE_T_MAX; 9882 9883 if (PyUnicode_READY(self) == -1) 9884 return NULL; 9885 9886 if (substring == NULL) 9887 switch (PyUnicode_KIND(self)) { 9888 case PyUnicode_1BYTE_KIND: 9889 if (PyUnicode_IS_ASCII(self)) 9890 return asciilib_rsplit_whitespace( 9891 self, PyUnicode_1BYTE_DATA(self), 9892 PyUnicode_GET_LENGTH(self), maxcount 9893 ); 9894 else 9895 return ucs1lib_rsplit_whitespace( 9896 self, PyUnicode_1BYTE_DATA(self), 9897 PyUnicode_GET_LENGTH(self), maxcount 9898 ); 9899 case PyUnicode_2BYTE_KIND: 9900 return ucs2lib_rsplit_whitespace( 9901 self, PyUnicode_2BYTE_DATA(self), 9902 PyUnicode_GET_LENGTH(self), maxcount 9903 ); 9904 case PyUnicode_4BYTE_KIND: 9905 return ucs4lib_rsplit_whitespace( 9906 self, PyUnicode_4BYTE_DATA(self), 9907 PyUnicode_GET_LENGTH(self), maxcount 9908 ); 9909 default: 9910 assert(0); 9911 return NULL; 9912 } 9913 9914 if (PyUnicode_READY(substring) == -1) 9915 return NULL; 9916 9917 kind1 = PyUnicode_KIND(self); 9918 kind2 = PyUnicode_KIND(substring); 9919 kind = kind1 > kind2 ? 
kind1 : kind2; 9920 buf1 = PyUnicode_DATA(self); 9921 buf2 = PyUnicode_DATA(substring); 9922 if (kind1 != kind) 9923 buf1 = _PyUnicode_AsKind(self, kind); 9924 if (!buf1) 9925 return NULL; 9926 if (kind2 != kind) 9927 buf2 = _PyUnicode_AsKind(substring, kind); 9928 if (!buf2) { 9929 if (kind1 != kind) PyMem_Free(buf1); 9930 return NULL; 9931 } 9932 len1 = PyUnicode_GET_LENGTH(self); 9933 len2 = PyUnicode_GET_LENGTH(substring); 9934 9935 switch (kind) { 9936 case PyUnicode_1BYTE_KIND: 9937 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9938 out = asciilib_rsplit( 9939 self, buf1, len1, buf2, len2, maxcount); 9940 else 9941 out = ucs1lib_rsplit( 9942 self, buf1, len1, buf2, len2, maxcount); 9943 break; 9944 case PyUnicode_2BYTE_KIND: 9945 out = ucs2lib_rsplit( 9946 self, buf1, len1, buf2, len2, maxcount); 9947 break; 9948 case PyUnicode_4BYTE_KIND: 9949 out = ucs4lib_rsplit( 9950 self, buf1, len1, buf2, len2, maxcount); 9951 break; 9952 default: 9953 out = NULL; 9954 } 9955 if (kind1 != kind) 9956 PyMem_Free(buf1); 9957 if (kind2 != kind) 9958 PyMem_Free(buf2); 9959 return out; 9960} 9961 9962static Py_ssize_t 9963anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 9964 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9965{ 9966 switch (kind) { 9967 case PyUnicode_1BYTE_KIND: 9968 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 9969 return asciilib_find(buf1, len1, buf2, len2, offset); 9970 else 9971 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9972 case PyUnicode_2BYTE_KIND: 9973 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9974 case PyUnicode_4BYTE_KIND: 9975 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9976 } 9977 assert(0); 9978 return -1; 9979} 9980 9981static Py_ssize_t 9982anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 9983 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9984{ 9985 switch (kind) { 9986 case PyUnicode_1BYTE_KIND: 9987 if 
(PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 9988 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 9989 else 9990 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 9991 case PyUnicode_2BYTE_KIND: 9992 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 9993 case PyUnicode_4BYTE_KIND: 9994 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 9995 } 9996 assert(0); 9997 return 0; 9998} 9999 10000static PyObject * 10001replace(PyObject *self, PyObject *str1, 10002 PyObject *str2, Py_ssize_t maxcount) 10003{ 10004 PyObject *u; 10005 char *sbuf = PyUnicode_DATA(self); 10006 char *buf1 = PyUnicode_DATA(str1); 10007 char *buf2 = PyUnicode_DATA(str2); 10008 int srelease = 0, release1 = 0, release2 = 0; 10009 int skind = PyUnicode_KIND(self); 10010 int kind1 = PyUnicode_KIND(str1); 10011 int kind2 = PyUnicode_KIND(str2); 10012 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10013 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10014 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10015 int mayshrink; 10016 Py_UCS4 maxchar, maxchar_str2; 10017 10018 if (maxcount < 0) 10019 maxcount = PY_SSIZE_T_MAX; 10020 else if (maxcount == 0 || slen == 0) 10021 goto nothing; 10022 10023 if (str1 == str2) 10024 goto nothing; 10025 if (skind < kind1) 10026 /* substring too wide to be present */ 10027 goto nothing; 10028 10029 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10030 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10031 /* Replacing str1 with str2 may cause a maxchar reduction in the 10032 result string. 
*/ 10033 mayshrink = (maxchar_str2 < maxchar); 10034 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2); 10035 10036 if (len1 == len2) { 10037 /* same length */ 10038 if (len1 == 0) 10039 goto nothing; 10040 if (len1 == 1) { 10041 /* replace characters */ 10042 Py_UCS4 u1, u2; 10043 int rkind; 10044 Py_ssize_t index, pos; 10045 char *src; 10046 10047 u1 = PyUnicode_READ_CHAR(str1, 0); 10048 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1); 10049 if (pos < 0) 10050 goto nothing; 10051 u2 = PyUnicode_READ_CHAR(str2, 0); 10052 u = PyUnicode_New(slen, maxchar); 10053 if (!u) 10054 goto error; 10055 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10056 rkind = PyUnicode_KIND(u); 10057 10058 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2); 10059 index = 0; 10060 src = sbuf; 10061 while (--maxcount) 10062 { 10063 pos++; 10064 src += pos * PyUnicode_KIND(self); 10065 slen -= pos; 10066 index += pos; 10067 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1); 10068 if (pos < 0) 10069 break; 10070 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2); 10071 } 10072 } 10073 else { 10074 int rkind = skind; 10075 char *res; 10076 Py_ssize_t i; 10077 10078 if (kind1 < rkind) { 10079 /* widen substring */ 10080 buf1 = _PyUnicode_AsKind(str1, rkind); 10081 if (!buf1) goto error; 10082 release1 = 1; 10083 } 10084 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10085 if (i < 0) 10086 goto nothing; 10087 if (rkind > kind2) { 10088 /* widen replacement */ 10089 buf2 = _PyUnicode_AsKind(str2, rkind); 10090 if (!buf2) goto error; 10091 release2 = 1; 10092 } 10093 else if (rkind < kind2) { 10094 /* widen self and buf1 */ 10095 rkind = kind2; 10096 if (release1) PyMem_Free(buf1); 10097 sbuf = _PyUnicode_AsKind(self, rkind); 10098 if (!sbuf) goto error; 10099 srelease = 1; 10100 buf1 = _PyUnicode_AsKind(str1, rkind); 10101 if (!buf1) goto error; 10102 release1 = 1; 10103 } 10104 u = PyUnicode_New(slen, maxchar); 10105 if (!u) 10106 goto error; 10107 
assert(PyUnicode_KIND(u) == rkind); 10108 res = PyUnicode_DATA(u); 10109 10110 memcpy(res, sbuf, rkind * slen); 10111 /* change everything in-place, starting with this one */ 10112 memcpy(res + rkind * i, 10113 buf2, 10114 rkind * len2); 10115 i += len1; 10116 10117 while ( --maxcount > 0) { 10118 i = anylib_find(rkind, self, 10119 sbuf+rkind*i, slen-i, 10120 str1, buf1, len1, i); 10121 if (i == -1) 10122 break; 10123 memcpy(res + rkind * i, 10124 buf2, 10125 rkind * len2); 10126 i += len1; 10127 } 10128 } 10129 } 10130 else { 10131 Py_ssize_t n, i, j, ires; 10132 Py_ssize_t product, new_size; 10133 int rkind = skind; 10134 char *res; 10135 10136 if (kind1 < rkind) { 10137 /* widen substring */ 10138 buf1 = _PyUnicode_AsKind(str1, rkind); 10139 if (!buf1) goto error; 10140 release1 = 1; 10141 } 10142 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10143 if (n == 0) 10144 goto nothing; 10145 if (kind2 < rkind) { 10146 /* widen replacement */ 10147 buf2 = _PyUnicode_AsKind(str2, rkind); 10148 if (!buf2) goto error; 10149 release2 = 1; 10150 } 10151 else if (kind2 > rkind) { 10152 /* widen self and buf1 */ 10153 rkind = kind2; 10154 sbuf = _PyUnicode_AsKind(self, rkind); 10155 if (!sbuf) goto error; 10156 srelease = 1; 10157 if (release1) PyMem_Free(buf1); 10158 buf1 = _PyUnicode_AsKind(str1, rkind); 10159 if (!buf1) goto error; 10160 release1 = 1; 10161 } 10162 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10163 PyUnicode_GET_LENGTH(str1))); */ 10164 product = n * (len2-len1); 10165 if ((product / (len2-len1)) != n) { 10166 PyErr_SetString(PyExc_OverflowError, 10167 "replace string is too long"); 10168 goto error; 10169 } 10170 new_size = slen + product; 10171 if (new_size == 0) { 10172 Py_INCREF(unicode_empty); 10173 u = unicode_empty; 10174 goto done; 10175 } 10176 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10177 PyErr_SetString(PyExc_OverflowError, 10178 "replace string is too long"); 10179 
goto error; 10180 } 10181 u = PyUnicode_New(new_size, maxchar); 10182 if (!u) 10183 goto error; 10184 assert(PyUnicode_KIND(u) == rkind); 10185 res = PyUnicode_DATA(u); 10186 ires = i = 0; 10187 if (len1 > 0) { 10188 while (n-- > 0) { 10189 /* look for next match */ 10190 j = anylib_find(rkind, self, 10191 sbuf + rkind * i, slen-i, 10192 str1, buf1, len1, i); 10193 if (j == -1) 10194 break; 10195 else if (j > i) { 10196 /* copy unchanged part [i:j] */ 10197 memcpy(res + rkind * ires, 10198 sbuf + rkind * i, 10199 rkind * (j-i)); 10200 ires += j - i; 10201 } 10202 /* copy substitution string */ 10203 if (len2 > 0) { 10204 memcpy(res + rkind * ires, 10205 buf2, 10206 rkind * len2); 10207 ires += len2; 10208 } 10209 i = j + len1; 10210 } 10211 if (i < slen) 10212 /* copy tail [i:] */ 10213 memcpy(res + rkind * ires, 10214 sbuf + rkind * i, 10215 rkind * (slen-i)); 10216 } 10217 else { 10218 /* interleave */ 10219 while (n > 0) { 10220 memcpy(res + rkind * ires, 10221 buf2, 10222 rkind * len2); 10223 ires += len2; 10224 if (--n <= 0) 10225 break; 10226 memcpy(res + rkind * ires, 10227 sbuf + rkind * i, 10228 rkind); 10229 ires++; 10230 i++; 10231 } 10232 memcpy(res + rkind * ires, 10233 sbuf + rkind * i, 10234 rkind * (slen-i)); 10235 } 10236 } 10237 10238 if (mayshrink) { 10239 unicode_adjust_maxchar(&u); 10240 if (u == NULL) 10241 goto error; 10242 } 10243 10244 done: 10245 if (srelease) 10246 PyMem_FREE(sbuf); 10247 if (release1) 10248 PyMem_FREE(buf1); 10249 if (release2) 10250 PyMem_FREE(buf2); 10251 assert(_PyUnicode_CheckConsistency(u, 1)); 10252 return u; 10253 10254 nothing: 10255 /* nothing to replace; return original string (when possible) */ 10256 if (srelease) 10257 PyMem_FREE(sbuf); 10258 if (release1) 10259 PyMem_FREE(buf1); 10260 if (release2) 10261 PyMem_FREE(buf2); 10262 return unicode_result_unchanged(self); 10263 10264 error: 10265 if (srelease && sbuf) 10266 PyMem_FREE(sbuf); 10267 if (release1 && buf1) 10268 PyMem_FREE(buf1); 10269 if (release2 && 
buf2) 10270 PyMem_FREE(buf2); 10271 return NULL; 10272} 10273 10274/* --- Unicode Object Methods --------------------------------------------- */ 10275 10276PyDoc_STRVAR(title__doc__, 10277 "S.title() -> str\n\ 10278\n\ 10279Return a titlecased version of S, i.e. words start with title case\n\ 10280characters, all remaining cased characters have lower case."); 10281 10282static PyObject* 10283unicode_title(PyObject *self) 10284{ 10285 if (PyUnicode_READY(self) == -1) 10286 return NULL; 10287 return case_operation(self, do_title); 10288} 10289 10290PyDoc_STRVAR(capitalize__doc__, 10291 "S.capitalize() -> str\n\ 10292\n\ 10293Return a capitalized version of S, i.e. make the first character\n\ 10294have upper case and the rest lower case."); 10295 10296static PyObject* 10297unicode_capitalize(PyObject *self) 10298{ 10299 if (PyUnicode_READY(self) == -1) 10300 return NULL; 10301 if (PyUnicode_GET_LENGTH(self) == 0) 10302 return unicode_result_unchanged(self); 10303 return case_operation(self, do_capitalize); 10304} 10305 10306PyDoc_STRVAR(casefold__doc__, 10307 "S.casefold() -> str\n\ 10308\n\ 10309Return a version of S suitable for caseless comparisons."); 10310 10311static PyObject * 10312unicode_casefold(PyObject *self) 10313{ 10314 if (PyUnicode_READY(self) == -1) 10315 return NULL; 10316 if (PyUnicode_IS_ASCII(self)) 10317 return ascii_upper_or_lower(self, 1); 10318 return case_operation(self, do_casefold); 10319} 10320 10321 10322/* Argument converter. 
Coerces to a single unicode character */ 10323 10324static int 10325convert_uc(PyObject *obj, void *addr) 10326{ 10327 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10328 PyObject *uniobj; 10329 10330 uniobj = PyUnicode_FromObject(obj); 10331 if (uniobj == NULL) { 10332 PyErr_SetString(PyExc_TypeError, 10333 "The fill character cannot be converted to Unicode"); 10334 return 0; 10335 } 10336 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10337 PyErr_SetString(PyExc_TypeError, 10338 "The fill character must be exactly one character long"); 10339 Py_DECREF(uniobj); 10340 return 0; 10341 } 10342 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10343 Py_DECREF(uniobj); 10344 return 1; 10345} 10346 10347PyDoc_STRVAR(center__doc__, 10348 "S.center(width[, fillchar]) -> str\n\ 10349\n\ 10350Return S centered in a string of length width. Padding is\n\ 10351done using the specified fill character (default is a space)"); 10352 10353static PyObject * 10354unicode_center(PyObject *self, PyObject *args) 10355{ 10356 Py_ssize_t marg, left; 10357 Py_ssize_t width; 10358 Py_UCS4 fillchar = ' '; 10359 10360 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10361 return NULL; 10362 10363 if (PyUnicode_READY(self) == -1) 10364 return NULL; 10365 10366 if (PyUnicode_GET_LENGTH(self) >= width) 10367 return unicode_result_unchanged(self); 10368 10369 marg = width - PyUnicode_GET_LENGTH(self); 10370 left = marg / 2 + (marg & width & 1); 10371 10372 return pad(self, left, marg - left, fillchar); 10373} 10374 10375/* This function assumes that str1 and str2 are readied by the caller. 
*/ 10376 10377static int 10378unicode_compare(PyObject *str1, PyObject *str2) 10379{ 10380 int kind1, kind2; 10381 void *data1, *data2; 10382 Py_ssize_t len1, len2, i; 10383 10384 kind1 = PyUnicode_KIND(str1); 10385 kind2 = PyUnicode_KIND(str2); 10386 data1 = PyUnicode_DATA(str1); 10387 data2 = PyUnicode_DATA(str2); 10388 len1 = PyUnicode_GET_LENGTH(str1); 10389 len2 = PyUnicode_GET_LENGTH(str2); 10390 10391 for (i = 0; i < len1 && i < len2; ++i) { 10392 Py_UCS4 c1, c2; 10393 c1 = PyUnicode_READ(kind1, data1, i); 10394 c2 = PyUnicode_READ(kind2, data2, i); 10395 10396 if (c1 != c2) 10397 return (c1 < c2) ? -1 : 1; 10398 } 10399 10400 return (len1 < len2) ? -1 : (len1 != len2); 10401} 10402 10403int 10404PyUnicode_Compare(PyObject *left, PyObject *right) 10405{ 10406 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10407 if (PyUnicode_READY(left) == -1 || 10408 PyUnicode_READY(right) == -1) 10409 return -1; 10410 return unicode_compare(left, right); 10411 } 10412 PyErr_Format(PyExc_TypeError, 10413 "Can't compare %.100s and %.100s", 10414 left->ob_type->tp_name, 10415 right->ob_type->tp_name); 10416 return -1; 10417} 10418 10419int 10420PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10421{ 10422 Py_ssize_t i; 10423 int kind; 10424 void *data; 10425 Py_UCS4 chr; 10426 10427 assert(_PyUnicode_CHECK(uni)); 10428 if (PyUnicode_READY(uni) == -1) 10429 return -1; 10430 kind = PyUnicode_KIND(uni); 10431 data = PyUnicode_DATA(uni); 10432 /* Compare Unicode string and source character set string */ 10433 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10434 if (chr != str[i]) 10435 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10436 /* This check keeps Python strings that end in '\0' from comparing equal 10437 to C strings identical up to that point. 
*/ 10438 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10439 return 1; /* uni is longer */ 10440 if (str[i]) 10441 return -1; /* str is longer */ 10442 return 0; 10443} 10444 10445 10446#define TEST_COND(cond) \ 10447 ((cond) ? Py_True : Py_False) 10448 10449PyObject * 10450PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10451{ 10452 int result; 10453 10454 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10455 PyObject *v; 10456 if (PyUnicode_READY(left) == -1 || 10457 PyUnicode_READY(right) == -1) 10458 return NULL; 10459 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 10460 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 10461 if (op == Py_EQ) { 10462 Py_INCREF(Py_False); 10463 return Py_False; 10464 } 10465 if (op == Py_NE) { 10466 Py_INCREF(Py_True); 10467 return Py_True; 10468 } 10469 } 10470 if (left == right) 10471 result = 0; 10472 else 10473 result = unicode_compare(left, right); 10474 10475 /* Convert the return value to a Boolean */ 10476 switch (op) { 10477 case Py_EQ: 10478 v = TEST_COND(result == 0); 10479 break; 10480 case Py_NE: 10481 v = TEST_COND(result != 0); 10482 break; 10483 case Py_LE: 10484 v = TEST_COND(result <= 0); 10485 break; 10486 case Py_GE: 10487 v = TEST_COND(result >= 0); 10488 break; 10489 case Py_LT: 10490 v = TEST_COND(result == -1); 10491 break; 10492 case Py_GT: 10493 v = TEST_COND(result == 1); 10494 break; 10495 default: 10496 PyErr_BadArgument(); 10497 return NULL; 10498 } 10499 Py_INCREF(v); 10500 return v; 10501 } 10502 10503 Py_RETURN_NOTIMPLEMENTED; 10504} 10505 10506int 10507PyUnicode_Contains(PyObject *container, PyObject *element) 10508{ 10509 PyObject *str, *sub; 10510 int kind1, kind2, kind; 10511 void *buf1, *buf2; 10512 Py_ssize_t len1, len2; 10513 int result; 10514 10515 /* Coerce the two arguments */ 10516 sub = PyUnicode_FromObject(element); 10517 if (!sub) { 10518 PyErr_Format(PyExc_TypeError, 10519 "'in <string>' requires string as left operand, not %s", 10520 
element->ob_type->tp_name); 10521 return -1; 10522 } 10523 10524 str = PyUnicode_FromObject(container); 10525 if (!str) { 10526 Py_DECREF(sub); 10527 return -1; 10528 } 10529 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 10530 Py_DECREF(sub); 10531 Py_DECREF(str); 10532 } 10533 10534 kind1 = PyUnicode_KIND(str); 10535 kind2 = PyUnicode_KIND(sub); 10536 kind = kind1; 10537 buf1 = PyUnicode_DATA(str); 10538 buf2 = PyUnicode_DATA(sub); 10539 if (kind2 != kind) { 10540 if (kind2 > kind) { 10541 Py_DECREF(sub); 10542 Py_DECREF(str); 10543 return 0; 10544 } 10545 buf2 = _PyUnicode_AsKind(sub, kind); 10546 } 10547 if (!buf2) { 10548 Py_DECREF(sub); 10549 Py_DECREF(str); 10550 return -1; 10551 } 10552 len1 = PyUnicode_GET_LENGTH(str); 10553 len2 = PyUnicode_GET_LENGTH(sub); 10554 10555 switch (kind) { 10556 case PyUnicode_1BYTE_KIND: 10557 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10558 break; 10559 case PyUnicode_2BYTE_KIND: 10560 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10561 break; 10562 case PyUnicode_4BYTE_KIND: 10563 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10564 break; 10565 default: 10566 result = -1; 10567 assert(0); 10568 } 10569 10570 Py_DECREF(str); 10571 Py_DECREF(sub); 10572 10573 if (kind2 != kind) 10574 PyMem_Free(buf2); 10575 10576 return result; 10577} 10578 10579/* Concat to string or Unicode object giving a new Unicode object. 
*/ 10580 10581PyObject * 10582PyUnicode_Concat(PyObject *left, PyObject *right) 10583{ 10584 PyObject *u = NULL, *v = NULL, *w; 10585 Py_UCS4 maxchar, maxchar2; 10586 Py_ssize_t u_len, v_len, new_len; 10587 10588 /* Coerce the two arguments */ 10589 u = PyUnicode_FromObject(left); 10590 if (u == NULL) 10591 goto onError; 10592 v = PyUnicode_FromObject(right); 10593 if (v == NULL) 10594 goto onError; 10595 10596 /* Shortcuts */ 10597 if (v == unicode_empty) { 10598 Py_DECREF(v); 10599 return u; 10600 } 10601 if (u == unicode_empty) { 10602 Py_DECREF(u); 10603 return v; 10604 } 10605 10606 u_len = PyUnicode_GET_LENGTH(u); 10607 v_len = PyUnicode_GET_LENGTH(v); 10608 if (u_len > PY_SSIZE_T_MAX - v_len) { 10609 PyErr_SetString(PyExc_OverflowError, 10610 "strings are too large to concat"); 10611 goto onError; 10612 } 10613 new_len = u_len + v_len; 10614 10615 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10616 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 10617 maxchar = MAX_MAXCHAR(maxchar, maxchar2); 10618 10619 /* Concat the two Unicode strings */ 10620 w = PyUnicode_New(new_len, maxchar); 10621 if (w == NULL) 10622 goto onError; 10623 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 10624 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 10625 Py_DECREF(u); 10626 Py_DECREF(v); 10627 assert(_PyUnicode_CheckConsistency(w, 1)); 10628 return w; 10629 10630 onError: 10631 Py_XDECREF(u); 10632 Py_XDECREF(v); 10633 return NULL; 10634} 10635 10636void 10637PyUnicode_Append(PyObject **p_left, PyObject *right) 10638{ 10639 PyObject *left, *res; 10640 Py_UCS4 maxchar, maxchar2; 10641 Py_ssize_t left_len, right_len, new_len; 10642 10643 if (p_left == NULL) { 10644 if (!PyErr_Occurred()) 10645 PyErr_BadInternalCall(); 10646 return; 10647 } 10648 left = *p_left; 10649 if (right == NULL || !PyUnicode_Check(left)) { 10650 if (!PyErr_Occurred()) 10651 PyErr_BadInternalCall(); 10652 goto error; 10653 } 10654 10655 if (PyUnicode_READY(left) == -1) 10656 goto error; 10657 if 
(PyUnicode_READY(right) == -1) 10658 goto error; 10659 10660 /* Shortcuts */ 10661 if (left == unicode_empty) { 10662 Py_DECREF(left); 10663 Py_INCREF(right); 10664 *p_left = right; 10665 return; 10666 } 10667 if (right == unicode_empty) 10668 return; 10669 10670 left_len = PyUnicode_GET_LENGTH(left); 10671 right_len = PyUnicode_GET_LENGTH(right); 10672 if (left_len > PY_SSIZE_T_MAX - right_len) { 10673 PyErr_SetString(PyExc_OverflowError, 10674 "strings are too large to concat"); 10675 goto error; 10676 } 10677 new_len = left_len + right_len; 10678 10679 if (unicode_modifiable(left) 10680 && PyUnicode_CheckExact(right) 10681 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 10682 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10683 to change the structure size, but characters are stored just after 10684 the structure, and so it requires to move all characters which is 10685 not so different than duplicating the string. */ 10686 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10687 { 10688 /* append inplace */ 10689 if (unicode_resize(p_left, new_len) != 0) { 10690 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10691 * deallocated so it cannot be put back into 10692 * 'variable'. The MemoryError is raised when there 10693 * is no value in 'variable', which might (very 10694 * remotely) be a cause of incompatibilities. 
10695 */ 10696 goto error; 10697 } 10698 /* copy 'right' into the newly allocated area of 'left' */ 10699 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 10700 } 10701 else { 10702 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 10703 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 10704 maxchar = MAX_MAXCHAR(maxchar, maxchar2); 10705 10706 /* Concat the two Unicode strings */ 10707 res = PyUnicode_New(new_len, maxchar); 10708 if (res == NULL) 10709 goto error; 10710 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 10711 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 10712 Py_DECREF(left); 10713 *p_left = res; 10714 } 10715 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 10716 return; 10717 10718error: 10719 Py_CLEAR(*p_left); 10720} 10721 10722void 10723PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10724{ 10725 PyUnicode_Append(pleft, right); 10726 Py_XDECREF(right); 10727} 10728 10729PyDoc_STRVAR(count__doc__, 10730 "S.count(sub[, start[, end]]) -> int\n\ 10731\n\ 10732Return the number of non-overlapping occurrences of substring sub in\n\ 10733string S[start:end]. 
Optional arguments start and end are\n\ 10734interpreted as in slice notation."); 10735 10736static PyObject * 10737unicode_count(PyObject *self, PyObject *args) 10738{ 10739 PyObject *substring; 10740 Py_ssize_t start = 0; 10741 Py_ssize_t end = PY_SSIZE_T_MAX; 10742 PyObject *result; 10743 int kind1, kind2, kind; 10744 void *buf1, *buf2; 10745 Py_ssize_t len1, len2, iresult; 10746 10747 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10748 &start, &end)) 10749 return NULL; 10750 10751 kind1 = PyUnicode_KIND(self); 10752 kind2 = PyUnicode_KIND(substring); 10753 if (kind2 > kind1) 10754 return PyLong_FromLong(0); 10755 kind = kind1; 10756 buf1 = PyUnicode_DATA(self); 10757 buf2 = PyUnicode_DATA(substring); 10758 if (kind2 != kind) 10759 buf2 = _PyUnicode_AsKind(substring, kind); 10760 if (!buf2) { 10761 Py_DECREF(substring); 10762 return NULL; 10763 } 10764 len1 = PyUnicode_GET_LENGTH(self); 10765 len2 = PyUnicode_GET_LENGTH(substring); 10766 10767 ADJUST_INDICES(start, end, len1); 10768 switch (kind) { 10769 case PyUnicode_1BYTE_KIND: 10770 iresult = ucs1lib_count( 10771 ((Py_UCS1*)buf1) + start, end - start, 10772 buf2, len2, PY_SSIZE_T_MAX 10773 ); 10774 break; 10775 case PyUnicode_2BYTE_KIND: 10776 iresult = ucs2lib_count( 10777 ((Py_UCS2*)buf1) + start, end - start, 10778 buf2, len2, PY_SSIZE_T_MAX 10779 ); 10780 break; 10781 case PyUnicode_4BYTE_KIND: 10782 iresult = ucs4lib_count( 10783 ((Py_UCS4*)buf1) + start, end - start, 10784 buf2, len2, PY_SSIZE_T_MAX 10785 ); 10786 break; 10787 default: 10788 assert(0); iresult = 0; 10789 } 10790 10791 result = PyLong_FromSsize_t(iresult); 10792 10793 if (kind2 != kind) 10794 PyMem_Free(buf2); 10795 10796 Py_DECREF(substring); 10797 10798 return result; 10799} 10800 10801PyDoc_STRVAR(encode__doc__, 10802 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10803\n\ 10804Encode S using the codec registered for encoding. Default encoding\n\ 10805is 'utf-8'. 
errors may be given to set a different error\n\ 10806handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10807a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10808'xmlcharrefreplace' as well as any other name registered with\n\ 10809codecs.register_error that can handle UnicodeEncodeErrors."); 10810 10811static PyObject * 10812unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 10813{ 10814 static char *kwlist[] = {"encoding", "errors", 0}; 10815 char *encoding = NULL; 10816 char *errors = NULL; 10817 10818 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10819 kwlist, &encoding, &errors)) 10820 return NULL; 10821 return PyUnicode_AsEncodedString(self, encoding, errors); 10822} 10823 10824PyDoc_STRVAR(expandtabs__doc__, 10825 "S.expandtabs([tabsize]) -> str\n\ 10826\n\ 10827Return a copy of S where all tab characters are expanded using spaces.\n\ 10828If tabsize is not given, a tab size of 8 characters is assumed."); 10829 10830static PyObject* 10831unicode_expandtabs(PyObject *self, PyObject *args) 10832{ 10833 Py_ssize_t i, j, line_pos, src_len, incr; 10834 Py_UCS4 ch; 10835 PyObject *u; 10836 void *src_data, *dest_data; 10837 int tabsize = 8; 10838 int kind; 10839 int found; 10840 10841 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10842 return NULL; 10843 10844 if (PyUnicode_READY(self) == -1) 10845 return NULL; 10846 10847 /* First pass: determine size of output string */ 10848 src_len = PyUnicode_GET_LENGTH(self); 10849 i = j = line_pos = 0; 10850 kind = PyUnicode_KIND(self); 10851 src_data = PyUnicode_DATA(self); 10852 found = 0; 10853 for (; i < src_len; i++) { 10854 ch = PyUnicode_READ(kind, src_data, i); 10855 if (ch == '\t') { 10856 found = 1; 10857 if (tabsize > 0) { 10858 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10859 if (j > PY_SSIZE_T_MAX - incr) 10860 goto overflow; 10861 line_pos += incr; 10862 j += incr; 10863 } 10864 } 10865 else { 10866 if (j 
> PY_SSIZE_T_MAX - 1) 10867 goto overflow; 10868 line_pos++; 10869 j++; 10870 if (ch == '\n' || ch == '\r') 10871 line_pos = 0; 10872 } 10873 } 10874 if (!found) 10875 return unicode_result_unchanged(self); 10876 10877 /* Second pass: create output string and fill it */ 10878 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10879 if (!u) 10880 return NULL; 10881 dest_data = PyUnicode_DATA(u); 10882 10883 i = j = line_pos = 0; 10884 10885 for (; i < src_len; i++) { 10886 ch = PyUnicode_READ(kind, src_data, i); 10887 if (ch == '\t') { 10888 if (tabsize > 0) { 10889 incr = tabsize - (line_pos % tabsize); 10890 line_pos += incr; 10891 FILL(kind, dest_data, ' ', j, incr); 10892 j += incr; 10893 } 10894 } 10895 else { 10896 line_pos++; 10897 PyUnicode_WRITE(kind, dest_data, j, ch); 10898 j++; 10899 if (ch == '\n' || ch == '\r') 10900 line_pos = 0; 10901 } 10902 } 10903 assert (j == PyUnicode_GET_LENGTH(u)); 10904 return unicode_result(u); 10905 10906 overflow: 10907 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10908 return NULL; 10909} 10910 10911PyDoc_STRVAR(find__doc__, 10912 "S.find(sub[, start[, end]]) -> int\n\ 10913\n\ 10914Return the lowest index in S where substring sub is found,\n\ 10915such that sub is contained within S[start:end]. 
Optional\n\ 10916arguments start and end are interpreted as in slice notation.\n\ 10917\n\ 10918Return -1 on failure."); 10919 10920static PyObject * 10921unicode_find(PyObject *self, PyObject *args) 10922{ 10923 PyObject *substring; 10924 Py_ssize_t start; 10925 Py_ssize_t end; 10926 Py_ssize_t result; 10927 10928 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10929 &start, &end)) 10930 return NULL; 10931 10932 if (PyUnicode_READY(self) == -1) 10933 return NULL; 10934 if (PyUnicode_READY(substring) == -1) 10935 return NULL; 10936 10937 result = any_find_slice(1, self, substring, start, end); 10938 10939 Py_DECREF(substring); 10940 10941 if (result == -2) 10942 return NULL; 10943 10944 return PyLong_FromSsize_t(result); 10945} 10946 10947static PyObject * 10948unicode_getitem(PyObject *self, Py_ssize_t index) 10949{ 10950 void *data; 10951 enum PyUnicode_Kind kind; 10952 Py_UCS4 ch; 10953 PyObject *res; 10954 10955 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 10956 PyErr_BadArgument(); 10957 return NULL; 10958 } 10959 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 10960 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10961 return NULL; 10962 } 10963 kind = PyUnicode_KIND(self); 10964 data = PyUnicode_DATA(self); 10965 ch = PyUnicode_READ(kind, data, index); 10966 if (ch < 256) 10967 return get_latin1_char(ch); 10968 10969 res = PyUnicode_New(1, ch); 10970 if (res == NULL) 10971 return NULL; 10972 kind = PyUnicode_KIND(res); 10973 data = PyUnicode_DATA(res); 10974 PyUnicode_WRITE(kind, data, 0, ch); 10975 assert(_PyUnicode_CheckConsistency(res, 1)); 10976 return res; 10977} 10978 10979/* Believe it or not, this produces the same value for ASCII strings 10980 as bytes_hash(). 
*/ 10981static Py_hash_t 10982unicode_hash(PyObject *self) 10983{ 10984 Py_ssize_t len; 10985 Py_uhash_t x; 10986 10987#ifdef Py_DEBUG 10988 assert(_Py_HashSecret_Initialized); 10989#endif 10990 if (_PyUnicode_HASH(self) != -1) 10991 return _PyUnicode_HASH(self); 10992 if (PyUnicode_READY(self) == -1) 10993 return -1; 10994 len = PyUnicode_GET_LENGTH(self); 10995 /* 10996 We make the hash of the empty string be 0, rather than using 10997 (prefix ^ suffix), since this slightly obfuscates the hash secret 10998 */ 10999 if (len == 0) { 11000 _PyUnicode_HASH(self) = 0; 11001 return 0; 11002 } 11003 11004 /* The hash function as a macro, gets expanded three times below. */ 11005#define HASH(P) \ 11006 x ^= (Py_uhash_t) *P << 7; \ 11007 while (--len >= 0) \ 11008 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \ 11009 11010 x = (Py_uhash_t) _Py_HashSecret.prefix; 11011 switch (PyUnicode_KIND(self)) { 11012 case PyUnicode_1BYTE_KIND: { 11013 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 11014 HASH(c); 11015 break; 11016 } 11017 case PyUnicode_2BYTE_KIND: { 11018 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 11019 HASH(s); 11020 break; 11021 } 11022 default: { 11023 Py_UCS4 *l; 11024 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 11025 "Impossible switch case in unicode_hash"); 11026 l = PyUnicode_4BYTE_DATA(self); 11027 HASH(l); 11028 break; 11029 } 11030 } 11031 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self); 11032 x ^= (Py_uhash_t) _Py_HashSecret.suffix; 11033 11034 if (x == -1) 11035 x = -2; 11036 _PyUnicode_HASH(self) = x; 11037 return x; 11038} 11039#undef HASH 11040 11041PyDoc_STRVAR(index__doc__, 11042 "S.index(sub[, start[, end]]) -> int\n\ 11043\n\ 11044Like S.find() but raise ValueError when the substring is not found."); 11045 11046static PyObject * 11047unicode_index(PyObject *self, PyObject *args) 11048{ 11049 Py_ssize_t result; 11050 PyObject *substring; 11051 Py_ssize_t start; 11052 Py_ssize_t end; 11053 11054 if 
(!stringlib_parse_args_finds_unicode("index", args, &substring, 11055 &start, &end)) 11056 return NULL; 11057 11058 if (PyUnicode_READY(self) == -1) 11059 return NULL; 11060 if (PyUnicode_READY(substring) == -1) 11061 return NULL; 11062 11063 result = any_find_slice(1, self, substring, start, end); 11064 11065 Py_DECREF(substring); 11066 11067 if (result == -2) 11068 return NULL; 11069 11070 if (result < 0) { 11071 PyErr_SetString(PyExc_ValueError, "substring not found"); 11072 return NULL; 11073 } 11074 11075 return PyLong_FromSsize_t(result); 11076} 11077 11078PyDoc_STRVAR(islower__doc__, 11079 "S.islower() -> bool\n\ 11080\n\ 11081Return True if all cased characters in S are lowercase and there is\n\ 11082at least one cased character in S, False otherwise."); 11083 11084static PyObject* 11085unicode_islower(PyObject *self) 11086{ 11087 Py_ssize_t i, length; 11088 int kind; 11089 void *data; 11090 int cased; 11091 11092 if (PyUnicode_READY(self) == -1) 11093 return NULL; 11094 length = PyUnicode_GET_LENGTH(self); 11095 kind = PyUnicode_KIND(self); 11096 data = PyUnicode_DATA(self); 11097 11098 /* Shortcut for single character strings */ 11099 if (length == 1) 11100 return PyBool_FromLong( 11101 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11102 11103 /* Special case for empty strings */ 11104 if (length == 0) 11105 return PyBool_FromLong(0); 11106 11107 cased = 0; 11108 for (i = 0; i < length; i++) { 11109 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11110 11111 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11112 return PyBool_FromLong(0); 11113 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11114 cased = 1; 11115 } 11116 return PyBool_FromLong(cased); 11117} 11118 11119PyDoc_STRVAR(isupper__doc__, 11120 "S.isupper() -> bool\n\ 11121\n\ 11122Return True if all cased characters in S are uppercase and there is\n\ 11123at least one cased character in S, False otherwise."); 11124 11125static PyObject* 11126unicode_isupper(PyObject *self) 11127{ 11128 
Py_ssize_t i, length; 11129 int kind; 11130 void *data; 11131 int cased; 11132 11133 if (PyUnicode_READY(self) == -1) 11134 return NULL; 11135 length = PyUnicode_GET_LENGTH(self); 11136 kind = PyUnicode_KIND(self); 11137 data = PyUnicode_DATA(self); 11138 11139 /* Shortcut for single character strings */ 11140 if (length == 1) 11141 return PyBool_FromLong( 11142 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11143 11144 /* Special case for empty strings */ 11145 if (length == 0) 11146 return PyBool_FromLong(0); 11147 11148 cased = 0; 11149 for (i = 0; i < length; i++) { 11150 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11151 11152 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11153 return PyBool_FromLong(0); 11154 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11155 cased = 1; 11156 } 11157 return PyBool_FromLong(cased); 11158} 11159 11160PyDoc_STRVAR(istitle__doc__, 11161 "S.istitle() -> bool\n\ 11162\n\ 11163Return True if S is a titlecased string and there is at least one\n\ 11164character in S, i.e. 
upper- and titlecase characters may only\n\ 11165follow uncased characters and lowercase characters only cased ones.\n\ 11166Return False otherwise."); 11167 11168static PyObject* 11169unicode_istitle(PyObject *self) 11170{ 11171 Py_ssize_t i, length; 11172 int kind; 11173 void *data; 11174 int cased, previous_is_cased; 11175 11176 if (PyUnicode_READY(self) == -1) 11177 return NULL; 11178 length = PyUnicode_GET_LENGTH(self); 11179 kind = PyUnicode_KIND(self); 11180 data = PyUnicode_DATA(self); 11181 11182 /* Shortcut for single character strings */ 11183 if (length == 1) { 11184 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11185 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11186 (Py_UNICODE_ISUPPER(ch) != 0)); 11187 } 11188 11189 /* Special case for empty strings */ 11190 if (length == 0) 11191 return PyBool_FromLong(0); 11192 11193 cased = 0; 11194 previous_is_cased = 0; 11195 for (i = 0; i < length; i++) { 11196 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11197 11198 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11199 if (previous_is_cased) 11200 return PyBool_FromLong(0); 11201 previous_is_cased = 1; 11202 cased = 1; 11203 } 11204 else if (Py_UNICODE_ISLOWER(ch)) { 11205 if (!previous_is_cased) 11206 return PyBool_FromLong(0); 11207 previous_is_cased = 1; 11208 cased = 1; 11209 } 11210 else 11211 previous_is_cased = 0; 11212 } 11213 return PyBool_FromLong(cased); 11214} 11215 11216PyDoc_STRVAR(isspace__doc__, 11217 "S.isspace() -> bool\n\ 11218\n\ 11219Return True if all characters in S are whitespace\n\ 11220and there is at least one character in S, False otherwise."); 11221 11222static PyObject* 11223unicode_isspace(PyObject *self) 11224{ 11225 Py_ssize_t i, length; 11226 int kind; 11227 void *data; 11228 11229 if (PyUnicode_READY(self) == -1) 11230 return NULL; 11231 length = PyUnicode_GET_LENGTH(self); 11232 kind = PyUnicode_KIND(self); 11233 data = PyUnicode_DATA(self); 11234 11235 /* Shortcut for single character strings */ 11236 
if (length == 1) 11237 return PyBool_FromLong( 11238 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11239 11240 /* Special case for empty strings */ 11241 if (length == 0) 11242 return PyBool_FromLong(0); 11243 11244 for (i = 0; i < length; i++) { 11245 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11246 if (!Py_UNICODE_ISSPACE(ch)) 11247 return PyBool_FromLong(0); 11248 } 11249 return PyBool_FromLong(1); 11250} 11251 11252PyDoc_STRVAR(isalpha__doc__, 11253 "S.isalpha() -> bool\n\ 11254\n\ 11255Return True if all characters in S are alphabetic\n\ 11256and there is at least one character in S, False otherwise."); 11257 11258static PyObject* 11259unicode_isalpha(PyObject *self) 11260{ 11261 Py_ssize_t i, length; 11262 int kind; 11263 void *data; 11264 11265 if (PyUnicode_READY(self) == -1) 11266 return NULL; 11267 length = PyUnicode_GET_LENGTH(self); 11268 kind = PyUnicode_KIND(self); 11269 data = PyUnicode_DATA(self); 11270 11271 /* Shortcut for single character strings */ 11272 if (length == 1) 11273 return PyBool_FromLong( 11274 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11275 11276 /* Special case for empty strings */ 11277 if (length == 0) 11278 return PyBool_FromLong(0); 11279 11280 for (i = 0; i < length; i++) { 11281 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11282 return PyBool_FromLong(0); 11283 } 11284 return PyBool_FromLong(1); 11285} 11286 11287PyDoc_STRVAR(isalnum__doc__, 11288 "S.isalnum() -> bool\n\ 11289\n\ 11290Return True if all characters in S are alphanumeric\n\ 11291and there is at least one character in S, False otherwise."); 11292 11293static PyObject* 11294unicode_isalnum(PyObject *self) 11295{ 11296 int kind; 11297 void *data; 11298 Py_ssize_t len, i; 11299 11300 if (PyUnicode_READY(self) == -1) 11301 return NULL; 11302 11303 kind = PyUnicode_KIND(self); 11304 data = PyUnicode_DATA(self); 11305 len = PyUnicode_GET_LENGTH(self); 11306 11307 /* Shortcut for single character strings */ 11308 if (len == 1) { 11309 
const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11310 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11311 } 11312 11313 /* Special case for empty strings */ 11314 if (len == 0) 11315 return PyBool_FromLong(0); 11316 11317 for (i = 0; i < len; i++) { 11318 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11319 if (!Py_UNICODE_ISALNUM(ch)) 11320 return PyBool_FromLong(0); 11321 } 11322 return PyBool_FromLong(1); 11323} 11324 11325PyDoc_STRVAR(isdecimal__doc__, 11326 "S.isdecimal() -> bool\n\ 11327\n\ 11328Return True if there are only decimal characters in S,\n\ 11329False otherwise."); 11330 11331static PyObject* 11332unicode_isdecimal(PyObject *self) 11333{ 11334 Py_ssize_t i, length; 11335 int kind; 11336 void *data; 11337 11338 if (PyUnicode_READY(self) == -1) 11339 return NULL; 11340 length = PyUnicode_GET_LENGTH(self); 11341 kind = PyUnicode_KIND(self); 11342 data = PyUnicode_DATA(self); 11343 11344 /* Shortcut for single character strings */ 11345 if (length == 1) 11346 return PyBool_FromLong( 11347 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11348 11349 /* Special case for empty strings */ 11350 if (length == 0) 11351 return PyBool_FromLong(0); 11352 11353 for (i = 0; i < length; i++) { 11354 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11355 return PyBool_FromLong(0); 11356 } 11357 return PyBool_FromLong(1); 11358} 11359 11360PyDoc_STRVAR(isdigit__doc__, 11361 "S.isdigit() -> bool\n\ 11362\n\ 11363Return True if all characters in S are digits\n\ 11364and there is at least one character in S, False otherwise."); 11365 11366static PyObject* 11367unicode_isdigit(PyObject *self) 11368{ 11369 Py_ssize_t i, length; 11370 int kind; 11371 void *data; 11372 11373 if (PyUnicode_READY(self) == -1) 11374 return NULL; 11375 length = PyUnicode_GET_LENGTH(self); 11376 kind = PyUnicode_KIND(self); 11377 data = PyUnicode_DATA(self); 11378 11379 /* Shortcut for single character strings */ 11380 if (length == 1) { 11381 const Py_UCS4 ch = 
PyUnicode_READ(kind, data, 0); 11382 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11383 } 11384 11385 /* Special case for empty strings */ 11386 if (length == 0) 11387 return PyBool_FromLong(0); 11388 11389 for (i = 0; i < length; i++) { 11390 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11391 return PyBool_FromLong(0); 11392 } 11393 return PyBool_FromLong(1); 11394} 11395 11396PyDoc_STRVAR(isnumeric__doc__, 11397 "S.isnumeric() -> bool\n\ 11398\n\ 11399Return True if there are only numeric characters in S,\n\ 11400False otherwise."); 11401 11402static PyObject* 11403unicode_isnumeric(PyObject *self) 11404{ 11405 Py_ssize_t i, length; 11406 int kind; 11407 void *data; 11408 11409 if (PyUnicode_READY(self) == -1) 11410 return NULL; 11411 length = PyUnicode_GET_LENGTH(self); 11412 kind = PyUnicode_KIND(self); 11413 data = PyUnicode_DATA(self); 11414 11415 /* Shortcut for single character strings */ 11416 if (length == 1) 11417 return PyBool_FromLong( 11418 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11419 11420 /* Special case for empty strings */ 11421 if (length == 0) 11422 return PyBool_FromLong(0); 11423 11424 for (i = 0; i < length; i++) { 11425 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11426 return PyBool_FromLong(0); 11427 } 11428 return PyBool_FromLong(1); 11429} 11430 11431int 11432PyUnicode_IsIdentifier(PyObject *self) 11433{ 11434 int kind; 11435 void *data; 11436 Py_ssize_t i; 11437 Py_UCS4 first; 11438 11439 if (PyUnicode_READY(self) == -1) { 11440 Py_FatalError("identifier not ready"); 11441 return 0; 11442 } 11443 11444 /* Special case for empty strings */ 11445 if (PyUnicode_GET_LENGTH(self) == 0) 11446 return 0; 11447 kind = PyUnicode_KIND(self); 11448 data = PyUnicode_DATA(self); 11449 11450 /* PEP 3131 says that the first character must be in 11451 XID_Start and subsequent characters in XID_Continue, 11452 and for the ASCII range, the 2.x rules apply (i.e 11453 start with letters and underscore, continue with 
11454 letters, digits, underscore). However, given the current 11455 definition of XID_Start and XID_Continue, it is sufficient 11456 to check just for these, except that _ must be allowed 11457 as starting an identifier. */ 11458 first = PyUnicode_READ(kind, data, 0); 11459 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11460 return 0; 11461 11462 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11463 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11464 return 0; 11465 return 1; 11466} 11467 11468PyDoc_STRVAR(isidentifier__doc__, 11469 "S.isidentifier() -> bool\n\ 11470\n\ 11471Return True if S is a valid identifier according\n\ 11472to the language definition."); 11473 11474static PyObject* 11475unicode_isidentifier(PyObject *self) 11476{ 11477 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11478} 11479 11480PyDoc_STRVAR(isprintable__doc__, 11481 "S.isprintable() -> bool\n\ 11482\n\ 11483Return True if all characters in S are considered\n\ 11484printable in repr() or S is empty, False otherwise."); 11485 11486static PyObject* 11487unicode_isprintable(PyObject *self) 11488{ 11489 Py_ssize_t i, length; 11490 int kind; 11491 void *data; 11492 11493 if (PyUnicode_READY(self) == -1) 11494 return NULL; 11495 length = PyUnicode_GET_LENGTH(self); 11496 kind = PyUnicode_KIND(self); 11497 data = PyUnicode_DATA(self); 11498 11499 /* Shortcut for single character strings */ 11500 if (length == 1) 11501 return PyBool_FromLong( 11502 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11503 11504 for (i = 0; i < length; i++) { 11505 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11506 Py_RETURN_FALSE; 11507 } 11508 } 11509 Py_RETURN_TRUE; 11510} 11511 11512PyDoc_STRVAR(join__doc__, 11513 "S.join(iterable) -> str\n\ 11514\n\ 11515Return a string which is the concatenation of the strings in the\n\ 11516iterable. 
The separator between elements is S."); 11517 11518static PyObject* 11519unicode_join(PyObject *self, PyObject *data) 11520{ 11521 return PyUnicode_Join(self, data); 11522} 11523 11524static Py_ssize_t 11525unicode_length(PyObject *self) 11526{ 11527 if (PyUnicode_READY(self) == -1) 11528 return -1; 11529 return PyUnicode_GET_LENGTH(self); 11530} 11531 11532PyDoc_STRVAR(ljust__doc__, 11533 "S.ljust(width[, fillchar]) -> str\n\ 11534\n\ 11535Return S left-justified in a Unicode string of length width. Padding is\n\ 11536done using the specified fill character (default is a space)."); 11537 11538static PyObject * 11539unicode_ljust(PyObject *self, PyObject *args) 11540{ 11541 Py_ssize_t width; 11542 Py_UCS4 fillchar = ' '; 11543 11544 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11545 return NULL; 11546 11547 if (PyUnicode_READY(self) == -1) 11548 return NULL; 11549 11550 if (PyUnicode_GET_LENGTH(self) >= width) 11551 return unicode_result_unchanged(self); 11552 11553 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11554} 11555 11556PyDoc_STRVAR(lower__doc__, 11557 "S.lower() -> str\n\ 11558\n\ 11559Return a copy of the string S converted to lowercase."); 11560 11561static PyObject* 11562unicode_lower(PyObject *self) 11563{ 11564 if (PyUnicode_READY(self) == -1) 11565 return NULL; 11566 if (PyUnicode_IS_ASCII(self)) 11567 return ascii_upper_or_lower(self, 1); 11568 return case_operation(self, do_lower); 11569} 11570 11571#define LEFTSTRIP 0 11572#define RIGHTSTRIP 1 11573#define BOTHSTRIP 2 11574 11575/* Arrays indexed by above */ 11576static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11577 11578#define STRIPNAME(i) (stripformat[i]+3) 11579 11580/* externally visible for str.strip(unicode) */ 11581PyObject * 11582_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11583{ 11584 void *data; 11585 int kind; 11586 Py_ssize_t i, j, len; 11587 BLOOM_MASK sepmask; 11588 11589 if 
(PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11590 return NULL; 11591 11592 kind = PyUnicode_KIND(self); 11593 data = PyUnicode_DATA(self); 11594 len = PyUnicode_GET_LENGTH(self); 11595 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11596 PyUnicode_DATA(sepobj), 11597 PyUnicode_GET_LENGTH(sepobj)); 11598 11599 i = 0; 11600 if (striptype != RIGHTSTRIP) { 11601 while (i < len && 11602 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11603 i++; 11604 } 11605 } 11606 11607 j = len; 11608 if (striptype != LEFTSTRIP) { 11609 do { 11610 j--; 11611 } while (j >= i && 11612 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11613 j++; 11614 } 11615 11616 return PyUnicode_Substring(self, i, j); 11617} 11618 11619PyObject* 11620PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11621{ 11622 unsigned char *data; 11623 int kind; 11624 Py_ssize_t length; 11625 11626 if (PyUnicode_READY(self) == -1) 11627 return NULL; 11628 11629 length = PyUnicode_GET_LENGTH(self); 11630 end = Py_MIN(end, length); 11631 11632 if (start == 0 && end == length) 11633 return unicode_result_unchanged(self); 11634 11635 if (start < 0 || end < 0) { 11636 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11637 return NULL; 11638 } 11639 if (start >= length || end < start) { 11640 Py_INCREF(unicode_empty); 11641 return unicode_empty; 11642 } 11643 11644 length = end - start; 11645 if (PyUnicode_IS_ASCII(self)) { 11646 data = PyUnicode_1BYTE_DATA(self); 11647 return _PyUnicode_FromASCII((char*)(data + start), length); 11648 } 11649 else { 11650 kind = PyUnicode_KIND(self); 11651 data = PyUnicode_1BYTE_DATA(self); 11652 return PyUnicode_FromKindAndData(kind, 11653 data + kind * start, 11654 length); 11655 } 11656} 11657 11658static PyObject * 11659do_strip(PyObject *self, int striptype) 11660{ 11661 int kind; 11662 void *data; 11663 Py_ssize_t len, i, j; 11664 11665 if (PyUnicode_READY(self) == -1) 11666 return NULL; 11667 
11668 kind = PyUnicode_KIND(self); 11669 data = PyUnicode_DATA(self); 11670 len = PyUnicode_GET_LENGTH(self); 11671 11672 i = 0; 11673 if (striptype != RIGHTSTRIP) { 11674 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11675 i++; 11676 } 11677 } 11678 11679 j = len; 11680 if (striptype != LEFTSTRIP) { 11681 do { 11682 j--; 11683 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11684 j++; 11685 } 11686 11687 return PyUnicode_Substring(self, i, j); 11688} 11689 11690 11691static PyObject * 11692do_argstrip(PyObject *self, int striptype, PyObject *args) 11693{ 11694 PyObject *sep = NULL; 11695 11696 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11697 return NULL; 11698 11699 if (sep != NULL && sep != Py_None) { 11700 if (PyUnicode_Check(sep)) 11701 return _PyUnicode_XStrip(self, striptype, sep); 11702 else { 11703 PyErr_Format(PyExc_TypeError, 11704 "%s arg must be None or str", 11705 STRIPNAME(striptype)); 11706 return NULL; 11707 } 11708 } 11709 11710 return do_strip(self, striptype); 11711} 11712 11713 11714PyDoc_STRVAR(strip__doc__, 11715 "S.strip([chars]) -> str\n\ 11716\n\ 11717Return a copy of the string S with leading and trailing\n\ 11718whitespace removed.\n\ 11719If chars is given and not None, remove characters in chars instead."); 11720 11721static PyObject * 11722unicode_strip(PyObject *self, PyObject *args) 11723{ 11724 if (PyTuple_GET_SIZE(args) == 0) 11725 return do_strip(self, BOTHSTRIP); /* Common case */ 11726 else 11727 return do_argstrip(self, BOTHSTRIP, args); 11728} 11729 11730 11731PyDoc_STRVAR(lstrip__doc__, 11732 "S.lstrip([chars]) -> str\n\ 11733\n\ 11734Return a copy of the string S with leading whitespace removed.\n\ 11735If chars is given and not None, remove characters in chars instead."); 11736 11737static PyObject * 11738unicode_lstrip(PyObject *self, PyObject *args) 11739{ 11740 if (PyTuple_GET_SIZE(args) == 0) 11741 return do_strip(self, LEFTSTRIP); /* Common case */ 
11742 else 11743 return do_argstrip(self, LEFTSTRIP, args); 11744} 11745 11746 11747PyDoc_STRVAR(rstrip__doc__, 11748 "S.rstrip([chars]) -> str\n\ 11749\n\ 11750Return a copy of the string S with trailing whitespace removed.\n\ 11751If chars is given and not None, remove characters in chars instead."); 11752 11753static PyObject * 11754unicode_rstrip(PyObject *self, PyObject *args) 11755{ 11756 if (PyTuple_GET_SIZE(args) == 0) 11757 return do_strip(self, RIGHTSTRIP); /* Common case */ 11758 else 11759 return do_argstrip(self, RIGHTSTRIP, args); 11760} 11761 11762 11763static PyObject* 11764unicode_repeat(PyObject *str, Py_ssize_t len) 11765{ 11766 PyObject *u; 11767 Py_ssize_t nchars, n; 11768 11769 if (len < 1) { 11770 Py_INCREF(unicode_empty); 11771 return unicode_empty; 11772 } 11773 11774 /* no repeat, return original string */ 11775 if (len == 1) 11776 return unicode_result_unchanged(str); 11777 11778 if (PyUnicode_READY(str) == -1) 11779 return NULL; 11780 11781 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11782 PyErr_SetString(PyExc_OverflowError, 11783 "repeated string is too long"); 11784 return NULL; 11785 } 11786 nchars = len * PyUnicode_GET_LENGTH(str); 11787 11788 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11789 if (!u) 11790 return NULL; 11791 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11792 11793 if (PyUnicode_GET_LENGTH(str) == 1) { 11794 const int kind = PyUnicode_KIND(str); 11795 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11796 if (kind == PyUnicode_1BYTE_KIND) { 11797 void *to = PyUnicode_DATA(u); 11798 memset(to, (unsigned char)fill_char, len); 11799 } 11800 else if (kind == PyUnicode_2BYTE_KIND) { 11801 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 11802 for (n = 0; n < len; ++n) 11803 ucs2[n] = fill_char; 11804 } else { 11805 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 11806 assert(kind == PyUnicode_4BYTE_KIND); 11807 for (n = 0; n < len; ++n) 11808 ucs4[n] = fill_char; 11809 } 11810 } 
11811 else { 11812 /* number of characters copied this far */ 11813 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11814 const Py_ssize_t char_size = PyUnicode_KIND(str); 11815 char *to = (char *) PyUnicode_DATA(u); 11816 Py_MEMCPY(to, PyUnicode_DATA(str), 11817 PyUnicode_GET_LENGTH(str) * char_size); 11818 while (done < nchars) { 11819 n = (done <= nchars-done) ? done : nchars-done; 11820 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11821 done += n; 11822 } 11823 } 11824 11825 assert(_PyUnicode_CheckConsistency(u, 1)); 11826 return u; 11827} 11828 11829PyObject * 11830PyUnicode_Replace(PyObject *obj, 11831 PyObject *subobj, 11832 PyObject *replobj, 11833 Py_ssize_t maxcount) 11834{ 11835 PyObject *self; 11836 PyObject *str1; 11837 PyObject *str2; 11838 PyObject *result; 11839 11840 self = PyUnicode_FromObject(obj); 11841 if (self == NULL) 11842 return NULL; 11843 str1 = PyUnicode_FromObject(subobj); 11844 if (str1 == NULL) { 11845 Py_DECREF(self); 11846 return NULL; 11847 } 11848 str2 = PyUnicode_FromObject(replobj); 11849 if (str2 == NULL) { 11850 Py_DECREF(self); 11851 Py_DECREF(str1); 11852 return NULL; 11853 } 11854 if (PyUnicode_READY(self) == -1 || 11855 PyUnicode_READY(str1) == -1 || 11856 PyUnicode_READY(str2) == -1) 11857 result = NULL; 11858 else 11859 result = replace(self, str1, str2, maxcount); 11860 Py_DECREF(self); 11861 Py_DECREF(str1); 11862 Py_DECREF(str2); 11863 return result; 11864} 11865 11866PyDoc_STRVAR(replace__doc__, 11867 "S.replace(old, new[, count]) -> str\n\ 11868\n\ 11869Return a copy of S with all occurrences of substring\n\ 11870old replaced by new. 
If the optional argument count is\n\
given, only the first count occurrences are replaced.");

/* str.replace(old, new[, count]).  Both arguments are converted with
   PyUnicode_FromObject(), which yields references that are released
   before returning.  maxcount stays -1 when count is omitted. */
static PyObject*
unicode_replace(PyObject *self, PyObject *args)
{
    PyObject *str1;
    PyObject *str2;
    Py_ssize_t maxcount = -1;
    PyObject *result;

    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
        return NULL;
    if (PyUnicode_READY(self) == -1)
        return NULL;
    str1 = PyUnicode_FromObject(str1);
    if (str1 == NULL)
        return NULL;
    str2 = PyUnicode_FromObject(str2);
    if (str2 == NULL) {
        Py_DECREF(str1);
        return NULL;
    }
    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
        result = NULL;
    else
        result = replace(self, str1, str2, maxcount);

    Py_DECREF(str1);
    Py_DECREF(str2);
    return result;
}

/* repr(str).  Works in two passes: the first pass computes the exact
   output length, the quote character to use and the maximum code point of
   the result; the second pass writes the escaped characters into a string
   allocated with exactly that size.  The per-character size accounting of
   the first pass must stay in sync with what the second pass writes. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind;
    void *idata, *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 2; /* quotes */
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        switch (ch) {
        /* Quotes are counted once here; the extra backslash for escaped
           single quotes is added after the loop if it turns out to be
           needed. */
        case '\'': squote++; osize++; break;
        case '"':  dquote++; osize++; break;
        case '\\': case '\t': case '\r': case '\n':
            osize += 2; break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                osize += 4; /* \xHH */
            else if (ch < 0x7f)
                osize++;
            else if (Py_UNICODE_ISPRINTABLE(ch)) {
                /* Only characters copied as-is can raise the maximum
                   character of the result. */
                osize++;
                max = ch > max ? ch : max;
            }
            else if (ch < 0x100)
                osize += 4; /* \xHH */
            else if (ch < 0x10000)
                osize += 6; /* \uHHHH */
            else
                osize += 10; /* \UHHHHHHHH */
        }
    }

    quote = '\'';
    if (squote) {
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Opening and closing quotes are written up front; the loop below
       fills positions 1 .. osize-2. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);

    for (i = 0, o = 1; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

        /* Escape quotes and backslashes */
        if ((ch == quote) || (ch == '\\')) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, ch);
            continue;
        }

        /* Map special whitespace to '\t', '\n', '\r' */
        if (ch == '\t') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 't');
        }
        else if (ch == '\n') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'n');
        }
        else if (ch == '\r') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'r');
        }

        /* Map non-printable US ASCII to '\xhh' */
        else if (ch < ' ' || ch == 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'x');
            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
        }

        /* Copy ASCII characters as-is */
        else if (ch < 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, ch);
        }

        /* Non-ASCII characters */
        else {
            /* Map Unicode whitespace and control characters
               (categories Z* and C* except ASCII space)
            */
            if (!Py_UNICODE_ISPRINTABLE(ch)) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                /* Map 8-bit characters to '\xhh' */
                if (ch <= 0xff) {
                    PyUnicode_WRITE(okind, odata, o++, 'x');
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
                }
                /* Map 16-bit characters to '\uxxxx' */
                else if (ch <= 0xffff) {
                    PyUnicode_WRITE(okind, odata, o++, 'u');
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                }
                /* Map 21-bit characters to '\U00xxxxxx' */
                else {
                    PyUnicode_WRITE(okind, odata, o++, 'U');
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                }
            }
            /* Copy characters as-is */
            else {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }
        }
    }
    /* Closing quote already added at the beginning */
    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}

PyDoc_STRVAR(rfind__doc__,
             "S.rfind(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained 
within S[start:end]. Optional\n\ 12057arguments start and end are interpreted as in slice notation.\n\ 12058\n\ 12059Return -1 on failure."); 12060 12061static PyObject * 12062unicode_rfind(PyObject *self, PyObject *args) 12063{ 12064 PyObject *substring; 12065 Py_ssize_t start; 12066 Py_ssize_t end; 12067 Py_ssize_t result; 12068 12069 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12070 &start, &end)) 12071 return NULL; 12072 12073 if (PyUnicode_READY(self) == -1) 12074 return NULL; 12075 if (PyUnicode_READY(substring) == -1) 12076 return NULL; 12077 12078 result = any_find_slice(-1, self, substring, start, end); 12079 12080 Py_DECREF(substring); 12081 12082 if (result == -2) 12083 return NULL; 12084 12085 return PyLong_FromSsize_t(result); 12086} 12087 12088PyDoc_STRVAR(rindex__doc__, 12089 "S.rindex(sub[, start[, end]]) -> int\n\ 12090\n\ 12091Like S.rfind() but raise ValueError when the substring is not found."); 12092 12093static PyObject * 12094unicode_rindex(PyObject *self, PyObject *args) 12095{ 12096 PyObject *substring; 12097 Py_ssize_t start; 12098 Py_ssize_t end; 12099 Py_ssize_t result; 12100 12101 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12102 &start, &end)) 12103 return NULL; 12104 12105 if (PyUnicode_READY(self) == -1) 12106 return NULL; 12107 if (PyUnicode_READY(substring) == -1) 12108 return NULL; 12109 12110 result = any_find_slice(-1, self, substring, start, end); 12111 12112 Py_DECREF(substring); 12113 12114 if (result == -2) 12115 return NULL; 12116 12117 if (result < 0) { 12118 PyErr_SetString(PyExc_ValueError, "substring not found"); 12119 return NULL; 12120 } 12121 12122 return PyLong_FromSsize_t(result); 12123} 12124 12125PyDoc_STRVAR(rjust__doc__, 12126 "S.rjust(width[, fillchar]) -> str\n\ 12127\n\ 12128Return S right-justified in a string of length width. 
Padding is\n\ 12129done using the specified fill character (default is a space)."); 12130 12131static PyObject * 12132unicode_rjust(PyObject *self, PyObject *args) 12133{ 12134 Py_ssize_t width; 12135 Py_UCS4 fillchar = ' '; 12136 12137 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12138 return NULL; 12139 12140 if (PyUnicode_READY(self) == -1) 12141 return NULL; 12142 12143 if (PyUnicode_GET_LENGTH(self) >= width) 12144 return unicode_result_unchanged(self); 12145 12146 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12147} 12148 12149PyObject * 12150PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12151{ 12152 PyObject *result; 12153 12154 s = PyUnicode_FromObject(s); 12155 if (s == NULL) 12156 return NULL; 12157 if (sep != NULL) { 12158 sep = PyUnicode_FromObject(sep); 12159 if (sep == NULL) { 12160 Py_DECREF(s); 12161 return NULL; 12162 } 12163 } 12164 12165 result = split(s, sep, maxsplit); 12166 12167 Py_DECREF(s); 12168 Py_XDECREF(sep); 12169 return result; 12170} 12171 12172PyDoc_STRVAR(split__doc__, 12173 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12174\n\ 12175Return a list of the words in S, using sep as the\n\ 12176delimiter string. If maxsplit is given, at most maxsplit\n\ 12177splits are done. 
If sep is not specified or is None, any\n\ 12178whitespace string is a separator and empty strings are\n\ 12179removed from the result."); 12180 12181static PyObject* 12182unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12183{ 12184 static char *kwlist[] = {"sep", "maxsplit", 0}; 12185 PyObject *substring = Py_None; 12186 Py_ssize_t maxcount = -1; 12187 12188 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12189 kwlist, &substring, &maxcount)) 12190 return NULL; 12191 12192 if (substring == Py_None) 12193 return split(self, NULL, maxcount); 12194 else if (PyUnicode_Check(substring)) 12195 return split(self, substring, maxcount); 12196 else 12197 return PyUnicode_Split(self, substring, maxcount); 12198} 12199 12200PyObject * 12201PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12202{ 12203 PyObject* str_obj; 12204 PyObject* sep_obj; 12205 PyObject* out; 12206 int kind1, kind2, kind; 12207 void *buf1 = NULL, *buf2 = NULL; 12208 Py_ssize_t len1, len2; 12209 12210 str_obj = PyUnicode_FromObject(str_in); 12211 if (!str_obj) 12212 return NULL; 12213 sep_obj = PyUnicode_FromObject(sep_in); 12214 if (!sep_obj) { 12215 Py_DECREF(str_obj); 12216 return NULL; 12217 } 12218 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12219 Py_DECREF(sep_obj); 12220 Py_DECREF(str_obj); 12221 return NULL; 12222 } 12223 12224 kind1 = PyUnicode_KIND(str_obj); 12225 kind2 = PyUnicode_KIND(sep_obj); 12226 kind = Py_MAX(kind1, kind2); 12227 buf1 = PyUnicode_DATA(str_obj); 12228 if (kind1 != kind) 12229 buf1 = _PyUnicode_AsKind(str_obj, kind); 12230 if (!buf1) 12231 goto onError; 12232 buf2 = PyUnicode_DATA(sep_obj); 12233 if (kind2 != kind) 12234 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12235 if (!buf2) 12236 goto onError; 12237 len1 = PyUnicode_GET_LENGTH(str_obj); 12238 len2 = PyUnicode_GET_LENGTH(sep_obj); 12239 12240 switch (PyUnicode_KIND(str_obj)) { 12241 case PyUnicode_1BYTE_KIND: 12242 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12243 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12244 else 12245 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12246 break; 12247 case PyUnicode_2BYTE_KIND: 12248 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12249 break; 12250 case PyUnicode_4BYTE_KIND: 12251 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12252 break; 12253 default: 12254 assert(0); 12255 out = 0; 12256 } 12257 12258 Py_DECREF(sep_obj); 12259 Py_DECREF(str_obj); 12260 if (kind1 != kind) 12261 PyMem_Free(buf1); 12262 if (kind2 != kind) 12263 PyMem_Free(buf2); 12264 12265 return out; 12266 onError: 12267 Py_DECREF(sep_obj); 12268 Py_DECREF(str_obj); 12269 if (kind1 != kind && buf1) 12270 PyMem_Free(buf1); 12271 if (kind2 != kind && buf2) 12272 PyMem_Free(buf2); 12273 return NULL; 12274} 12275 12276 12277PyObject * 12278PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12279{ 12280 PyObject* str_obj; 12281 PyObject* sep_obj; 12282 PyObject* out; 12283 int kind1, kind2, kind; 12284 void *buf1 = NULL, *buf2 = NULL; 12285 Py_ssize_t len1, len2; 12286 12287 str_obj = PyUnicode_FromObject(str_in); 12288 if (!str_obj) 12289 return NULL; 12290 sep_obj = PyUnicode_FromObject(sep_in); 12291 if (!sep_obj) { 12292 Py_DECREF(str_obj); 12293 return NULL; 12294 } 12295 12296 kind1 = PyUnicode_KIND(str_in); 12297 kind2 = PyUnicode_KIND(sep_obj); 12298 kind = Py_MAX(kind1, kind2); 12299 buf1 = PyUnicode_DATA(str_in); 12300 if (kind1 != kind) 12301 buf1 = _PyUnicode_AsKind(str_in, kind); 12302 if (!buf1) 12303 goto onError; 12304 buf2 = PyUnicode_DATA(sep_obj); 12305 if (kind2 != kind) 12306 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12307 if (!buf2) 12308 goto onError; 12309 len1 = PyUnicode_GET_LENGTH(str_obj); 12310 len2 = PyUnicode_GET_LENGTH(sep_obj); 12311 12312 switch (PyUnicode_KIND(str_in)) { 12313 case PyUnicode_1BYTE_KIND: 12314 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12315 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12316 else 12317 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12318 break; 12319 case PyUnicode_2BYTE_KIND: 12320 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12321 break; 12322 case PyUnicode_4BYTE_KIND: 12323 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12324 break; 12325 default: 12326 assert(0); 12327 out = 0; 12328 } 12329 12330 Py_DECREF(sep_obj); 12331 Py_DECREF(str_obj); 12332 if (kind1 != kind) 12333 PyMem_Free(buf1); 12334 if (kind2 != kind) 12335 PyMem_Free(buf2); 12336 12337 return out; 12338 onError: 12339 Py_DECREF(sep_obj); 12340 Py_DECREF(str_obj); 12341 if (kind1 != kind && buf1) 12342 PyMem_Free(buf1); 12343 if (kind2 != kind && buf2) 12344 PyMem_Free(buf2); 12345 return NULL; 12346} 12347 12348PyDoc_STRVAR(partition__doc__, 12349 "S.partition(sep) -> (head, sep, tail)\n\ 12350\n\ 12351Search for the separator sep in S, and return the part before it,\n\ 12352the separator itself, and the part after it. If the separator is not\n\ 12353found, return S and two empty strings."); 12354 12355static PyObject* 12356unicode_partition(PyObject *self, PyObject *separator) 12357{ 12358 return PyUnicode_Partition(self, separator); 12359} 12360 12361PyDoc_STRVAR(rpartition__doc__, 12362 "S.rpartition(sep) -> (head, sep, tail)\n\ 12363\n\ 12364Search for the separator sep in S, starting at the end of S, and return\n\ 12365the part before it, the separator itself, and the part after it. 
If the\n\ 12366separator is not found, return two empty strings and S."); 12367 12368static PyObject* 12369unicode_rpartition(PyObject *self, PyObject *separator) 12370{ 12371 return PyUnicode_RPartition(self, separator); 12372} 12373 12374PyObject * 12375PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12376{ 12377 PyObject *result; 12378 12379 s = PyUnicode_FromObject(s); 12380 if (s == NULL) 12381 return NULL; 12382 if (sep != NULL) { 12383 sep = PyUnicode_FromObject(sep); 12384 if (sep == NULL) { 12385 Py_DECREF(s); 12386 return NULL; 12387 } 12388 } 12389 12390 result = rsplit(s, sep, maxsplit); 12391 12392 Py_DECREF(s); 12393 Py_XDECREF(sep); 12394 return result; 12395} 12396 12397PyDoc_STRVAR(rsplit__doc__, 12398 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12399\n\ 12400Return a list of the words in S, using sep as the\n\ 12401delimiter string, starting at the end of the string and\n\ 12402working to the front. If maxsplit is given, at most maxsplit\n\ 12403splits are done. 
If sep is not specified, any whitespace string\n\ 12404is a separator."); 12405 12406static PyObject* 12407unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12408{ 12409 static char *kwlist[] = {"sep", "maxsplit", 0}; 12410 PyObject *substring = Py_None; 12411 Py_ssize_t maxcount = -1; 12412 12413 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12414 kwlist, &substring, &maxcount)) 12415 return NULL; 12416 12417 if (substring == Py_None) 12418 return rsplit(self, NULL, maxcount); 12419 else if (PyUnicode_Check(substring)) 12420 return rsplit(self, substring, maxcount); 12421 else 12422 return PyUnicode_RSplit(self, substring, maxcount); 12423} 12424 12425PyDoc_STRVAR(splitlines__doc__, 12426 "S.splitlines([keepends]) -> list of strings\n\ 12427\n\ 12428Return a list of the lines in S, breaking at line boundaries.\n\ 12429Line breaks are not included in the resulting list unless keepends\n\ 12430is given and true."); 12431 12432static PyObject* 12433unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12434{ 12435 static char *kwlist[] = {"keepends", 0}; 12436 int keepends = 0; 12437 12438 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12439 kwlist, &keepends)) 12440 return NULL; 12441 12442 return PyUnicode_Splitlines(self, keepends); 12443} 12444 12445static 12446PyObject *unicode_str(PyObject *self) 12447{ 12448 return unicode_result_unchanged(self); 12449} 12450 12451PyDoc_STRVAR(swapcase__doc__, 12452 "S.swapcase() -> str\n\ 12453\n\ 12454Return a copy of S with uppercase characters converted to lowercase\n\ 12455and vice versa."); 12456 12457static PyObject* 12458unicode_swapcase(PyObject *self) 12459{ 12460 if (PyUnicode_READY(self) == -1) 12461 return NULL; 12462 return case_operation(self, do_swapcase); 12463} 12464 12465PyDoc_STRVAR(maketrans__doc__, 12466 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12467\n\ 12468Return a translation table usable for str.translate().\n\ 12469If there is only 
one argument, it must be a dictionary mapping Unicode\n\ 12470ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12471Character keys will be then converted to ordinals.\n\ 12472If there are two arguments, they must be strings of equal length, and\n\ 12473in the resulting dictionary, each character in x will be mapped to the\n\ 12474character at the same position in y. If there is a third argument, it\n\ 12475must be a string, whose characters will be mapped to None in the result."); 12476 12477static PyObject* 12478unicode_maketrans(PyObject *null, PyObject *args) 12479{ 12480 PyObject *x, *y = NULL, *z = NULL; 12481 PyObject *new = NULL, *key, *value; 12482 Py_ssize_t i = 0; 12483 int res; 12484 12485 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12486 return NULL; 12487 new = PyDict_New(); 12488 if (!new) 12489 return NULL; 12490 if (y != NULL) { 12491 int x_kind, y_kind, z_kind; 12492 void *x_data, *y_data, *z_data; 12493 12494 /* x must be a string too, of equal length */ 12495 if (!PyUnicode_Check(x)) { 12496 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12497 "be a string if there is a second argument"); 12498 goto err; 12499 } 12500 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12501 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12502 "arguments must have equal length"); 12503 goto err; 12504 } 12505 /* create entries for translating chars in x to those in y */ 12506 x_kind = PyUnicode_KIND(x); 12507 y_kind = PyUnicode_KIND(y); 12508 x_data = PyUnicode_DATA(x); 12509 y_data = PyUnicode_DATA(y); 12510 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12511 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12512 if (!key) 12513 goto err; 12514 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12515 if (!value) { 12516 Py_DECREF(key); 12517 goto err; 12518 } 12519 res = PyDict_SetItem(new, key, value); 12520 Py_DECREF(key); 12521 Py_DECREF(value); 12522 if (res < 
0) 12523 goto err; 12524 } 12525 /* create entries for deleting chars in z */ 12526 if (z != NULL) { 12527 z_kind = PyUnicode_KIND(z); 12528 z_data = PyUnicode_DATA(z); 12529 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 12530 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12531 if (!key) 12532 goto err; 12533 res = PyDict_SetItem(new, key, Py_None); 12534 Py_DECREF(key); 12535 if (res < 0) 12536 goto err; 12537 } 12538 } 12539 } else { 12540 int kind; 12541 void *data; 12542 12543 /* x must be a dict */ 12544 if (!PyDict_CheckExact(x)) { 12545 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12546 "to maketrans it must be a dict"); 12547 goto err; 12548 } 12549 /* copy entries into the new dict, converting string keys to int keys */ 12550 while (PyDict_Next(x, &i, &key, &value)) { 12551 if (PyUnicode_Check(key)) { 12552 /* convert string keys to integer keys */ 12553 PyObject *newkey; 12554 if (PyUnicode_GET_LENGTH(key) != 1) { 12555 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12556 "table must be of length 1"); 12557 goto err; 12558 } 12559 kind = PyUnicode_KIND(key); 12560 data = PyUnicode_DATA(key); 12561 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12562 if (!newkey) 12563 goto err; 12564 res = PyDict_SetItem(new, newkey, value); 12565 Py_DECREF(newkey); 12566 if (res < 0) 12567 goto err; 12568 } else if (PyLong_Check(key)) { 12569 /* just keep integer keys */ 12570 if (PyDict_SetItem(new, key, value) < 0) 12571 goto err; 12572 } else { 12573 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12574 "be strings or integers"); 12575 goto err; 12576 } 12577 } 12578 } 12579 return new; 12580 err: 12581 Py_DECREF(new); 12582 return NULL; 12583} 12584 12585PyDoc_STRVAR(translate__doc__, 12586 "S.translate(table) -> str\n\ 12587\n\ 12588Return a copy of the string S, where all characters have been mapped\n\ 12589through the given translation table, which must be a mapping of\n\ 12590Unicode 
ordinals to Unicode ordinals, strings, or None.\n\ 12591Unmapped characters are left untouched. Characters mapped to None\n\ 12592are deleted."); 12593 12594static PyObject* 12595unicode_translate(PyObject *self, PyObject *table) 12596{ 12597 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12598} 12599 12600PyDoc_STRVAR(upper__doc__, 12601 "S.upper() -> str\n\ 12602\n\ 12603Return a copy of S converted to uppercase."); 12604 12605static PyObject* 12606unicode_upper(PyObject *self) 12607{ 12608 if (PyUnicode_READY(self) == -1) 12609 return NULL; 12610 if (PyUnicode_IS_ASCII(self)) 12611 return ascii_upper_or_lower(self, 0); 12612 return case_operation(self, do_upper); 12613} 12614 12615PyDoc_STRVAR(zfill__doc__, 12616 "S.zfill(width) -> str\n\ 12617\n\ 12618Pad a numeric string S with zeros on the left, to fill a field\n\ 12619of the specified width. The string S is never truncated."); 12620 12621static PyObject * 12622unicode_zfill(PyObject *self, PyObject *args) 12623{ 12624 Py_ssize_t fill; 12625 PyObject *u; 12626 Py_ssize_t width; 12627 int kind; 12628 void *data; 12629 Py_UCS4 chr; 12630 12631 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12632 return NULL; 12633 12634 if (PyUnicode_READY(self) == -1) 12635 return NULL; 12636 12637 if (PyUnicode_GET_LENGTH(self) >= width) 12638 return unicode_result_unchanged(self); 12639 12640 fill = width - PyUnicode_GET_LENGTH(self); 12641 12642 u = pad(self, fill, 0, '0'); 12643 12644 if (u == NULL) 12645 return NULL; 12646 12647 kind = PyUnicode_KIND(u); 12648 data = PyUnicode_DATA(u); 12649 chr = PyUnicode_READ(kind, data, fill); 12650 12651 if (chr == '+' || chr == '-') { 12652 /* move sign to beginning of string */ 12653 PyUnicode_WRITE(kind, data, 0, chr); 12654 PyUnicode_WRITE(kind, data, fill, '0'); 12655 } 12656 12657 assert(_PyUnicode_CheckConsistency(u, 1)); 12658 return u; 12659} 12660 12661#if 0 12662static PyObject * 12663unicode__decimal2ascii(PyObject *self) 12664{ 12665 return 
PyUnicode_TransformDecimalAndSpaceToASCII(self); 12666} 12667#endif 12668 12669PyDoc_STRVAR(startswith__doc__, 12670 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12671\n\ 12672Return True if S starts with the specified prefix, False otherwise.\n\ 12673With optional start, test S beginning at that position.\n\ 12674With optional end, stop comparing S at that position.\n\ 12675prefix can also be a tuple of strings to try."); 12676 12677static PyObject * 12678unicode_startswith(PyObject *self, 12679 PyObject *args) 12680{ 12681 PyObject *subobj; 12682 PyObject *substring; 12683 Py_ssize_t start = 0; 12684 Py_ssize_t end = PY_SSIZE_T_MAX; 12685 int result; 12686 12687 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12688 return NULL; 12689 if (PyTuple_Check(subobj)) { 12690 Py_ssize_t i; 12691 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12692 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 12693 if (substring == NULL) 12694 return NULL; 12695 result = tailmatch(self, substring, start, end, -1); 12696 Py_DECREF(substring); 12697 if (result) { 12698 Py_RETURN_TRUE; 12699 } 12700 } 12701 /* nothing matched */ 12702 Py_RETURN_FALSE; 12703 } 12704 substring = PyUnicode_FromObject(subobj); 12705 if (substring == NULL) { 12706 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12707 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12708 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12709 return NULL; 12710 } 12711 result = tailmatch(self, substring, start, end, -1); 12712 Py_DECREF(substring); 12713 return PyBool_FromLong(result); 12714} 12715 12716 12717PyDoc_STRVAR(endswith__doc__, 12718 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12719\n\ 12720Return True if S ends with the specified suffix, False otherwise.\n\ 12721With optional start, test S beginning at that position.\n\ 12722With optional end, stop comparing S at that position.\n\ 12723suffix can also be a tuple of strings to try."); 12724 
12725static PyObject * 12726unicode_endswith(PyObject *self, 12727 PyObject *args) 12728{ 12729 PyObject *subobj; 12730 PyObject *substring; 12731 Py_ssize_t start = 0; 12732 Py_ssize_t end = PY_SSIZE_T_MAX; 12733 int result; 12734 12735 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12736 return NULL; 12737 if (PyTuple_Check(subobj)) { 12738 Py_ssize_t i; 12739 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12740 substring = PyUnicode_FromObject( 12741 PyTuple_GET_ITEM(subobj, i)); 12742 if (substring == NULL) 12743 return NULL; 12744 result = tailmatch(self, substring, start, end, +1); 12745 Py_DECREF(substring); 12746 if (result) { 12747 Py_RETURN_TRUE; 12748 } 12749 } 12750 Py_RETURN_FALSE; 12751 } 12752 substring = PyUnicode_FromObject(subobj); 12753 if (substring == NULL) { 12754 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12755 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12756 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12757 return NULL; 12758 } 12759 result = tailmatch(self, substring, start, end, +1); 12760 Py_DECREF(substring); 12761 return PyBool_FromLong(result); 12762} 12763 12764Py_LOCAL_INLINE(void) 12765_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 12766{ 12767 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 12768 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 12769 writer->data = PyUnicode_DATA(writer->buffer); 12770 writer->kind = PyUnicode_KIND(writer->buffer); 12771} 12772 12773void 12774_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) 12775{ 12776 memset(writer, 0, sizeof(*writer)); 12777#ifdef Py_DEBUG 12778 writer->kind = 5; /* invalid kind */ 12779#endif 12780 writer->min_length = Py_MAX(min_length, 100); 12781 writer->overallocate = (min_length > 0); 12782} 12783 12784int 12785_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 12786 Py_ssize_t length, Py_UCS4 maxchar) 12787{ 12788 Py_ssize_t newlen; 12789 PyObject *newbuffer; 
12790 12791 assert(length > 0); 12792 12793 if (length > PY_SSIZE_T_MAX - writer->pos) { 12794 PyErr_NoMemory(); 12795 return -1; 12796 } 12797 newlen = writer->pos + length; 12798 12799 if (writer->buffer == NULL) { 12800 if (writer->overallocate) { 12801 /* overallocate 25% to limit the number of resize */ 12802 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) 12803 newlen += newlen / 4; 12804 if (newlen < writer->min_length) 12805 newlen = writer->min_length; 12806 } 12807 writer->buffer = PyUnicode_New(newlen, maxchar); 12808 if (writer->buffer == NULL) 12809 return -1; 12810 _PyUnicodeWriter_Update(writer); 12811 return 0; 12812 } 12813 12814 if (newlen > writer->size) { 12815 if (writer->overallocate) { 12816 /* overallocate 25% to limit the number of resize */ 12817 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) 12818 newlen += newlen / 4; 12819 if (newlen < writer->min_length) 12820 newlen = writer->min_length; 12821 } 12822 12823 if (maxchar > writer->maxchar || writer->readonly) { 12824 /* resize + widen */ 12825 newbuffer = PyUnicode_New(newlen, maxchar); 12826 if (newbuffer == NULL) 12827 return -1; 12828 _PyUnicode_FastCopyCharacters(newbuffer, 0, 12829 writer->buffer, 0, writer->pos); 12830 Py_DECREF(writer->buffer); 12831 writer->readonly = 0; 12832 } 12833 else { 12834 newbuffer = resize_compact(writer->buffer, newlen); 12835 if (newbuffer == NULL) 12836 return -1; 12837 } 12838 writer->buffer = newbuffer; 12839 _PyUnicodeWriter_Update(writer); 12840 } 12841 else if (maxchar > writer->maxchar) { 12842 assert(!writer->readonly); 12843 newbuffer = PyUnicode_New(writer->size, maxchar); 12844 if (newbuffer == NULL) 12845 return -1; 12846 _PyUnicode_FastCopyCharacters(newbuffer, 0, 12847 writer->buffer, 0, writer->pos); 12848 Py_DECREF(writer->buffer); 12849 writer->buffer = newbuffer; 12850 _PyUnicodeWriter_Update(writer); 12851 } 12852 return 0; 12853} 12854 12855int 12856_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 12857{ 12858 Py_UCS4 
maxchar; 12859 Py_ssize_t len; 12860 12861 if (PyUnicode_READY(str) == -1) 12862 return -1; 12863 len = PyUnicode_GET_LENGTH(str); 12864 if (len == 0) 12865 return 0; 12866 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 12867 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 12868 if (writer->buffer == NULL && !writer->overallocate) { 12869 Py_INCREF(str); 12870 writer->buffer = str; 12871 _PyUnicodeWriter_Update(writer); 12872 writer->readonly = 1; 12873 writer->size = 0; 12874 writer->pos += len; 12875 return 0; 12876 } 12877 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 12878 return -1; 12879 } 12880 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 12881 str, 0, len); 12882 writer->pos += len; 12883 return 0; 12884} 12885 12886PyObject * 12887_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 12888{ 12889 if (writer->pos == 0) { 12890 Py_XDECREF(writer->buffer); 12891 Py_INCREF(unicode_empty); 12892 return unicode_empty; 12893 } 12894 if (writer->readonly) { 12895 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos); 12896 return writer->buffer; 12897 } 12898 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 12899 PyObject *newbuffer; 12900 newbuffer = resize_compact(writer->buffer, writer->pos); 12901 if (newbuffer == NULL) { 12902 Py_DECREF(writer->buffer); 12903 return NULL; 12904 } 12905 writer->buffer = newbuffer; 12906 } 12907 assert(_PyUnicode_CheckConsistency(writer->buffer, 1)); 12908 return writer->buffer; 12909} 12910 12911void 12912_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 12913{ 12914 Py_CLEAR(writer->buffer); 12915} 12916 12917#include "stringlib/unicode_format.h" 12918 12919PyDoc_STRVAR(format__doc__, 12920 "S.format(*args, **kwargs) -> str\n\ 12921\n\ 12922Return a formatted version of S, using substitutions from args and kwargs.\n\ 12923The substitutions are identified by braces ('{' and '}')."); 12924 12925PyDoc_STRVAR(format_map__doc__, 12926 "S.format_map(mapping) -> str\n\ 
12927\n\ 12928Return a formatted version of S, using substitutions from mapping.\n\ 12929The substitutions are identified by braces ('{' and '}')."); 12930 12931static PyObject * 12932unicode__format__(PyObject* self, PyObject* args) 12933{ 12934 PyObject *format_spec; 12935 _PyUnicodeWriter writer; 12936 int ret; 12937 12938 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12939 return NULL; 12940 12941 if (PyUnicode_READY(self) == -1) 12942 return NULL; 12943 _PyUnicodeWriter_Init(&writer, 0); 12944 ret = _PyUnicode_FormatAdvancedWriter(&writer, 12945 self, format_spec, 0, 12946 PyUnicode_GET_LENGTH(format_spec)); 12947 if (ret == -1) { 12948 _PyUnicodeWriter_Dealloc(&writer); 12949 return NULL; 12950 } 12951 return _PyUnicodeWriter_Finish(&writer); 12952} 12953 12954PyDoc_STRVAR(p_format__doc__, 12955 "S.__format__(format_spec) -> str\n\ 12956\n\ 12957Return a formatted version of S as described by format_spec."); 12958 12959static PyObject * 12960unicode__sizeof__(PyObject *v) 12961{ 12962 Py_ssize_t size; 12963 12964 /* If it's a compact object, account for base structure + 12965 character data. */ 12966 if (PyUnicode_IS_COMPACT_ASCII(v)) 12967 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12968 else if (PyUnicode_IS_COMPACT(v)) 12969 size = sizeof(PyCompactUnicodeObject) + 12970 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 12971 else { 12972 /* If it is a two-block object, account for base object, and 12973 for character block if present. */ 12974 size = sizeof(PyUnicodeObject); 12975 if (_PyUnicode_DATA_ANY(v)) 12976 size += (PyUnicode_GET_LENGTH(v) + 1) * 12977 PyUnicode_KIND(v); 12978 } 12979 /* If the wstr pointer is present, account for it unless it is shared 12980 with the data pointer. Check if the data is not shared. 
*/ 12981 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12982 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12983 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12984 size += PyUnicode_UTF8_LENGTH(v) + 1; 12985 12986 return PyLong_FromSsize_t(size); 12987} 12988 12989PyDoc_STRVAR(sizeof__doc__, 12990 "S.__sizeof__() -> size of S in memory, in bytes"); 12991 12992static PyObject * 12993unicode_getnewargs(PyObject *v) 12994{ 12995 PyObject *copy = _PyUnicode_Copy(v); 12996 if (!copy) 12997 return NULL; 12998 return Py_BuildValue("(N)", copy); 12999} 13000 13001static PyMethodDef unicode_methods[] = { 13002 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 13003 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 13004 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 13005 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 13006 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 13007 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 13008 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 13009 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 13010 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 13011 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13012 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 13013 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13014 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 13015 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 13016 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 13017 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 13018 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 13019 {"rfind", (PyCFunction) unicode_rfind, 
METH_VARARGS, rfind__doc__}, 13020 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13021 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 13022 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 13023 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 13024 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 13025 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 13026 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 13027 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 13028 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 13029 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13030 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13031 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 13032 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 13033 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 13034 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 13035 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 13036 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 13037 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 13038 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 13039 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13040 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 13041 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13042 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13043 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, 
format__doc__}, 13044 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13045 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13046 {"maketrans", (PyCFunction) unicode_maketrans, 13047 METH_VARARGS | METH_STATIC, maketrans__doc__}, 13048 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13049#if 0 13050 /* These methods are just used for debugging the implementation. */ 13051 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13052#endif 13053 13054 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13055 {NULL, NULL} 13056}; 13057 13058static PyObject * 13059unicode_mod(PyObject *v, PyObject *w) 13060{ 13061 if (!PyUnicode_Check(v)) 13062 Py_RETURN_NOTIMPLEMENTED; 13063 return PyUnicode_Format(v, w); 13064} 13065 13066static PyNumberMethods unicode_as_number = { 13067 0, /*nb_add*/ 13068 0, /*nb_subtract*/ 13069 0, /*nb_multiply*/ 13070 unicode_mod, /*nb_remainder*/ 13071}; 13072 13073static PySequenceMethods unicode_as_sequence = { 13074 (lenfunc) unicode_length, /* sq_length */ 13075 PyUnicode_Concat, /* sq_concat */ 13076 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13077 (ssizeargfunc) unicode_getitem, /* sq_item */ 13078 0, /* sq_slice */ 13079 0, /* sq_ass_item */ 13080 0, /* sq_ass_slice */ 13081 PyUnicode_Contains, /* sq_contains */ 13082}; 13083 13084static PyObject* 13085unicode_subscript(PyObject* self, PyObject* item) 13086{ 13087 if (PyUnicode_READY(self) == -1) 13088 return NULL; 13089 13090 if (PyIndex_Check(item)) { 13091 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13092 if (i == -1 && PyErr_Occurred()) 13093 return NULL; 13094 if (i < 0) 13095 i += PyUnicode_GET_LENGTH(self); 13096 return unicode_getitem(self, i); 13097 } else if (PySlice_Check(item)) { 13098 Py_ssize_t start, stop, step, slicelength, cur, i; 13099 PyObject *result; 13100 void *src_data, *dest_data; 13101 int src_kind, dest_kind; 13102 Py_UCS4 
ch, max_char, kind_limit; 13103 13104 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13105 &start, &stop, &step, &slicelength) < 0) { 13106 return NULL; 13107 } 13108 13109 if (slicelength <= 0) { 13110 Py_INCREF(unicode_empty); 13111 return unicode_empty; 13112 } else if (start == 0 && step == 1 && 13113 slicelength == PyUnicode_GET_LENGTH(self)) { 13114 return unicode_result_unchanged(self); 13115 } else if (step == 1) { 13116 return PyUnicode_Substring(self, 13117 start, start + slicelength); 13118 } 13119 /* General case */ 13120 src_kind = PyUnicode_KIND(self); 13121 src_data = PyUnicode_DATA(self); 13122 if (!PyUnicode_IS_ASCII(self)) { 13123 kind_limit = kind_maxchar_limit(src_kind); 13124 max_char = 0; 13125 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13126 ch = PyUnicode_READ(src_kind, src_data, cur); 13127 if (ch > max_char) { 13128 max_char = ch; 13129 if (max_char >= kind_limit) 13130 break; 13131 } 13132 } 13133 } 13134 else 13135 max_char = 127; 13136 result = PyUnicode_New(slicelength, max_char); 13137 if (result == NULL) 13138 return NULL; 13139 dest_kind = PyUnicode_KIND(result); 13140 dest_data = PyUnicode_DATA(result); 13141 13142 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13143 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13144 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13145 } 13146 assert(_PyUnicode_CheckConsistency(result, 1)); 13147 return result; 13148 } else { 13149 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13150 return NULL; 13151 } 13152} 13153 13154static PyMappingMethods unicode_as_mapping = { 13155 (lenfunc)unicode_length, /* mp_length */ 13156 (binaryfunc)unicode_subscript, /* mp_subscript */ 13157 (objobjargproc)0, /* mp_ass_subscript */ 13158}; 13159 13160 13161/* Helpers for PyUnicode_Format() */ 13162 13163static PyObject * 13164getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 13165{ 13166 Py_ssize_t argidx = *p_argidx; 
13167 if (argidx < arglen) { 13168 (*p_argidx)++; 13169 if (arglen < 0) 13170 return args; 13171 else 13172 return PyTuple_GetItem(args, argidx); 13173 } 13174 PyErr_SetString(PyExc_TypeError, 13175 "not enough arguments for format string"); 13176 return NULL; 13177} 13178 13179/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13180 13181static int 13182formatfloat(PyObject *v, int flags, int prec, int type, 13183 PyObject **p_output, _PyUnicodeWriter *writer) 13184{ 13185 char *p; 13186 double x; 13187 Py_ssize_t len; 13188 13189 x = PyFloat_AsDouble(v); 13190 if (x == -1.0 && PyErr_Occurred()) 13191 return -1; 13192 13193 if (prec < 0) 13194 prec = 6; 13195 13196 p = PyOS_double_to_string(x, type, prec, 13197 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 13198 if (p == NULL) 13199 return -1; 13200 len = strlen(p); 13201 if (writer) { 13202 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 13203 return -1; 13204 memcpy((char*)writer->data + writer->pos * writer->kind, 13205 p, 13206 len); 13207 writer->pos += len; 13208 } 13209 else 13210 *p_output = _PyUnicode_FromASCII(p, len); 13211 PyMem_Free(p); 13212 return 0; 13213} 13214 13215/* formatlong() emulates the format codes d, u, o, x and X, and 13216 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13217 * Python's regular ints. 13218 * Return value: a new PyUnicodeObject*, or NULL if error. 13219 * The output string is of the form 13220 * "-"? ("0x" | "0X")? digit+ 13221 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13222 * set in flags. The case of hex digits will be correct, 13223 * There will be at least prec digits, zero-filled on the left if 13224 * necessary to get that many. 
13225 * val object to be converted 13226 * flags bitmask of format flags; only F_ALT is looked at 13227 * prec minimum number of digits; 0-fill on left if needed 13228 * type a character in [duoxX]; u acts the same as d 13229 * 13230 * CAUTION: o, x and X conversions on regular ints can never 13231 * produce a '-' sign, but can for Python's unbounded ints. 13232 */ 13233static PyObject* 13234formatlong(PyObject *val, int flags, int prec, int type) 13235{ 13236 PyObject *result = NULL; 13237 char *buf; 13238 Py_ssize_t i; 13239 int sign; /* 1 if '-', else 0 */ 13240 int len; /* number of characters */ 13241 Py_ssize_t llen; 13242 int numdigits; /* len == numnondigits + numdigits */ 13243 int numnondigits = 0; 13244 13245 /* Avoid exceeding SSIZE_T_MAX */ 13246 if (prec > INT_MAX-3) { 13247 PyErr_SetString(PyExc_OverflowError, 13248 "precision too large"); 13249 return NULL; 13250 } 13251 13252 assert(PyLong_Check(val)); 13253 13254 switch (type) { 13255 case 'd': 13256 case 'u': 13257 /* Special-case boolean: we want 0/1 */ 13258 if (PyBool_Check(val)) 13259 result = PyNumber_ToBase(val, 10); 13260 else 13261 result = Py_TYPE(val)->tp_str(val); 13262 break; 13263 case 'o': 13264 numnondigits = 2; 13265 result = PyNumber_ToBase(val, 8); 13266 break; 13267 case 'x': 13268 case 'X': 13269 numnondigits = 2; 13270 result = PyNumber_ToBase(val, 16); 13271 break; 13272 default: 13273 assert(!"'type' not in [duoxX]"); 13274 } 13275 if (!result) 13276 return NULL; 13277 13278 assert(unicode_modifiable(result)); 13279 assert(PyUnicode_IS_READY(result)); 13280 assert(PyUnicode_IS_ASCII(result)); 13281 13282 /* To modify the string in-place, there can only be one reference. 
*/ 13283 if (Py_REFCNT(result) != 1) { 13284 PyErr_BadInternalCall(); 13285 return NULL; 13286 } 13287 buf = PyUnicode_DATA(result); 13288 llen = PyUnicode_GET_LENGTH(result); 13289 if (llen > INT_MAX) { 13290 PyErr_SetString(PyExc_ValueError, 13291 "string too large in _PyBytes_FormatLong"); 13292 return NULL; 13293 } 13294 len = (int)llen; 13295 sign = buf[0] == '-'; 13296 numnondigits += sign; 13297 numdigits = len - numnondigits; 13298 assert(numdigits > 0); 13299 13300 /* Get rid of base marker unless F_ALT */ 13301 if (((flags & F_ALT) == 0 && 13302 (type == 'o' || type == 'x' || type == 'X'))) { 13303 assert(buf[sign] == '0'); 13304 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 13305 buf[sign+1] == 'o'); 13306 numnondigits -= 2; 13307 buf += 2; 13308 len -= 2; 13309 if (sign) 13310 buf[0] = '-'; 13311 assert(len == numnondigits + numdigits); 13312 assert(numdigits > 0); 13313 } 13314 13315 /* Fill with leading zeroes to meet minimum width. */ 13316 if (prec > numdigits) { 13317 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 13318 numnondigits + prec); 13319 char *b1; 13320 if (!r1) { 13321 Py_DECREF(result); 13322 return NULL; 13323 } 13324 b1 = PyBytes_AS_STRING(r1); 13325 for (i = 0; i < numnondigits; ++i) 13326 *b1++ = *buf++; 13327 for (i = 0; i < prec - numdigits; i++) 13328 *b1++ = '0'; 13329 for (i = 0; i < numdigits; i++) 13330 *b1++ = *buf++; 13331 *b1 = '\0'; 13332 Py_DECREF(result); 13333 result = r1; 13334 buf = PyBytes_AS_STRING(result); 13335 len = numnondigits + prec; 13336 } 13337 13338 /* Fix up case for hex conversions. */ 13339 if (type == 'X') { 13340 /* Need to convert all lower case letters to upper case. 13341 and need to convert 0x to 0X (and -0x to -0X). 
*/ 13342 for (i = 0; i < len; i++) 13343 if (buf[i] >= 'a' && buf[i] <= 'x') 13344 buf[i] -= 'a'-'A'; 13345 } 13346 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) { 13347 PyObject *unicode; 13348 unicode = _PyUnicode_FromASCII(buf, len); 13349 Py_DECREF(result); 13350 result = unicode; 13351 } 13352 return result; 13353} 13354 13355static Py_UCS4 13356formatchar(PyObject *v) 13357{ 13358 /* presume that the buffer is at least 3 characters long */ 13359 if (PyUnicode_Check(v)) { 13360 if (PyUnicode_GET_LENGTH(v) == 1) { 13361 return PyUnicode_READ_CHAR(v, 0); 13362 } 13363 goto onError; 13364 } 13365 else { 13366 /* Integer input truncated to a character */ 13367 long x; 13368 x = PyLong_AsLong(v); 13369 if (x == -1 && PyErr_Occurred()) 13370 goto onError; 13371 13372 if (x < 0 || x > MAX_UNICODE) { 13373 PyErr_SetString(PyExc_OverflowError, 13374 "%c arg not in range(0x110000)"); 13375 return (Py_UCS4) -1; 13376 } 13377 13378 return (Py_UCS4) x; 13379 } 13380 13381 onError: 13382 PyErr_SetString(PyExc_TypeError, 13383 "%c requires int or char"); 13384 return (Py_UCS4) -1; 13385} 13386 13387PyObject * 13388PyUnicode_Format(PyObject *format, PyObject *args) 13389{ 13390 Py_ssize_t fmtcnt, fmtpos, arglen, argidx; 13391 int args_owned = 0; 13392 PyObject *dict = NULL; 13393 PyObject *temp = NULL; 13394 PyObject *second = NULL; 13395 PyObject *uformat; 13396 void *fmt; 13397 enum PyUnicode_Kind kind, fmtkind; 13398 _PyUnicodeWriter writer; 13399 Py_ssize_t sublen; 13400 Py_UCS4 maxchar; 13401 13402 if (format == NULL || args == NULL) { 13403 PyErr_BadInternalCall(); 13404 return NULL; 13405 } 13406 uformat = PyUnicode_FromObject(format); 13407 if (uformat == NULL) 13408 return NULL; 13409 if (PyUnicode_READY(uformat) == -1) 13410 Py_DECREF(uformat); 13411 13412 fmt = PyUnicode_DATA(uformat); 13413 fmtkind = PyUnicode_KIND(uformat); 13414 fmtcnt = PyUnicode_GET_LENGTH(uformat); 13415 fmtpos = 0; 13416 13417 _PyUnicodeWriter_Init(&writer, fmtcnt + 
100); 13418 13419 if (PyTuple_Check(args)) { 13420 arglen = PyTuple_Size(args); 13421 argidx = 0; 13422 } 13423 else { 13424 arglen = -1; 13425 argidx = -2; 13426 } 13427 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 13428 !PyUnicode_Check(args)) 13429 dict = args; 13430 13431 while (--fmtcnt >= 0) { 13432 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13433 Py_ssize_t nonfmtpos; 13434 nonfmtpos = fmtpos++; 13435 while (fmtcnt >= 0 && 13436 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13437 fmtpos++; 13438 fmtcnt--; 13439 } 13440 if (fmtcnt < 0) 13441 fmtpos--; 13442 sublen = fmtpos - nonfmtpos; 13443 maxchar = _PyUnicode_FindMaxChar(uformat, 13444 nonfmtpos, nonfmtpos + sublen); 13445 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1) 13446 goto onError; 13447 13448 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos, 13449 uformat, nonfmtpos, sublen); 13450 writer.pos += sublen; 13451 } 13452 else { 13453 /* Got a format specifier */ 13454 int flags = 0; 13455 Py_ssize_t width = -1; 13456 int prec = -1; 13457 Py_UCS4 c = '\0'; 13458 Py_UCS4 fill; 13459 int sign; 13460 Py_UCS4 signchar; 13461 int isnumok; 13462 PyObject *v = NULL; 13463 void *pbuf = NULL; 13464 Py_ssize_t pindex, len; 13465 Py_UCS4 bufmaxchar; 13466 Py_ssize_t buflen; 13467 13468 fmtpos++; 13469 c = PyUnicode_READ(fmtkind, fmt, fmtpos); 13470 if (c == '(') { 13471 Py_ssize_t keystart; 13472 Py_ssize_t keylen; 13473 PyObject *key; 13474 int pcount = 1; 13475 13476 if (dict == NULL) { 13477 PyErr_SetString(PyExc_TypeError, 13478 "format requires a mapping"); 13479 goto onError; 13480 } 13481 ++fmtpos; 13482 --fmtcnt; 13483 keystart = fmtpos; 13484 /* Skip over balanced parentheses */ 13485 while (pcount > 0 && --fmtcnt >= 0) { 13486 c = PyUnicode_READ(fmtkind, fmt, fmtpos); 13487 if (c == ')') 13488 --pcount; 13489 else if (c == '(') 13490 ++pcount; 13491 fmtpos++; 13492 } 13493 keylen = fmtpos - keystart - 1; 13494 if (fmtcnt < 0 || pcount > 0) { 13495 
PyErr_SetString(PyExc_ValueError, 13496 "incomplete format key"); 13497 goto onError; 13498 } 13499 key = PyUnicode_Substring(uformat, 13500 keystart, keystart + keylen); 13501 if (key == NULL) 13502 goto onError; 13503 if (args_owned) { 13504 Py_DECREF(args); 13505 args_owned = 0; 13506 } 13507 args = PyObject_GetItem(dict, key); 13508 Py_DECREF(key); 13509 if (args == NULL) { 13510 goto onError; 13511 } 13512 args_owned = 1; 13513 arglen = -1; 13514 argidx = -2; 13515 } 13516 while (--fmtcnt >= 0) { 13517 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13518 switch (c) { 13519 case '-': flags |= F_LJUST; continue; 13520 case '+': flags |= F_SIGN; continue; 13521 case ' ': flags |= F_BLANK; continue; 13522 case '#': flags |= F_ALT; continue; 13523 case '0': flags |= F_ZERO; continue; 13524 } 13525 break; 13526 } 13527 if (c == '*') { 13528 v = getnextarg(args, arglen, &argidx); 13529 if (v == NULL) 13530 goto onError; 13531 if (!PyLong_Check(v)) { 13532 PyErr_SetString(PyExc_TypeError, 13533 "* wants int"); 13534 goto onError; 13535 } 13536 width = PyLong_AsLong(v); 13537 if (width == -1 && PyErr_Occurred()) 13538 goto onError; 13539 if (width < 0) { 13540 flags |= F_LJUST; 13541 width = -width; 13542 } 13543 if (--fmtcnt >= 0) 13544 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13545 } 13546 else if (c >= '0' && c <= '9') { 13547 width = c - '0'; 13548 while (--fmtcnt >= 0) { 13549 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13550 if (c < '0' || c > '9') 13551 break; 13552 /* Since c is unsigned, the RHS would end up as unsigned, 13553 mixing signed and unsigned comparison. Since c is between 13554 '0' and '9', casting to int is safe. 
*/ 13555 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) { 13556 PyErr_SetString(PyExc_ValueError, 13557 "width too big"); 13558 goto onError; 13559 } 13560 width = width*10 + (c - '0'); 13561 } 13562 } 13563 if (c == '.') { 13564 prec = 0; 13565 if (--fmtcnt >= 0) 13566 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13567 if (c == '*') { 13568 v = getnextarg(args, arglen, &argidx); 13569 if (v == NULL) 13570 goto onError; 13571 if (!PyLong_Check(v)) { 13572 PyErr_SetString(PyExc_TypeError, 13573 "* wants int"); 13574 goto onError; 13575 } 13576 prec = PyLong_AsLong(v); 13577 if (prec == -1 && PyErr_Occurred()) 13578 goto onError; 13579 if (prec < 0) 13580 prec = 0; 13581 if (--fmtcnt >= 0) 13582 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13583 } 13584 else if (c >= '0' && c <= '9') { 13585 prec = c - '0'; 13586 while (--fmtcnt >= 0) { 13587 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13588 if (c < '0' || c > '9') 13589 break; 13590 if (prec > (INT_MAX - ((int)c - '0')) / 10) { 13591 PyErr_SetString(PyExc_ValueError, 13592 "prec too big"); 13593 goto onError; 13594 } 13595 prec = prec*10 + (c - '0'); 13596 } 13597 } 13598 } /* prec */ 13599 if (fmtcnt >= 0) { 13600 if (c == 'h' || c == 'l' || c == 'L') { 13601 if (--fmtcnt >= 0) 13602 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13603 } 13604 } 13605 if (fmtcnt < 0) { 13606 PyErr_SetString(PyExc_ValueError, 13607 "incomplete format"); 13608 goto onError; 13609 } 13610 if (fmtcnt == 0) 13611 writer.overallocate = 0; 13612 13613 if (c == '%') { 13614 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1) 13615 goto onError; 13616 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%'); 13617 writer.pos += 1; 13618 continue; 13619 } 13620 13621 v = getnextarg(args, arglen, &argidx); 13622 if (v == NULL) 13623 goto onError; 13624 13625 sign = 0; 13626 signchar = '\0'; 13627 fill = ' '; 13628 switch (c) { 13629 13630 case 's': 13631 case 'r': 13632 case 'a': 13633 if (PyLong_CheckExact(v) && width == -1 && prec == -1) 
{ 13634 /* Fast path */ 13635 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1) 13636 goto onError; 13637 goto nextarg; 13638 } 13639 13640 if (PyUnicode_CheckExact(v) && c == 's') { 13641 temp = v; 13642 Py_INCREF(temp); 13643 } 13644 else { 13645 if (c == 's') 13646 temp = PyObject_Str(v); 13647 else if (c == 'r') 13648 temp = PyObject_Repr(v); 13649 else 13650 temp = PyObject_ASCII(v); 13651 } 13652 break; 13653 13654 case 'i': 13655 case 'd': 13656 case 'u': 13657 case 'o': 13658 case 'x': 13659 case 'X': 13660 if (PyLong_CheckExact(v) 13661 && width == -1 && prec == -1 13662 && !(flags & (F_SIGN | F_BLANK))) 13663 { 13664 /* Fast path */ 13665 switch(c) 13666 { 13667 case 'd': 13668 case 'i': 13669 case 'u': 13670 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1) 13671 goto onError; 13672 goto nextarg; 13673 case 'x': 13674 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1) 13675 goto onError; 13676 goto nextarg; 13677 case 'o': 13678 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1) 13679 goto onError; 13680 goto nextarg; 13681 default: 13682 break; 13683 } 13684 } 13685 13686 isnumok = 0; 13687 if (PyNumber_Check(v)) { 13688 PyObject *iobj=NULL; 13689 13690 if (PyLong_Check(v)) { 13691 iobj = v; 13692 Py_INCREF(iobj); 13693 } 13694 else { 13695 iobj = PyNumber_Long(v); 13696 } 13697 if (iobj!=NULL) { 13698 if (PyLong_Check(iobj)) { 13699 isnumok = 1; 13700 sign = 1; 13701 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 13702 Py_DECREF(iobj); 13703 } 13704 else { 13705 Py_DECREF(iobj); 13706 } 13707 } 13708 } 13709 if (!isnumok) { 13710 PyErr_Format(PyExc_TypeError, 13711 "%%%c format: a number is required, " 13712 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 13713 goto onError; 13714 } 13715 if (flags & F_ZERO) 13716 fill = '0'; 13717 break; 13718 13719 case 'e': 13720 case 'E': 13721 case 'f': 13722 case 'F': 13723 case 'g': 13724 case 'G': 13725 if (width == -1 && prec == -1 13726 && !(flags & (F_SIGN | F_BLANK))) 13727 { 13728 /* Fast path */ 13729 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1) 13730 goto onError; 13731 goto nextarg; 13732 } 13733 13734 sign = 1; 13735 if (flags & F_ZERO) 13736 fill = '0'; 13737 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1) 13738 temp = NULL; 13739 break; 13740 13741 case 'c': 13742 { 13743 Py_UCS4 ch = formatchar(v); 13744 if (ch == (Py_UCS4) -1) 13745 goto onError; 13746 if (width == -1 && prec == -1) { 13747 /* Fast path */ 13748 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) 13749 goto onError; 13750 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); 13751 writer.pos += 1; 13752 goto nextarg; 13753 } 13754 temp = PyUnicode_FromOrdinal(ch); 13755 break; 13756 } 13757 13758 default: 13759 PyErr_Format(PyExc_ValueError, 13760 "unsupported format character '%c' (0x%x) " 13761 "at index %zd", 13762 (31<=c && c<=126) ? 
(char)c : '?', 13763 (int)c, 13764 fmtpos - 1); 13765 goto onError; 13766 } 13767 if (temp == NULL) 13768 goto onError; 13769 assert (PyUnicode_Check(temp)); 13770 13771 if (width == -1 && prec == -1 13772 && !(flags & (F_SIGN | F_BLANK))) 13773 { 13774 /* Fast path */ 13775 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1) 13776 goto onError; 13777 goto nextarg; 13778 } 13779 13780 if (PyUnicode_READY(temp) == -1) { 13781 Py_CLEAR(temp); 13782 goto onError; 13783 } 13784 kind = PyUnicode_KIND(temp); 13785 pbuf = PyUnicode_DATA(temp); 13786 len = PyUnicode_GET_LENGTH(temp); 13787 13788 if (c == 's' || c == 'r' || c == 'a') { 13789 if (prec >= 0 && len > prec) 13790 len = prec; 13791 } 13792 13793 /* pbuf is initialized here. */ 13794 pindex = 0; 13795 if (sign) { 13796 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 13797 if (ch == '-' || ch == '+') { 13798 signchar = ch; 13799 len--; 13800 pindex++; 13801 } 13802 else if (flags & F_SIGN) 13803 signchar = '+'; 13804 else if (flags & F_BLANK) 13805 signchar = ' '; 13806 else 13807 sign = 0; 13808 } 13809 if (width < len) 13810 width = len; 13811 13812 /* Compute the length and maximum character of the 13813 written characters */ 13814 bufmaxchar = 127; 13815 if (!(flags & F_LJUST)) { 13816 if (sign) { 13817 if ((width-1) > len) 13818 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); 13819 } 13820 else { 13821 if (width > len) 13822 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); 13823 } 13824 } 13825 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len); 13826 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar); 13827 13828 buflen = width; 13829 if (sign && len == width) 13830 buflen++; 13831 13832 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1) 13833 goto onError; 13834 13835 /* Write characters */ 13836 if (sign) { 13837 if (fill != ' ') { 13838 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar); 13839 writer.pos += 1; 13840 } 13841 if (width > len) 13842 width--; 13843 } 13844 if ((flags & F_ALT) 
&& (c == 'x' || c == 'X' || c == 'o')) { 13845 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13846 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c); 13847 if (fill != ' ') { 13848 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0'); 13849 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c); 13850 writer.pos += 2; 13851 pindex += 2; 13852 } 13853 width -= 2; 13854 if (width < 0) 13855 width = 0; 13856 len -= 2; 13857 } 13858 if (width > len && !(flags & F_LJUST)) { 13859 sublen = width - len; 13860 FILL(writer.kind, writer.data, fill, writer.pos, sublen); 13861 writer.pos += sublen; 13862 width = len; 13863 } 13864 if (fill == ' ') { 13865 if (sign) { 13866 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar); 13867 writer.pos += 1; 13868 } 13869 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13870 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13871 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 13872 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0'); 13873 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c); 13874 writer.pos += 2; 13875 pindex += 2; 13876 } 13877 } 13878 13879 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos, 13880 temp, pindex, len); 13881 writer.pos += len; 13882 if (width > len) { 13883 sublen = width - len; 13884 FILL(writer.kind, writer.data, ' ', writer.pos, sublen); 13885 writer.pos += sublen; 13886 } 13887 13888nextarg: 13889 if (dict && (argidx < arglen) && c != '%') { 13890 PyErr_SetString(PyExc_TypeError, 13891 "not all arguments converted during string formatting"); 13892 goto onError; 13893 } 13894 Py_CLEAR(temp); 13895 } /* '%' */ 13896 } /* until end */ 13897 if (argidx < arglen && !dict) { 13898 PyErr_SetString(PyExc_TypeError, 13899 "not all arguments converted during string formatting"); 13900 goto onError; 13901 } 13902 13903 if (args_owned) { 13904 Py_DECREF(args); 13905 } 13906 Py_DECREF(uformat); 13907 Py_XDECREF(temp); 13908 Py_XDECREF(second); 
13909 return _PyUnicodeWriter_Finish(&writer); 13910 13911 onError: 13912 Py_DECREF(uformat); 13913 Py_XDECREF(temp); 13914 Py_XDECREF(second); 13915 _PyUnicodeWriter_Dealloc(&writer); 13916 if (args_owned) { 13917 Py_DECREF(args); 13918 } 13919 return NULL; 13920} 13921 13922static PyObject * 13923unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 13924 13925static PyObject * 13926unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13927{ 13928 PyObject *x = NULL; 13929 static char *kwlist[] = {"object", "encoding", "errors", 0}; 13930 char *encoding = NULL; 13931 char *errors = NULL; 13932 13933 if (type != &PyUnicode_Type) 13934 return unicode_subtype_new(type, args, kwds); 13935 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 13936 kwlist, &x, &encoding, &errors)) 13937 return NULL; 13938 if (x == NULL) { 13939 Py_INCREF(unicode_empty); 13940 return unicode_empty; 13941 } 13942 if (encoding == NULL && errors == NULL) 13943 return PyObject_Str(x); 13944 else 13945 return PyUnicode_FromEncodedObject(x, encoding, errors); 13946} 13947 13948static PyObject * 13949unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13950{ 13951 PyObject *unicode, *self; 13952 Py_ssize_t length, char_size; 13953 int share_wstr, share_utf8; 13954 unsigned int kind; 13955 void *data; 13956 13957 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 13958 13959 unicode = unicode_new(&PyUnicode_Type, args, kwds); 13960 if (unicode == NULL) 13961 return NULL; 13962 assert(_PyUnicode_CHECK(unicode)); 13963 if (PyUnicode_READY(unicode) == -1) { 13964 Py_DECREF(unicode); 13965 return NULL; 13966 } 13967 13968 self = type->tp_alloc(type, 0); 13969 if (self == NULL) { 13970 Py_DECREF(unicode); 13971 return NULL; 13972 } 13973 kind = PyUnicode_KIND(unicode); 13974 length = PyUnicode_GET_LENGTH(unicode); 13975 13976 _PyUnicode_LENGTH(self) = length; 13977#ifdef Py_DEBUG 13978 _PyUnicode_HASH(self) = -1; 13979#else 13980 
_PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13981#endif 13982 _PyUnicode_STATE(self).interned = 0; 13983 _PyUnicode_STATE(self).kind = kind; 13984 _PyUnicode_STATE(self).compact = 0; 13985 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 13986 _PyUnicode_STATE(self).ready = 1; 13987 _PyUnicode_WSTR(self) = NULL; 13988 _PyUnicode_UTF8_LENGTH(self) = 0; 13989 _PyUnicode_UTF8(self) = NULL; 13990 _PyUnicode_WSTR_LENGTH(self) = 0; 13991 _PyUnicode_DATA_ANY(self) = NULL; 13992 13993 share_utf8 = 0; 13994 share_wstr = 0; 13995 if (kind == PyUnicode_1BYTE_KIND) { 13996 char_size = 1; 13997 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 13998 share_utf8 = 1; 13999 } 14000 else if (kind == PyUnicode_2BYTE_KIND) { 14001 char_size = 2; 14002 if (sizeof(wchar_t) == 2) 14003 share_wstr = 1; 14004 } 14005 else { 14006 assert(kind == PyUnicode_4BYTE_KIND); 14007 char_size = 4; 14008 if (sizeof(wchar_t) == 4) 14009 share_wstr = 1; 14010 } 14011 14012 /* Ensure we won't overflow the length. 
*/ 14013 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14014 PyErr_NoMemory(); 14015 goto onError; 14016 } 14017 data = PyObject_MALLOC((length + 1) * char_size); 14018 if (data == NULL) { 14019 PyErr_NoMemory(); 14020 goto onError; 14021 } 14022 14023 _PyUnicode_DATA_ANY(self) = data; 14024 if (share_utf8) { 14025 _PyUnicode_UTF8_LENGTH(self) = length; 14026 _PyUnicode_UTF8(self) = data; 14027 } 14028 if (share_wstr) { 14029 _PyUnicode_WSTR_LENGTH(self) = length; 14030 _PyUnicode_WSTR(self) = (wchar_t *)data; 14031 } 14032 14033 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14034 kind * (length + 1)); 14035 assert(_PyUnicode_CheckConsistency(self, 1)); 14036#ifdef Py_DEBUG 14037 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14038#endif 14039 Py_DECREF(unicode); 14040 return self; 14041 14042onError: 14043 Py_DECREF(unicode); 14044 Py_DECREF(self); 14045 return NULL; 14046} 14047 14048PyDoc_STRVAR(unicode_doc, 14049 "str(string[, encoding[, errors]]) -> str\n\ 14050\n\ 14051Create a new string object from the given encoded string.\n\ 14052encoding defaults to the current default string encoding.\n\ 14053errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 14054 14055static PyObject *unicode_iter(PyObject *seq); 14056 14057PyTypeObject PyUnicode_Type = { 14058 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14059 "str", /* tp_name */ 14060 sizeof(PyUnicodeObject), /* tp_size */ 14061 0, /* tp_itemsize */ 14062 /* Slots */ 14063 (destructor)unicode_dealloc, /* tp_dealloc */ 14064 0, /* tp_print */ 14065 0, /* tp_getattr */ 14066 0, /* tp_setattr */ 14067 0, /* tp_reserved */ 14068 unicode_repr, /* tp_repr */ 14069 &unicode_as_number, /* tp_as_number */ 14070 &unicode_as_sequence, /* tp_as_sequence */ 14071 &unicode_as_mapping, /* tp_as_mapping */ 14072 (hashfunc) unicode_hash, /* tp_hash*/ 14073 0, /* tp_call*/ 14074 (reprfunc) unicode_str, /* tp_str */ 14075 PyObject_GenericGetAttr, /* tp_getattro */ 14076 0, /* tp_setattro */ 14077 0, /* tp_as_buffer 
*/ 14078 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14079 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14080 unicode_doc, /* tp_doc */ 14081 0, /* tp_traverse */ 14082 0, /* tp_clear */ 14083 PyUnicode_RichCompare, /* tp_richcompare */ 14084 0, /* tp_weaklistoffset */ 14085 unicode_iter, /* tp_iter */ 14086 0, /* tp_iternext */ 14087 unicode_methods, /* tp_methods */ 14088 0, /* tp_members */ 14089 0, /* tp_getset */ 14090 &PyBaseObject_Type, /* tp_base */ 14091 0, /* tp_dict */ 14092 0, /* tp_descr_get */ 14093 0, /* tp_descr_set */ 14094 0, /* tp_dictoffset */ 14095 0, /* tp_init */ 14096 0, /* tp_alloc */ 14097 unicode_new, /* tp_new */ 14098 PyObject_Del, /* tp_free */ 14099}; 14100 14101/* Initialize the Unicode implementation */ 14102 14103int _PyUnicode_Init(void) 14104{ 14105 int i; 14106 14107 /* XXX - move this array to unicodectype.c ? */ 14108 Py_UCS2 linebreak[] = { 14109 0x000A, /* LINE FEED */ 14110 0x000D, /* CARRIAGE RETURN */ 14111 0x001C, /* FILE SEPARATOR */ 14112 0x001D, /* GROUP SEPARATOR */ 14113 0x001E, /* RECORD SEPARATOR */ 14114 0x0085, /* NEXT LINE */ 14115 0x2028, /* LINE SEPARATOR */ 14116 0x2029, /* PARAGRAPH SEPARATOR */ 14117 }; 14118 14119 /* Init the implementation */ 14120 unicode_empty = PyUnicode_New(0, 0); 14121 if (!unicode_empty) 14122 Py_FatalError("Can't create empty string"); 14123 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); 14124 14125 for (i = 0; i < 256; i++) 14126 unicode_latin1[i] = NULL; 14127 if (PyType_Ready(&PyUnicode_Type) < 0) 14128 Py_FatalError("Can't initialize 'unicode'"); 14129 14130 /* initialize the linebreak bloom filter */ 14131 bloom_linebreak = make_bloom_mask( 14132 PyUnicode_2BYTE_KIND, linebreak, 14133 Py_ARRAY_LENGTH(linebreak)); 14134 14135 PyType_Ready(&EncodingMapType); 14136 14137#ifdef HAVE_MBCS 14138 winver.dwOSVersionInfoSize = sizeof(winver); 14139 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 14140 PyErr_SetFromWindowsErr(0); 14141 return -1; 14142 } 14143#endif 14144 return 0; 
14145} 14146 14147/* Finalize the Unicode implementation */ 14148 14149int 14150PyUnicode_ClearFreeList(void) 14151{ 14152 return 0; 14153} 14154 14155void 14156_PyUnicode_Fini(void) 14157{ 14158 int i; 14159 14160 Py_XDECREF(unicode_empty); 14161 unicode_empty = NULL; 14162 14163 for (i = 0; i < 256; i++) { 14164 if (unicode_latin1[i]) { 14165 Py_DECREF(unicode_latin1[i]); 14166 unicode_latin1[i] = NULL; 14167 } 14168 } 14169 _PyUnicode_ClearStaticStrings(); 14170 (void)PyUnicode_ClearFreeList(); 14171} 14172 14173void 14174PyUnicode_InternInPlace(PyObject **p) 14175{ 14176 register PyObject *s = *p; 14177 PyObject *t; 14178#ifdef Py_DEBUG 14179 assert(s != NULL); 14180 assert(_PyUnicode_CHECK(s)); 14181#else 14182 if (s == NULL || !PyUnicode_Check(s)) 14183 return; 14184#endif 14185 /* If it's a subclass, we don't really know what putting 14186 it in the interned dict might do. */ 14187 if (!PyUnicode_CheckExact(s)) 14188 return; 14189 if (PyUnicode_CHECK_INTERNED(s)) 14190 return; 14191 if (interned == NULL) { 14192 interned = PyDict_New(); 14193 if (interned == NULL) { 14194 PyErr_Clear(); /* Don't leave an exception */ 14195 return; 14196 } 14197 } 14198 /* It might be that the GetItem call fails even 14199 though the key is present in the dictionary, 14200 namely when this happens during a stack overflow. */ 14201 Py_ALLOW_RECURSION 14202 t = PyDict_GetItem(interned, s); 14203 Py_END_ALLOW_RECURSION 14204 14205 if (t) { 14206 Py_INCREF(t); 14207 Py_DECREF(*p); 14208 *p = t; 14209 return; 14210 } 14211 14212 PyThreadState_GET()->recursion_critical = 1; 14213 if (PyDict_SetItem(interned, s, s) < 0) { 14214 PyErr_Clear(); 14215 PyThreadState_GET()->recursion_critical = 0; 14216 return; 14217 } 14218 PyThreadState_GET()->recursion_critical = 0; 14219 /* The two references in interned are not counted by refcnt. 
14220 The deallocator will take care of this */ 14221 Py_REFCNT(s) -= 2; 14222 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 14223} 14224 14225void 14226PyUnicode_InternImmortal(PyObject **p) 14227{ 14228 PyUnicode_InternInPlace(p); 14229 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 14230 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 14231 Py_INCREF(*p); 14232 } 14233} 14234 14235PyObject * 14236PyUnicode_InternFromString(const char *cp) 14237{ 14238 PyObject *s = PyUnicode_FromString(cp); 14239 if (s == NULL) 14240 return NULL; 14241 PyUnicode_InternInPlace(&s); 14242 return s; 14243} 14244 14245void 14246_Py_ReleaseInternedUnicodeStrings(void) 14247{ 14248 PyObject *keys; 14249 PyObject *s; 14250 Py_ssize_t i, n; 14251 Py_ssize_t immortal_size = 0, mortal_size = 0; 14252 14253 if (interned == NULL || !PyDict_Check(interned)) 14254 return; 14255 keys = PyDict_Keys(interned); 14256 if (keys == NULL || !PyList_Check(keys)) { 14257 PyErr_Clear(); 14258 return; 14259 } 14260 14261 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 14262 detector, interned unicode strings are not forcibly deallocated; 14263 rather, we give them their stolen references back, and then clear 14264 and DECREF the interned dict. 
*/ 14265 14266 n = PyList_GET_SIZE(keys); 14267 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 14268 n); 14269 for (i = 0; i < n; i++) { 14270 s = PyList_GET_ITEM(keys, i); 14271 if (PyUnicode_READY(s) == -1) { 14272 assert(0 && "could not ready string"); 14273 fprintf(stderr, "could not ready string\n"); 14274 } 14275 switch (PyUnicode_CHECK_INTERNED(s)) { 14276 case SSTATE_NOT_INTERNED: 14277 /* XXX Shouldn't happen */ 14278 break; 14279 case SSTATE_INTERNED_IMMORTAL: 14280 Py_REFCNT(s) += 1; 14281 immortal_size += PyUnicode_GET_LENGTH(s); 14282 break; 14283 case SSTATE_INTERNED_MORTAL: 14284 Py_REFCNT(s) += 2; 14285 mortal_size += PyUnicode_GET_LENGTH(s); 14286 break; 14287 default: 14288 Py_FatalError("Inconsistent interned string state."); 14289 } 14290 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 14291 } 14292 fprintf(stderr, "total size of all interned strings: " 14293 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 14294 "mortal/immortal\n", mortal_size, immortal_size); 14295 Py_DECREF(keys); 14296 PyDict_Clear(interned); 14297 Py_DECREF(interned); 14298 interned = NULL; 14299} 14300 14301 14302/********************* Unicode Iterator **************************/ 14303 14304typedef struct { 14305 PyObject_HEAD 14306 Py_ssize_t it_index; 14307 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 14308} unicodeiterobject; 14309 14310static void 14311unicodeiter_dealloc(unicodeiterobject *it) 14312{ 14313 _PyObject_GC_UNTRACK(it); 14314 Py_XDECREF(it->it_seq); 14315 PyObject_GC_Del(it); 14316} 14317 14318static int 14319unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 14320{ 14321 Py_VISIT(it->it_seq); 14322 return 0; 14323} 14324 14325static PyObject * 14326unicodeiter_next(unicodeiterobject *it) 14327{ 14328 PyObject *seq, *item; 14329 14330 assert(it != NULL); 14331 seq = it->it_seq; 14332 if (seq == NULL) 14333 return NULL; 14334 assert(_PyUnicode_CHECK(seq)); 14335 14336 if (it->it_index < 
PyUnicode_GET_LENGTH(seq)) { 14337 int kind = PyUnicode_KIND(seq); 14338 void *data = PyUnicode_DATA(seq); 14339 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 14340 item = PyUnicode_FromOrdinal(chr); 14341 if (item != NULL) 14342 ++it->it_index; 14343 return item; 14344 } 14345 14346 Py_DECREF(seq); 14347 it->it_seq = NULL; 14348 return NULL; 14349} 14350 14351static PyObject * 14352unicodeiter_len(unicodeiterobject *it) 14353{ 14354 Py_ssize_t len = 0; 14355 if (it->it_seq) 14356 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 14357 return PyLong_FromSsize_t(len); 14358} 14359 14360PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 14361 14362static PyObject * 14363unicodeiter_reduce(unicodeiterobject *it) 14364{ 14365 if (it->it_seq != NULL) { 14366 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 14367 it->it_seq, it->it_index); 14368 } else { 14369 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 14370 if (u == NULL) 14371 return NULL; 14372 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 14373 } 14374} 14375 14376PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 14377 14378static PyObject * 14379unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 14380{ 14381 Py_ssize_t index = PyLong_AsSsize_t(state); 14382 if (index == -1 && PyErr_Occurred()) 14383 return NULL; 14384 if (index < 0) 14385 index = 0; 14386 it->it_index = index; 14387 Py_RETURN_NONE; 14388} 14389 14390PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 14391 14392static PyMethodDef unicodeiter_methods[] = { 14393 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 14394 length_hint_doc}, 14395 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 14396 reduce_doc}, 14397 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 14398 setstate_doc}, 14399 {NULL, NULL} /* sentinel */ 14400}; 14401 14402PyTypeObject PyUnicodeIter_Type = { 14403 
PyVarObject_HEAD_INIT(&PyType_Type, 0) 14404 "str_iterator", /* tp_name */ 14405 sizeof(unicodeiterobject), /* tp_basicsize */ 14406 0, /* tp_itemsize */ 14407 /* methods */ 14408 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 14409 0, /* tp_print */ 14410 0, /* tp_getattr */ 14411 0, /* tp_setattr */ 14412 0, /* tp_reserved */ 14413 0, /* tp_repr */ 14414 0, /* tp_as_number */ 14415 0, /* tp_as_sequence */ 14416 0, /* tp_as_mapping */ 14417 0, /* tp_hash */ 14418 0, /* tp_call */ 14419 0, /* tp_str */ 14420 PyObject_GenericGetAttr, /* tp_getattro */ 14421 0, /* tp_setattro */ 14422 0, /* tp_as_buffer */ 14423 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 14424 0, /* tp_doc */ 14425 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 14426 0, /* tp_clear */ 14427 0, /* tp_richcompare */ 14428 0, /* tp_weaklistoffset */ 14429 PyObject_SelfIter, /* tp_iter */ 14430 (iternextfunc)unicodeiter_next, /* tp_iternext */ 14431 unicodeiter_methods, /* tp_methods */ 14432 0, 14433}; 14434 14435static PyObject * 14436unicode_iter(PyObject *seq) 14437{ 14438 unicodeiterobject *it; 14439 14440 if (!PyUnicode_Check(seq)) { 14441 PyErr_BadInternalCall(); 14442 return NULL; 14443 } 14444 if (PyUnicode_READY(seq) == -1) 14445 return NULL; 14446 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 14447 if (it == NULL) 14448 return NULL; 14449 it->it_index = 0; 14450 Py_INCREF(seq); 14451 it->it_seq = seq; 14452 _PyObject_GC_TRACK(it); 14453 return (PyObject *)it; 14454} 14455 14456 14457size_t 14458Py_UNICODE_strlen(const Py_UNICODE *u) 14459{ 14460 int res = 0; 14461 while(*u++) 14462 res++; 14463 return res; 14464} 14465 14466Py_UNICODE* 14467Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 14468{ 14469 Py_UNICODE *u = s1; 14470 while ((*u++ = *s2++)); 14471 return s1; 14472} 14473 14474Py_UNICODE* 14475Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14476{ 14477 Py_UNICODE *u = s1; 14478 while ((*u++ = *s2++)) 14479 if (n-- == 0) 
14480 break; 14481 return s1; 14482} 14483 14484Py_UNICODE* 14485Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 14486{ 14487 Py_UNICODE *u1 = s1; 14488 u1 += Py_UNICODE_strlen(u1); 14489 Py_UNICODE_strcpy(u1, s2); 14490 return s1; 14491} 14492 14493int 14494Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 14495{ 14496 while (*s1 && *s2 && *s1 == *s2) 14497 s1++, s2++; 14498 if (*s1 && *s2) 14499 return (*s1 < *s2) ? -1 : +1; 14500 if (*s1) 14501 return 1; 14502 if (*s2) 14503 return -1; 14504 return 0; 14505} 14506 14507int 14508Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14509{ 14510 register Py_UNICODE u1, u2; 14511 for (; n != 0; n--) { 14512 u1 = *s1; 14513 u2 = *s2; 14514 if (u1 != u2) 14515 return (u1 < u2) ? -1 : +1; 14516 if (u1 == '\0') 14517 return 0; 14518 s1++; 14519 s2++; 14520 } 14521 return 0; 14522} 14523 14524Py_UNICODE* 14525Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 14526{ 14527 const Py_UNICODE *p; 14528 for (p = s; *p; p++) 14529 if (*p == c) 14530 return (Py_UNICODE*)p; 14531 return NULL; 14532} 14533 14534Py_UNICODE* 14535Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 14536{ 14537 const Py_UNICODE *p; 14538 p = s + Py_UNICODE_strlen(s); 14539 while (p != s) { 14540 p--; 14541 if (*p == c) 14542 return (Py_UNICODE*)p; 14543 } 14544 return NULL; 14545} 14546 14547Py_UNICODE* 14548PyUnicode_AsUnicodeCopy(PyObject *unicode) 14549{ 14550 Py_UNICODE *u, *copy; 14551 Py_ssize_t len, size; 14552 14553 if (!PyUnicode_Check(unicode)) { 14554 PyErr_BadArgument(); 14555 return NULL; 14556 } 14557 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 14558 if (u == NULL) 14559 return NULL; 14560 /* Ensure we won't overflow the size. 
*/ 14561 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 14562 PyErr_NoMemory(); 14563 return NULL; 14564 } 14565 size = len + 1; /* copy the null character */ 14566 size *= sizeof(Py_UNICODE); 14567 copy = PyMem_Malloc(size); 14568 if (copy == NULL) { 14569 PyErr_NoMemory(); 14570 return NULL; 14571 } 14572 memcpy(copy, u, size); 14573 return copy; 14574} 14575 14576/* A _string module, to export formatter_parser and formatter_field_name_split 14577 to the string.Formatter class implemented in Python. */ 14578 14579static PyMethodDef _string_methods[] = { 14580 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 14581 METH_O, PyDoc_STR("split the argument as a field name")}, 14582 {"formatter_parser", (PyCFunction) formatter_parser, 14583 METH_O, PyDoc_STR("parse the argument as a format string")}, 14584 {NULL, NULL} 14585}; 14586 14587static struct PyModuleDef _string_module = { 14588 PyModuleDef_HEAD_INIT, 14589 "_string", 14590 PyDoc_STR("string helper module"), 14591 0, 14592 _string_methods, 14593 NULL, 14594 NULL, 14595 NULL, 14596 NULL 14597}; 14598 14599PyMODINIT_FUNC 14600PyInit__string(void) 14601{ 14602 return PyModule_Create(&_string_module); 14603} 14604 14605 14606#ifdef __cplusplus 14607} 14608#endif 14609