codecs.c revision 26861b0b29fdf64fba8cd120183408495f2c80e2
1/* ------------------------------------------------------------------------ 2 3 Python Codec Registry and support functions 4 5Written by Marc-Andre Lemburg (mal@lemburg.com). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9 ------------------------------------------------------------------------ */ 10 11#include "Python.h" 12#include "ucnhash.h" 13#include <ctype.h> 14 15const char *Py_hexdigits = "0123456789abcdef"; 16 17/* --- Codec Registry ----------------------------------------------------- */ 18 19/* Import the standard encodings package which will register the first 20 codec search function. 21 22 This is done in a lazy way so that the Unicode implementation does 23 not downgrade startup time of scripts not needing it. 24 25 ImportErrors are silently ignored by this function. Only one try is 26 made. 27 28*/ 29 30static int _PyCodecRegistry_Init(void); /* Forward */ 31 32int PyCodec_Register(PyObject *search_function) 33{ 34 PyInterpreterState *interp = PyThreadState_GET()->interp; 35 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 36 goto onError; 37 if (search_function == NULL) { 38 PyErr_BadArgument(); 39 goto onError; 40 } 41 if (!PyCallable_Check(search_function)) { 42 PyErr_SetString(PyExc_TypeError, "argument must be callable"); 43 goto onError; 44 } 45 return PyList_Append(interp->codec_search_path, search_function); 46 47 onError: 48 return -1; 49} 50 51/* Convert a string to a normalized Python string: all characters are 52 converted to lower case, spaces are replaced with underscores. */ 53 54static 55PyObject *normalizestring(const char *string) 56{ 57 size_t i; 58 size_t len = strlen(string); 59 char *p; 60 PyObject *v; 61 62 if (len > PY_SSIZE_T_MAX) { 63 PyErr_SetString(PyExc_OverflowError, "string is too large"); 64 return NULL; 65 } 66 67 p = PyMem_Malloc(len + 1); 68 if (p == NULL) 69 return PyErr_NoMemory(); 70 for (i = 0; i < len; i++) { 71 char ch = string[i]; 72 if (ch == ' ') 73 ch = '-'; 74 else 75 ch = Py_TOLOWER(Py_CHARMASK(ch)); 76 p[i] = ch; 77 } 78 p[i] = '\0'; 79 v = PyUnicode_FromString(p); 80 if (v == NULL) 81 return NULL; 82 PyMem_Free(p); 83 return v; 84} 85 86/* Lookup the given encoding and return a tuple providing the codec 87 facilities. 88 89 The encoding string is looked up converted to all lower-case 90 characters. This makes encodings looked up through this mechanism 91 effectively case-insensitive. 92 93 If no codec is found, a LookupError is set and NULL returned. 94 95 As side effect, this tries to load the encodings package, if not 96 yet done. This is part of the lazy load strategy for the encodings 97 package. 98 99*/ 100 101PyObject *_PyCodec_Lookup(const char *encoding) 102{ 103 PyInterpreterState *interp; 104 PyObject *result, *args = NULL, *v; 105 Py_ssize_t i, len; 106 107 if (encoding == NULL) { 108 PyErr_BadArgument(); 109 goto onError; 110 } 111 112 interp = PyThreadState_GET()->interp; 113 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 114 goto onError; 115 116 /* Convert the encoding to a normalized Python string: all 117 characters are converted to lower case, spaces and hyphens are 118 replaced with underscores. */ 119 v = normalizestring(encoding); 120 if (v == NULL) 121 goto onError; 122 PyUnicode_InternInPlace(&v); 123 124 /* First, try to lookup the name in the registry dictionary */ 125 result = PyDict_GetItem(interp->codec_search_cache, v); 126 if (result != NULL) { 127 Py_INCREF(result); 128 Py_DECREF(v); 129 return result; 130 } 131 132 /* Next, scan the search functions in order of registration */ 133 args = PyTuple_New(1); 134 if (args == NULL) 135 goto onError; 136 PyTuple_SET_ITEM(args,0,v); 137 138 len = PyList_Size(interp->codec_search_path); 139 if (len < 0) 140 goto onError; 141 if (len == 0) { 142 PyErr_SetString(PyExc_LookupError, 143 "no codec search functions registered: " 144 "can't find encoding"); 145 goto onError; 146 } 147 148 for (i = 0; i < len; i++) { 149 PyObject *func; 150 151 func = PyList_GetItem(interp->codec_search_path, i); 152 if (func == NULL) 153 goto onError; 154 result = PyEval_CallObject(func, args); 155 if (result == NULL) 156 goto onError; 157 if (result == Py_None) { 158 Py_DECREF(result); 159 continue; 160 } 161 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { 162 PyErr_SetString(PyExc_TypeError, 163 "codec search functions must return 4-tuples"); 164 Py_DECREF(result); 165 goto onError; 166 } 167 break; 168 } 169 if (i == len) { 170 /* XXX Perhaps we should cache misses too ? */ 171 PyErr_Format(PyExc_LookupError, 172 "unknown encoding: %s", encoding); 173 goto onError; 174 } 175 176 /* Cache and return the result */ 177 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) { 178 Py_DECREF(result); 179 goto onError; 180 } 181 Py_DECREF(args); 182 return result; 183 184 onError: 185 Py_XDECREF(args); 186 return NULL; 187} 188 189int _PyCodec_Forget(const char *encoding) 190{ 191 PyInterpreterState *interp; 192 PyObject *v; 193 int result; 194 195 interp = PyThreadState_GET()->interp; 196 if (interp->codec_search_path == NULL) { 197 return -1; 198 } 199 200 /* Convert the encoding to a normalized Python string: all 201 characters are converted to lower case, spaces and hyphens are 202 replaced with underscores. */ 203 v = normalizestring(encoding); 204 if (v == NULL) { 205 return -1; 206 } 207 208 /* Drop the named codec from the internal cache */ 209 result = PyDict_DelItem(interp->codec_search_cache, v); 210 Py_DECREF(v); 211 212 return result; 213} 214 215/* Codec registry encoding check API. */ 216 217int PyCodec_KnownEncoding(const char *encoding) 218{ 219 PyObject *codecs; 220 221 codecs = _PyCodec_Lookup(encoding); 222 if (!codecs) { 223 PyErr_Clear(); 224 return 0; 225 } 226 else { 227 Py_DECREF(codecs); 228 return 1; 229 } 230} 231 232static 233PyObject *args_tuple(PyObject *object, 234 const char *errors) 235{ 236 PyObject *args; 237 238 args = PyTuple_New(1 + (errors != NULL)); 239 if (args == NULL) 240 return NULL; 241 Py_INCREF(object); 242 PyTuple_SET_ITEM(args,0,object); 243 if (errors) { 244 PyObject *v; 245 246 v = PyUnicode_FromString(errors); 247 if (v == NULL) { 248 Py_DECREF(args); 249 return NULL; 250 } 251 PyTuple_SET_ITEM(args, 1, v); 252 } 253 return args; 254} 255 256/* Helper function to get a codec item */ 257 258static 259PyObject *codec_getitem(const char *encoding, int index) 260{ 261 PyObject *codecs; 262 PyObject *v; 263 264 codecs = _PyCodec_Lookup(encoding); 265 if (codecs == NULL) 266 return NULL; 267 v = PyTuple_GET_ITEM(codecs, index); 268 Py_DECREF(codecs); 269 Py_INCREF(v); 270 return v; 271} 272 273/* Helper functions to create an incremental codec. */ 274static 275PyObject *codec_makeincrementalcodec(PyObject *codec_info, 276 const char *errors, 277 const char *attrname) 278{ 279 PyObject *ret, *inccodec; 280 281 inccodec = PyObject_GetAttrString(codec_info, attrname); 282 if (inccodec == NULL) 283 return NULL; 284 if (errors) 285 ret = PyObject_CallFunction(inccodec, "s", errors); 286 else 287 ret = PyObject_CallFunction(inccodec, NULL); 288 Py_DECREF(inccodec); 289 return ret; 290} 291 292static 293PyObject *codec_getincrementalcodec(const char *encoding, 294 const char *errors, 295 const char *attrname) 296{ 297 PyObject *codec_info, *ret; 298 299 codec_info = _PyCodec_Lookup(encoding); 300 if (codec_info == NULL) 301 return NULL; 302 ret = codec_makeincrementalcodec(codec_info, errors, attrname); 303 Py_DECREF(codec_info); 304 return ret; 305} 306 307/* Helper function to create a stream codec. */ 308 309static 310PyObject *codec_getstreamcodec(const char *encoding, 311 PyObject *stream, 312 const char *errors, 313 const int index) 314{ 315 PyObject *codecs, *streamcodec, *codeccls; 316 317 codecs = _PyCodec_Lookup(encoding); 318 if (codecs == NULL) 319 return NULL; 320 321 codeccls = PyTuple_GET_ITEM(codecs, index); 322 if (errors != NULL) 323 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors); 324 else 325 streamcodec = PyObject_CallFunction(codeccls, "O", stream); 326 Py_DECREF(codecs); 327 return streamcodec; 328} 329 330/* Helpers to work with the result of _PyCodec_Lookup 331 332 */ 333PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, 334 const char *errors) 335{ 336 return codec_makeincrementalcodec(codec_info, errors, 337 "incrementaldecoder"); 338} 339 340PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, 341 const char *errors) 342{ 343 return codec_makeincrementalcodec(codec_info, errors, 344 "incrementalencoder"); 345} 346 347 348/* Convenience APIs to query the Codec registry. 349 350 All APIs return a codec object with incremented refcount. 351 352 */ 353 354PyObject *PyCodec_Encoder(const char *encoding) 355{ 356 return codec_getitem(encoding, 0); 357} 358 359PyObject *PyCodec_Decoder(const char *encoding) 360{ 361 return codec_getitem(encoding, 1); 362} 363 364PyObject *PyCodec_IncrementalEncoder(const char *encoding, 365 const char *errors) 366{ 367 return codec_getincrementalcodec(encoding, errors, "incrementalencoder"); 368} 369 370PyObject *PyCodec_IncrementalDecoder(const char *encoding, 371 const char *errors) 372{ 373 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder"); 374} 375 376PyObject *PyCodec_StreamReader(const char *encoding, 377 PyObject *stream, 378 const char *errors) 379{ 380 return codec_getstreamcodec(encoding, stream, errors, 2); 381} 382 383PyObject *PyCodec_StreamWriter(const char *encoding, 384 PyObject *stream, 385 const char *errors) 386{ 387 return codec_getstreamcodec(encoding, stream, errors, 3); 388} 389 390/* Helper that tries to ensure the reported exception chain indicates the 391 * codec that was invoked to trigger the failure without changing the type 392 * of the exception raised. 393 */ 394static void 395wrap_codec_error(const char *operation, 396 const char *encoding) 397{ 398 /* TrySetFromCause will replace the active exception with a suitably 399 * updated clone if it can, otherwise it will leave the original 400 * exception alone. 401 */ 402 _PyErr_TrySetFromCause("%s with '%s' codec failed", 403 operation, encoding); 404} 405 406/* Encode an object (e.g. an Unicode object) using the given encoding 407 and return the resulting encoded object (usually a Python string). 408 409 errors is passed to the encoder factory as argument if non-NULL. */ 410 411static PyObject * 412_PyCodec_EncodeInternal(PyObject *object, 413 PyObject *encoder, 414 const char *encoding, 415 const char *errors) 416{ 417 PyObject *args = NULL, *result = NULL; 418 PyObject *v = NULL; 419 420 args = args_tuple(object, errors); 421 if (args == NULL) 422 goto onError; 423 424 result = PyEval_CallObject(encoder, args); 425 if (result == NULL) { 426 wrap_codec_error("encoding", encoding); 427 goto onError; 428 } 429 430 if (!PyTuple_Check(result) || 431 PyTuple_GET_SIZE(result) != 2) { 432 PyErr_SetString(PyExc_TypeError, 433 "encoder must return a tuple (object, integer)"); 434 goto onError; 435 } 436 v = PyTuple_GET_ITEM(result,0); 437 Py_INCREF(v); 438 /* We don't check or use the second (integer) entry. */ 439 440 Py_DECREF(args); 441 Py_DECREF(encoder); 442 Py_DECREF(result); 443 return v; 444 445 onError: 446 Py_XDECREF(result); 447 Py_XDECREF(args); 448 Py_XDECREF(encoder); 449 return NULL; 450} 451 452/* Decode an object (usually a Python string) using the given encoding 453 and return an equivalent object (e.g. an Unicode object). 454 455 errors is passed to the decoder factory as argument if non-NULL. */ 456 457static PyObject * 458_PyCodec_DecodeInternal(PyObject *object, 459 PyObject *decoder, 460 const char *encoding, 461 const char *errors) 462{ 463 PyObject *args = NULL, *result = NULL; 464 PyObject *v; 465 466 args = args_tuple(object, errors); 467 if (args == NULL) 468 goto onError; 469 470 result = PyEval_CallObject(decoder,args); 471 if (result == NULL) { 472 wrap_codec_error("decoding", encoding); 473 goto onError; 474 } 475 if (!PyTuple_Check(result) || 476 PyTuple_GET_SIZE(result) != 2) { 477 PyErr_SetString(PyExc_TypeError, 478 "decoder must return a tuple (object,integer)"); 479 goto onError; 480 } 481 v = PyTuple_GET_ITEM(result,0); 482 Py_INCREF(v); 483 /* We don't check or use the second (integer) entry. */ 484 485 Py_DECREF(args); 486 Py_DECREF(decoder); 487 Py_DECREF(result); 488 return v; 489 490 onError: 491 Py_XDECREF(args); 492 Py_XDECREF(decoder); 493 Py_XDECREF(result); 494 return NULL; 495} 496 497/* Generic encoding/decoding API */ 498PyObject *PyCodec_Encode(PyObject *object, 499 const char *encoding, 500 const char *errors) 501{ 502 PyObject *encoder; 503 504 encoder = PyCodec_Encoder(encoding); 505 if (encoder == NULL) 506 return NULL; 507 508 return _PyCodec_EncodeInternal(object, encoder, encoding, errors); 509} 510 511PyObject *PyCodec_Decode(PyObject *object, 512 const char *encoding, 513 const char *errors) 514{ 515 PyObject *decoder; 516 517 decoder = PyCodec_Decoder(encoding); 518 if (decoder == NULL) 519 return NULL; 520 521 return _PyCodec_DecodeInternal(object, decoder, encoding, errors); 522} 523 524/* Text encoding/decoding API */ 525PyObject * _PyCodec_LookupTextEncoding(const char *encoding, 526 const char *alternate_command) 527{ 528 _Py_IDENTIFIER(_is_text_encoding); 529 PyObject *codec; 530 PyObject *attr; 531 int is_text_codec; 532 533 codec = _PyCodec_Lookup(encoding); 534 if (codec == NULL) 535 return NULL; 536 537 /* Backwards compatibility: assume any raw tuple describes a text 538 * encoding, and the same for anything lacking the private 539 * attribute. 540 */ 541 if (!PyTuple_CheckExact(codec)) { 542 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding); 543 if (attr == NULL) { 544 if (PyErr_ExceptionMatches(PyExc_AttributeError)) { 545 PyErr_Clear(); 546 } else { 547 Py_DECREF(codec); 548 return NULL; 549 } 550 } else { 551 is_text_codec = PyObject_IsTrue(attr); 552 Py_DECREF(attr); 553 if (!is_text_codec) { 554 Py_DECREF(codec); 555 PyErr_Format(PyExc_LookupError, 556 "'%.400s' is not a text encoding; " 557 "use %s to handle arbitrary codecs", 558 encoding, alternate_command); 559 return NULL; 560 } 561 } 562 } 563 564 /* This appears to be a valid text encoding */ 565 return codec; 566} 567 568 569static 570PyObject *codec_getitem_checked(const char *encoding, 571 const char *alternate_command, 572 int index) 573{ 574 PyObject *codec; 575 PyObject *v; 576 577 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); 578 if (codec == NULL) 579 return NULL; 580 581 v = PyTuple_GET_ITEM(codec, index); 582 Py_INCREF(v); 583 Py_DECREF(codec); 584 return v; 585} 586 587static PyObject * _PyCodec_TextEncoder(const char *encoding) 588{ 589 return codec_getitem_checked(encoding, "codecs.encode()", 0); 590} 591 592static PyObject * _PyCodec_TextDecoder(const char *encoding) 593{ 594 return codec_getitem_checked(encoding, "codecs.decode()", 1); 595} 596 597PyObject *_PyCodec_EncodeText(PyObject *object, 598 const char *encoding, 599 const char *errors) 600{ 601 PyObject *encoder; 602 603 encoder = _PyCodec_TextEncoder(encoding); 604 if (encoder == NULL) 605 return NULL; 606 607 return _PyCodec_EncodeInternal(object, encoder, encoding, errors); 608} 609 610PyObject *_PyCodec_DecodeText(PyObject *object, 611 const char *encoding, 612 const char *errors) 613{ 614 PyObject *decoder; 615 616 decoder = _PyCodec_TextDecoder(encoding); 617 if (decoder == NULL) 618 return NULL; 619 620 return _PyCodec_DecodeInternal(object, decoder, encoding, errors); 621} 622 623/* Register the error handling callback function error under the name 624 name. This function will be called by the codec when it encounters 625 an unencodable characters/undecodable bytes and doesn't know the 626 callback name, when name is specified as the error parameter 627 in the call to the encode/decode function. 628 Return 0 on success, -1 on error */ 629int PyCodec_RegisterError(const char *name, PyObject *error) 630{ 631 PyInterpreterState *interp = PyThreadState_GET()->interp; 632 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 633 return -1; 634 if (!PyCallable_Check(error)) { 635 PyErr_SetString(PyExc_TypeError, "handler must be callable"); 636 return -1; 637 } 638 return PyDict_SetItemString(interp->codec_error_registry, 639 name, error); 640} 641 642/* Lookup the error handling callback function registered under the 643 name error. As a special case NULL can be passed, in which case 644 the error handling callback for strict encoding will be returned. */ 645PyObject *PyCodec_LookupError(const char *name) 646{ 647 PyObject *handler = NULL; 648 649 PyInterpreterState *interp = PyThreadState_GET()->interp; 650 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) 651 return NULL; 652 653 if (name==NULL) 654 name = "strict"; 655 handler = PyDict_GetItemString(interp->codec_error_registry, name); 656 if (!handler) 657 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); 658 else 659 Py_INCREF(handler); 660 return handler; 661} 662 663static void wrong_exception_type(PyObject *exc) 664{ 665 _Py_IDENTIFIER(__class__); 666 _Py_IDENTIFIER(__name__); 667 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__); 668 if (type != NULL) { 669 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__); 670 Py_DECREF(type); 671 if (name != NULL) { 672 PyErr_Format(PyExc_TypeError, 673 "don't know how to handle %S in error callback", name); 674 Py_DECREF(name); 675 } 676 } 677} 678 679PyObject *PyCodec_StrictErrors(PyObject *exc) 680{ 681 if (PyExceptionInstance_Check(exc)) 682 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 683 else 684 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); 685 return NULL; 686} 687 688 689PyObject *PyCodec_IgnoreErrors(PyObject *exc) 690{ 691 Py_ssize_t end; 692 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { 693 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 694 return NULL; 695 } 696 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { 697 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 698 return NULL; 699 } 700 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { 701 if (PyUnicodeTranslateError_GetEnd(exc, &end)) 702 return NULL; 703 } 704 else { 705 wrong_exception_type(exc); 706 return NULL; 707 } 708 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end); 709} 710 711 712PyObject *PyCodec_ReplaceErrors(PyObject *exc) 713{ 714 Py_ssize_t start, end, i, len; 715 716 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { 717 PyObject *res; 718 int kind; 719 void *data; 720 if (PyUnicodeEncodeError_GetStart(exc, &start)) 721 return NULL; 722 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 723 return NULL; 724 len = end - start; 725 res = PyUnicode_New(len, '?'); 726 if (res == NULL) 727 return NULL; 728 kind = PyUnicode_KIND(res); 729 data = PyUnicode_DATA(res); 730 for (i = 0; i < len; ++i) 731 PyUnicode_WRITE(kind, data, i, '?'); 732 assert(_PyUnicode_CheckConsistency(res, 1)); 733 return Py_BuildValue("(Nn)", res, end); 734 } 735 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { 736 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 737 return NULL; 738 return Py_BuildValue("(Cn)", 739 (int)Py_UNICODE_REPLACEMENT_CHARACTER, 740 end); 741 } 742 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { 743 PyObject *res; 744 int kind; 745 void *data; 746 if (PyUnicodeTranslateError_GetStart(exc, &start)) 747 return NULL; 748 if (PyUnicodeTranslateError_GetEnd(exc, &end)) 749 return NULL; 750 len = end - start; 751 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER); 752 if (res == NULL) 753 return NULL; 754 kind = PyUnicode_KIND(res); 755 data = PyUnicode_DATA(res); 756 for (i=0; i < len; i++) 757 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER); 758 assert(_PyUnicode_CheckConsistency(res, 1)); 759 return Py_BuildValue("(Nn)", res, end); 760 } 761 else { 762 wrong_exception_type(exc); 763 return NULL; 764 } 765} 766 767PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) 768{ 769 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { 770 PyObject *restuple; 771 PyObject *object; 772 Py_ssize_t i; 773 Py_ssize_t start; 774 Py_ssize_t end; 775 PyObject *res; 776 unsigned char *outp; 777 Py_ssize_t ressize; 778 Py_UCS4 ch; 779 if (PyUnicodeEncodeError_GetStart(exc, &start)) 780 return NULL; 781 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 782 return NULL; 783 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 784 return NULL; 785 if (end - start > PY_SSIZE_T_MAX / (2+7+1)) 786 end = start + PY_SSIZE_T_MAX / (2+7+1); 787 for (i = start, ressize = 0; i < end; ++i) { 788 /* object is guaranteed to be "ready" */ 789 ch = PyUnicode_READ_CHAR(object, i); 790 if (ch<10) 791 ressize += 2+1+1; 792 else if (ch<100) 793 ressize += 2+2+1; 794 else if (ch<1000) 795 ressize += 2+3+1; 796 else if (ch<10000) 797 ressize += 2+4+1; 798 else if (ch<100000) 799 ressize += 2+5+1; 800 else if (ch<1000000) 801 ressize += 2+6+1; 802 else 803 ressize += 2+7+1; 804 } 805 /* allocate replacement */ 806 res = PyUnicode_New(ressize, 127); 807 if (res == NULL) { 808 Py_DECREF(object); 809 return NULL; 810 } 811 outp = PyUnicode_1BYTE_DATA(res); 812 /* generate replacement */ 813 for (i = start; i < end; ++i) { 814 int digits; 815 int base; 816 ch = PyUnicode_READ_CHAR(object, i); 817 *outp++ = '&'; 818 *outp++ = '#'; 819 if (ch<10) { 820 digits = 1; 821 base = 1; 822 } 823 else if (ch<100) { 824 digits = 2; 825 base = 10; 826 } 827 else if (ch<1000) { 828 digits = 3; 829 base = 100; 830 } 831 else if (ch<10000) { 832 digits = 4; 833 base = 1000; 834 } 835 else if (ch<100000) { 836 digits = 5; 837 base = 10000; 838 } 839 else if (ch<1000000) { 840 digits = 6; 841 base = 100000; 842 } 843 else { 844 digits = 7; 845 base = 1000000; 846 } 847 while (digits-->0) { 848 *outp++ = '0' + ch/base; 849 ch %= base; 850 base /= 10; 851 } 852 *outp++ = ';'; 853 } 854 assert(_PyUnicode_CheckConsistency(res, 1)); 855 restuple = Py_BuildValue("(Nn)", res, end); 856 Py_DECREF(object); 857 return restuple; 858 } 859 else { 860 wrong_exception_type(exc); 861 return NULL; 862 } 863} 864 865PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) 866{ 867 PyObject *object; 868 Py_ssize_t i; 869 Py_ssize_t start; 870 Py_ssize_t end; 871 PyObject *res; 872 unsigned char *outp; 873 int ressize; 874 Py_UCS4 c; 875 876 if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { 877 unsigned char *p; 878 if (PyUnicodeDecodeError_GetStart(exc, &start)) 879 return NULL; 880 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 881 return NULL; 882 if (!(object = PyUnicodeDecodeError_GetObject(exc))) 883 return NULL; 884 if (!(p = (unsigned char*)PyBytes_AsString(object))) { 885 Py_DECREF(object); 886 return NULL; 887 } 888 res = PyUnicode_New(4 * (end - start), 127); 889 if (res == NULL) { 890 Py_DECREF(object); 891 return NULL; 892 } 893 outp = PyUnicode_1BYTE_DATA(res); 894 for (i = start; i < end; i++, outp += 4) { 895 unsigned char c = p[i]; 896 outp[0] = '\\'; 897 outp[1] = 'x'; 898 outp[2] = Py_hexdigits[(c>>4)&0xf]; 899 outp[3] = Py_hexdigits[c&0xf]; 900 } 901 902 assert(_PyUnicode_CheckConsistency(res, 1)); 903 Py_DECREF(object); 904 return Py_BuildValue("(Nn)", res, end); 905 } 906 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { 907 if (PyUnicodeEncodeError_GetStart(exc, &start)) 908 return NULL; 909 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 910 return NULL; 911 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 912 return NULL; 913 } 914 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { 915 if (PyUnicodeTranslateError_GetStart(exc, &start)) 916 return NULL; 917 if (PyUnicodeTranslateError_GetEnd(exc, &end)) 918 return NULL; 919 if (!(object = PyUnicodeTranslateError_GetObject(exc))) 920 return NULL; 921 } 922 else { 923 wrong_exception_type(exc); 924 return NULL; 925 } 926 927 if (end - start > PY_SSIZE_T_MAX / (1+1+8)) 928 end = start + PY_SSIZE_T_MAX / (1+1+8); 929 for (i = start, ressize = 0; i < end; ++i) { 930 /* object is guaranteed to be "ready" */ 931 c = PyUnicode_READ_CHAR(object, i); 932 if (c >= 0x10000) { 933 ressize += 1+1+8; 934 } 935 else if (c >= 0x100) { 936 ressize += 1+1+4; 937 } 938 else 939 ressize += 1+1+2; 940 } 941 res = PyUnicode_New(ressize, 127); 942 if (res == NULL) { 943 Py_DECREF(object); 944 return NULL; 945 } 946 outp = PyUnicode_1BYTE_DATA(res); 947 for (i = start; i < end; ++i) { 948 c = PyUnicode_READ_CHAR(object, i); 949 *outp++ = '\\'; 950 if (c >= 0x00010000) { 951 *outp++ = 'U'; 952 *outp++ = Py_hexdigits[(c>>28)&0xf]; 953 *outp++ = Py_hexdigits[(c>>24)&0xf]; 954 *outp++ = Py_hexdigits[(c>>20)&0xf]; 955 *outp++ = Py_hexdigits[(c>>16)&0xf]; 956 *outp++ = Py_hexdigits[(c>>12)&0xf]; 957 *outp++ = Py_hexdigits[(c>>8)&0xf]; 958 } 959 else if (c >= 0x100) { 960 *outp++ = 'u'; 961 *outp++ = Py_hexdigits[(c>>12)&0xf]; 962 *outp++ = Py_hexdigits[(c>>8)&0xf]; 963 } 964 else 965 *outp++ = 'x'; 966 *outp++ = Py_hexdigits[(c>>4)&0xf]; 967 *outp++ = Py_hexdigits[c&0xf]; 968 } 969 970 assert(_PyUnicode_CheckConsistency(res, 1)); 971 Py_DECREF(object); 972 return Py_BuildValue("(Nn)", res, end); 973} 974 975static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 976static int ucnhash_initialized = 0; 977 978PyObject *PyCodec_NameReplaceErrors(PyObject *exc) 979{ 980 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { 981 PyObject *restuple; 982 PyObject *object; 983 Py_ssize_t i; 984 Py_ssize_t start; 985 Py_ssize_t end; 986 PyObject *res; 987 unsigned char *outp; 988 Py_ssize_t ressize; 989 int replsize; 990 Py_UCS4 c; 991 char buffer[256]; /* NAME_MAXLEN */ 992 if (PyUnicodeEncodeError_GetStart(exc, &start)) 993 return NULL; 994 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 995 return NULL; 996 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 997 return NULL; 998 if (!ucnhash_initialized) { 999 /* load the unicode data module */ 1000 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 1001 PyUnicodeData_CAPSULE_NAME, 1); 1002 ucnhash_initialized = 1; 1003 } 1004 for (i = start, ressize = 0; i < end; ++i) { 1005 /* object is guaranteed to be "ready" */ 1006 c = PyUnicode_READ_CHAR(object, i); 1007 if (ucnhash_CAPI && 1008 ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { 1009 replsize = 1+1+1+(int)strlen(buffer)+1; 1010 } 1011 else if (c >= 0x10000) { 1012 replsize = 1+1+8; 1013 } 1014 else if (c >= 0x100) { 1015 replsize = 1+1+4; 1016 } 1017 else 1018 replsize = 1+1+2; 1019 if (ressize > PY_SSIZE_T_MAX - replsize) 1020 break; 1021 ressize += replsize; 1022 } 1023 end = i; 1024 res = PyUnicode_New(ressize, 127); 1025 if (res==NULL) 1026 return NULL; 1027 for (i = start, outp = PyUnicode_1BYTE_DATA(res); 1028 i < end; ++i) { 1029 c = PyUnicode_READ_CHAR(object, i); 1030 *outp++ = '\\'; 1031 if (ucnhash_CAPI && 1032 ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { 1033 *outp++ = 'N'; 1034 *outp++ = '{'; 1035 strcpy((char *)outp, buffer); 1036 outp += strlen(buffer); 1037 *outp++ = '}'; 1038 continue; 1039 } 1040 if (c >= 0x00010000) { 1041 *outp++ = 'U'; 1042 *outp++ = Py_hexdigits[(c>>28)&0xf]; 1043 *outp++ = Py_hexdigits[(c>>24)&0xf]; 1044 *outp++ = Py_hexdigits[(c>>20)&0xf]; 1045 *outp++ = Py_hexdigits[(c>>16)&0xf]; 1046 *outp++ = Py_hexdigits[(c>>12)&0xf]; 1047 *outp++ = Py_hexdigits[(c>>8)&0xf]; 1048 } 1049 else if (c >= 0x100) { 1050 *outp++ = 'u'; 1051 *outp++ = Py_hexdigits[(c>>12)&0xf]; 1052 *outp++ = Py_hexdigits[(c>>8)&0xf]; 1053 } 1054 else 1055 *outp++ = 'x'; 1056 *outp++ = Py_hexdigits[(c>>4)&0xf]; 1057 *outp++ = Py_hexdigits[c&0xf]; 1058 } 1059 1060 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); 1061 assert(_PyUnicode_CheckConsistency(res, 1)); 1062 restuple = Py_BuildValue("(Nn)", res, end); 1063 Py_DECREF(object); 1064 return restuple; 1065 } 1066 else { 1067 wrong_exception_type(exc); 1068 return NULL; 1069 } 1070} 1071 1072#define ENC_UNKNOWN -1 1073#define ENC_UTF8 0 1074#define ENC_UTF16BE 1 1075#define ENC_UTF16LE 2 1076#define ENC_UTF32BE 3 1077#define ENC_UTF32LE 4 1078 1079static int 1080get_standard_encoding(const char *encoding, int *bytelength) 1081{ 1082 if (Py_TOLOWER(encoding[0]) == 'u' && 1083 Py_TOLOWER(encoding[1]) == 't' && 1084 Py_TOLOWER(encoding[2]) == 'f') { 1085 encoding += 3; 1086 if (*encoding == '-' || *encoding == '_' ) 1087 encoding++; 1088 if (encoding[0] == '8' && encoding[1] == '\0') { 1089 *bytelength = 3; 1090 return ENC_UTF8; 1091 } 1092 else if (encoding[0] == '1' && encoding[1] == '6') { 1093 encoding += 2; 1094 *bytelength = 2; 1095 if (*encoding == '\0') { 1096#ifdef WORDS_BIGENDIAN 1097 return ENC_UTF16BE; 1098#else 1099 return ENC_UTF16LE; 1100#endif 1101 } 1102 if (*encoding == '-' || *encoding == '_' ) 1103 encoding++; 1104 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { 1105 if (Py_TOLOWER(encoding[0]) == 'b') 1106 return ENC_UTF16BE; 1107 if (Py_TOLOWER(encoding[0]) == 'l') 1108 return ENC_UTF16LE; 1109 } 1110 } 1111 else if (encoding[0] == '3' && encoding[1] == '2') { 1112 encoding += 2; 1113 *bytelength = 4; 1114 if (*encoding == '\0') { 1115#ifdef WORDS_BIGENDIAN 1116 return ENC_UTF32BE; 1117#else 1118 return ENC_UTF32LE; 1119#endif 1120 } 1121 if (*encoding == '-' || *encoding == '_' ) 1122 encoding++; 1123 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { 1124 if (Py_TOLOWER(encoding[0]) == 'b') 1125 return ENC_UTF32BE; 1126 if (Py_TOLOWER(encoding[0]) == 'l') 1127 return ENC_UTF32LE; 1128 } 1129 } 1130 } 1131 else if (strcmp(encoding, "CP_UTF8") == 0) { 1132 *bytelength = 3; 1133 return ENC_UTF8; 1134 } 1135 return ENC_UNKNOWN; 1136} 1137 1138/* This handler is declared static until someone demonstrates 1139 a need to call it directly. */ 1140static PyObject * 1141PyCodec_SurrogatePassErrors(PyObject *exc) 1142{ 1143 PyObject *restuple; 1144 PyObject *object; 1145 PyObject *encode; 1146 char *encoding; 1147 int code; 1148 int bytelength; 1149 Py_ssize_t i; 1150 Py_ssize_t start; 1151 Py_ssize_t end; 1152 PyObject *res; 1153 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { 1154 unsigned char *outp; 1155 if (PyUnicodeEncodeError_GetStart(exc, &start)) 1156 return NULL; 1157 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 1158 return NULL; 1159 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 1160 return NULL; 1161 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { 1162 Py_DECREF(object); 1163 return NULL; 1164 } 1165 if (!(encoding = PyUnicode_AsUTF8(encode))) { 1166 Py_DECREF(object); 1167 Py_DECREF(encode); 1168 return NULL; 1169 } 1170 code = get_standard_encoding(encoding, &bytelength); 1171 Py_DECREF(encode); 1172 if (code == ENC_UNKNOWN) { 1173 /* Not supported, fail with original exception */ 1174 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1175 Py_DECREF(object); 1176 return NULL; 1177 } 1178 1179 if (end - start > PY_SSIZE_T_MAX / bytelength) 1180 end = start + PY_SSIZE_T_MAX / bytelength; 1181 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); 1182 if (!res) { 1183 Py_DECREF(object); 1184 return NULL; 1185 } 1186 outp = (unsigned char*)PyBytes_AsString(res); 1187 for (i = start; i < end; i++) { 1188 /* object is guaranteed to be "ready" */ 1189 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); 1190 if (!Py_UNICODE_IS_SURROGATE(ch)) { 1191 /* Not a surrogate, fail with original exception */ 1192 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1193 Py_DECREF(res); 1194 Py_DECREF(object); 1195 return NULL; 1196 } 1197 switch (code) { 1198 case ENC_UTF8: 1199 *outp++ = (unsigned char)(0xe0 | (ch >> 12)); 1200 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); 1201 *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); 1202 break; 1203 case ENC_UTF16LE: 1204 *outp++ = (unsigned char) ch; 1205 *outp++ = (unsigned char)(ch >> 8); 1206 break; 1207 case ENC_UTF16BE: 1208 *outp++ = (unsigned char)(ch >> 8); 1209 *outp++ = (unsigned char) ch; 1210 break; 1211 case ENC_UTF32LE: 1212 *outp++ = (unsigned char) ch; 1213 *outp++ = (unsigned char)(ch >> 8); 1214 *outp++ = (unsigned char)(ch >> 16); 1215 *outp++ = (unsigned char)(ch >> 24); 1216 break; 1217 case ENC_UTF32BE: 1218 *outp++ = (unsigned char)(ch >> 24); 1219 *outp++ = (unsigned char)(ch >> 16); 1220 *outp++ = (unsigned char)(ch >> 8); 1221 *outp++ = (unsigned char) ch; 1222 break; 1223 } 1224 } 1225 restuple = Py_BuildValue("(On)", res, end); 1226 Py_DECREF(res); 1227 Py_DECREF(object); 1228 return restuple; 1229 } 1230 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { 1231 unsigned char *p; 1232 Py_UCS4 ch = 0; 1233 if (PyUnicodeDecodeError_GetStart(exc, &start)) 1234 return NULL; 1235 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 1236 return NULL; 1237 if (!(object = PyUnicodeDecodeError_GetObject(exc))) 1238 return NULL; 1239 if (!(p = (unsigned char*)PyBytes_AsString(object))) { 1240 Py_DECREF(object); 1241 return NULL; 1242 } 1243 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { 1244 Py_DECREF(object); 1245 return NULL; 1246 } 1247 if (!(encoding = PyUnicode_AsUTF8(encode))) { 1248 Py_DECREF(object); 1249 Py_DECREF(encode); 1250 return NULL; 1251 } 1252 code = get_standard_encoding(encoding, &bytelength); 1253 Py_DECREF(encode); 1254 if (code == ENC_UNKNOWN) { 1255 /* Not supported, fail with original exception */ 1256 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1257 Py_DECREF(object); 1258 return NULL; 1259 } 1260 1261 /* Try decoding a single surrogate character. If 1262 there are more, let the codec call us again. */ 1263 p += start; 1264 if (PyBytes_GET_SIZE(object) - start >= bytelength) { 1265 switch (code) { 1266 case ENC_UTF8: 1267 if ((p[0] & 0xf0) == 0xe0 && 1268 (p[1] & 0xc0) == 0x80 && 1269 (p[2] & 0xc0) == 0x80) { 1270 /* it's a three-byte code */ 1271 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); 1272 } 1273 break; 1274 case ENC_UTF16LE: 1275 ch = p[1] << 8 | p[0]; 1276 break; 1277 case ENC_UTF16BE: 1278 ch = p[0] << 8 | p[1]; 1279 break; 1280 case ENC_UTF32LE: 1281 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; 1282 break; 1283 case ENC_UTF32BE: 1284 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; 1285 break; 1286 } 1287 } 1288 1289 Py_DECREF(object); 1290 if (!Py_UNICODE_IS_SURROGATE(ch)) { 1291 /* it's not a surrogate - fail */ 1292 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1293 return NULL; 1294 } 1295 res = PyUnicode_FromOrdinal(ch); 1296 if (res == NULL) 1297 return NULL; 1298 return Py_BuildValue("(Nn)", res, start + bytelength); 1299 } 1300 else { 1301 wrong_exception_type(exc); 1302 return NULL; 1303 } 1304} 1305 1306static PyObject * 1307PyCodec_SurrogateEscapeErrors(PyObject *exc) 1308{ 1309 PyObject *restuple; 1310 PyObject *object; 1311 Py_ssize_t i; 1312 Py_ssize_t start; 1313 Py_ssize_t end; 1314 PyObject *res; 1315 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { 1316 char *outp; 1317 if (PyUnicodeEncodeError_GetStart(exc, &start)) 1318 return NULL; 1319 if (PyUnicodeEncodeError_GetEnd(exc, &end)) 1320 return NULL; 1321 if (!(object = PyUnicodeEncodeError_GetObject(exc))) 1322 return NULL; 1323 res = PyBytes_FromStringAndSize(NULL, end-start); 1324 if (!res) { 1325 Py_DECREF(object); 1326 return NULL; 1327 } 1328 outp = PyBytes_AsString(res); 1329 for (i = start; i < end; i++) { 1330 /* object is guaranteed to be "ready" */ 1331 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); 1332 if (ch < 0xdc80 || ch > 0xdcff) { 1333 /* Not a UTF-8b surrogate, fail with original exception */ 1334 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1335 Py_DECREF(res); 1336 Py_DECREF(object); 1337 return NULL; 1338 } 1339 *outp++ = ch - 0xdc00; 1340 } 1341 restuple = Py_BuildValue("(On)", res, end); 1342 Py_DECREF(res); 1343 Py_DECREF(object); 1344 return restuple; 1345 } 1346 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { 1347 PyObject *str; 1348 unsigned char *p; 1349 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ 1350 int consumed = 0; 1351 if (PyUnicodeDecodeError_GetStart(exc, &start)) 1352 return NULL; 1353 if (PyUnicodeDecodeError_GetEnd(exc, &end)) 1354 return NULL; 1355 if (!(object = PyUnicodeDecodeError_GetObject(exc))) 1356 return NULL; 1357 if (!(p = (unsigned char*)PyBytes_AsString(object))) { 1358 Py_DECREF(object); 1359 return NULL; 1360 } 1361 while (consumed < 4 && consumed < end-start) { 1362 /* Refuse to escape ASCII bytes. */ 1363 if (p[start+consumed] < 128) 1364 break; 1365 ch[consumed] = 0xdc00 + p[start+consumed]; 1366 consumed++; 1367 } 1368 Py_DECREF(object); 1369 if (!consumed) { 1370 /* codec complained about ASCII byte. */ 1371 PyErr_SetObject(PyExceptionInstance_Class(exc), exc); 1372 return NULL; 1373 } 1374 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); 1375 if (str == NULL) 1376 return NULL; 1377 return Py_BuildValue("(Nn)", str, start+consumed); 1378 } 1379 else { 1380 wrong_exception_type(exc); 1381 return NULL; 1382 } 1383} 1384 1385 1386static PyObject *strict_errors(PyObject *self, PyObject *exc) 1387{ 1388 return PyCodec_StrictErrors(exc); 1389} 1390 1391 1392static PyObject *ignore_errors(PyObject *self, PyObject *exc) 1393{ 1394 return PyCodec_IgnoreErrors(exc); 1395} 1396 1397 1398static PyObject *replace_errors(PyObject *self, PyObject *exc) 1399{ 1400 return PyCodec_ReplaceErrors(exc); 1401} 1402 1403 1404static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) 1405{ 1406 return PyCodec_XMLCharRefReplaceErrors(exc); 1407} 1408 1409 1410static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) 1411{ 1412 return PyCodec_BackslashReplaceErrors(exc); 1413} 1414 1415static PyObject *namereplace_errors(PyObject *self, PyObject *exc) 1416{ 1417 return PyCodec_NameReplaceErrors(exc); 1418} 1419 1420static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) 1421{ 1422 return PyCodec_SurrogatePassErrors(exc); 1423} 1424 1425static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc) 1426{ 1427 return PyCodec_SurrogateEscapeErrors(exc); 1428} 1429 1430static int _PyCodecRegistry_Init(void) 1431{ 1432 static struct { 1433 char *name; 1434 PyMethodDef def; 1435 } methods[] = 1436 { 1437 { 1438 "strict", 1439 { 1440 "strict_errors", 1441 strict_errors, 1442 METH_O, 1443 PyDoc_STR("Implements the 'strict' error handling, which " 1444 "raises a UnicodeError on coding errors.") 1445 } 1446 }, 1447 { 1448 "ignore", 1449 { 1450 "ignore_errors", 1451 ignore_errors, 1452 METH_O, 1453 PyDoc_STR("Implements the 'ignore' error handling, which " 1454 "ignores malformed data and continues.") 1455 } 1456 }, 1457 { 1458 "replace", 1459 { 1460 "replace_errors", 1461 replace_errors, 1462 METH_O, 1463 PyDoc_STR("Implements the 'replace' error handling, which " 1464 "replaces malformed data with a replacement marker.") 1465 } 1466 }, 1467 { 1468 "xmlcharrefreplace", 1469 { 1470 "xmlcharrefreplace_errors", 1471 xmlcharrefreplace_errors, 1472 METH_O, 1473 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, " 1474 "which replaces an unencodable character with the " 1475 "appropriate XML character reference.") 1476 } 1477 }, 1478 { 1479 "backslashreplace", 1480 { 1481 "backslashreplace_errors", 1482 backslashreplace_errors, 1483 METH_O, 1484 PyDoc_STR("Implements the 'backslashreplace' error handling, " 1485 "which replaces malformed data with a backslashed " 1486 "escape sequence.") 1487 } 1488 }, 1489 { 1490 "namereplace", 1491 { 1492 "namereplace_errors", 1493 namereplace_errors, 1494 METH_O, 1495 PyDoc_STR("Implements the 'namereplace' error handling, " 1496 "which replaces an unencodable character with a " 1497 "\\N{...} escape sequence.") 1498 } 1499 }, 1500 { 1501 "surrogatepass", 1502 { 1503 "surrogatepass", 1504 surrogatepass_errors, 1505 METH_O 1506 } 1507 }, 1508 { 1509 "surrogateescape", 1510 { 1511 "surrogateescape", 1512 surrogateescape_errors, 1513 METH_O 1514 } 1515 } 1516 }; 1517 1518 PyInterpreterState *interp = PyThreadState_GET()->interp; 1519 PyObject *mod; 1520 unsigned i; 1521 1522 if (interp->codec_search_path != NULL) 1523 return 0; 1524 1525 interp->codec_search_path = PyList_New(0); 1526 interp->codec_search_cache = PyDict_New(); 1527 interp->codec_error_registry = PyDict_New(); 1528 1529 if (interp->codec_error_registry) { 1530 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { 1531 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL); 1532 int res; 1533 if (!func) 1534 Py_FatalError("can't initialize codec error registry"); 1535 res = PyCodec_RegisterError(methods[i].name, func); 1536 Py_DECREF(func); 1537 if (res) 1538 Py_FatalError("can't initialize codec error registry"); 1539 } 1540 } 1541 1542 if (interp->codec_search_path == NULL || 1543 interp->codec_search_cache == NULL || 1544 interp->codec_error_registry == NULL) 1545 Py_FatalError("can't initialize codec registry"); 1546 1547 mod = PyImport_ImportModuleNoBlock("encodings"); 1548 if (mod == NULL) { 1549 return -1; 1550 } 1551 Py_DECREF(mod); 1552 interp->codecs_initialized = 1; 1553 return 0; 1554} 1555