codecs.c revision 26861b0b29fdf64fba8cd120183408495f2c80e2
1/* ------------------------------------------------------------------------
2
3   Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9   ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include "ucnhash.h"
13#include <ctype.h>
14
15const char *Py_hexdigits = "0123456789abcdef";
16
17/* --- Codec Registry ----------------------------------------------------- */
18
19/* Import the standard encodings package which will register the first
20   codec search function.
21
22   This is done in a lazy way so that the Unicode implementation does
23   not downgrade startup time of scripts not needing it.
24
25   ImportErrors are silently ignored by this function. Only one try is
26   made.
27
28*/
29
30static int _PyCodecRegistry_Init(void); /* Forward */
31
32int PyCodec_Register(PyObject *search_function)
33{
34    PyInterpreterState *interp = PyThreadState_GET()->interp;
35    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
36        goto onError;
37    if (search_function == NULL) {
38        PyErr_BadArgument();
39        goto onError;
40    }
41    if (!PyCallable_Check(search_function)) {
42        PyErr_SetString(PyExc_TypeError, "argument must be callable");
43        goto onError;
44    }
45    return PyList_Append(interp->codec_search_path, search_function);
46
47 onError:
48    return -1;
49}
50
51/* Convert a string to a normalized Python string: all characters are
52   converted to lower case, spaces are replaced with underscores. */
53
54static
55PyObject *normalizestring(const char *string)
56{
57    size_t i;
58    size_t len = strlen(string);
59    char *p;
60    PyObject *v;
61
62    if (len > PY_SSIZE_T_MAX) {
63        PyErr_SetString(PyExc_OverflowError, "string is too large");
64        return NULL;
65    }
66
67    p = PyMem_Malloc(len + 1);
68    if (p == NULL)
69        return PyErr_NoMemory();
70    for (i = 0; i < len; i++) {
71        char ch = string[i];
72        if (ch == ' ')
73            ch = '-';
74        else
75            ch = Py_TOLOWER(Py_CHARMASK(ch));
76        p[i] = ch;
77    }
78    p[i] = '\0';
79    v = PyUnicode_FromString(p);
80    if (v == NULL)
81        return NULL;
82    PyMem_Free(p);
83    return v;
84}
85
86/* Lookup the given encoding and return a tuple providing the codec
87   facilities.
88
89   The encoding string is looked up converted to all lower-case
90   characters. This makes encodings looked up through this mechanism
91   effectively case-insensitive.
92
93   If no codec is found, a LookupError is set and NULL returned.
94
95   As side effect, this tries to load the encodings package, if not
96   yet done. This is part of the lazy load strategy for the encodings
97   package.
98
99*/
100
101PyObject *_PyCodec_Lookup(const char *encoding)
102{
103    PyInterpreterState *interp;
104    PyObject *result, *args = NULL, *v;
105    Py_ssize_t i, len;
106
107    if (encoding == NULL) {
108        PyErr_BadArgument();
109        goto onError;
110    }
111
112    interp = PyThreadState_GET()->interp;
113    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
114        goto onError;
115
116    /* Convert the encoding to a normalized Python string: all
117       characters are converted to lower case, spaces and hyphens are
118       replaced with underscores. */
119    v = normalizestring(encoding);
120    if (v == NULL)
121        goto onError;
122    PyUnicode_InternInPlace(&v);
123
124    /* First, try to lookup the name in the registry dictionary */
125    result = PyDict_GetItem(interp->codec_search_cache, v);
126    if (result != NULL) {
127        Py_INCREF(result);
128        Py_DECREF(v);
129        return result;
130    }
131
132    /* Next, scan the search functions in order of registration */
133    args = PyTuple_New(1);
134    if (args == NULL)
135        goto onError;
136    PyTuple_SET_ITEM(args,0,v);
137
138    len = PyList_Size(interp->codec_search_path);
139    if (len < 0)
140        goto onError;
141    if (len == 0) {
142        PyErr_SetString(PyExc_LookupError,
143                        "no codec search functions registered: "
144                        "can't find encoding");
145        goto onError;
146    }
147
148    for (i = 0; i < len; i++) {
149        PyObject *func;
150
151        func = PyList_GetItem(interp->codec_search_path, i);
152        if (func == NULL)
153            goto onError;
154        result = PyEval_CallObject(func, args);
155        if (result == NULL)
156            goto onError;
157        if (result == Py_None) {
158            Py_DECREF(result);
159            continue;
160        }
161        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
162            PyErr_SetString(PyExc_TypeError,
163                            "codec search functions must return 4-tuples");
164            Py_DECREF(result);
165            goto onError;
166        }
167        break;
168    }
169    if (i == len) {
170        /* XXX Perhaps we should cache misses too ? */
171        PyErr_Format(PyExc_LookupError,
172                     "unknown encoding: %s", encoding);
173        goto onError;
174    }
175
176    /* Cache and return the result */
177    if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
178        Py_DECREF(result);
179        goto onError;
180    }
181    Py_DECREF(args);
182    return result;
183
184 onError:
185    Py_XDECREF(args);
186    return NULL;
187}
188
189int _PyCodec_Forget(const char *encoding)
190{
191    PyInterpreterState *interp;
192    PyObject *v;
193    int result;
194
195    interp = PyThreadState_GET()->interp;
196    if (interp->codec_search_path == NULL) {
197        return -1;
198    }
199
200    /* Convert the encoding to a normalized Python string: all
201       characters are converted to lower case, spaces and hyphens are
202       replaced with underscores. */
203    v = normalizestring(encoding);
204    if (v == NULL) {
205        return -1;
206    }
207
208    /* Drop the named codec from the internal cache */
209    result = PyDict_DelItem(interp->codec_search_cache, v);
210    Py_DECREF(v);
211
212    return result;
213}
214
215/* Codec registry encoding check API. */
216
217int PyCodec_KnownEncoding(const char *encoding)
218{
219    PyObject *codecs;
220
221    codecs = _PyCodec_Lookup(encoding);
222    if (!codecs) {
223        PyErr_Clear();
224        return 0;
225    }
226    else {
227        Py_DECREF(codecs);
228        return 1;
229    }
230}
231
232static
233PyObject *args_tuple(PyObject *object,
234                     const char *errors)
235{
236    PyObject *args;
237
238    args = PyTuple_New(1 + (errors != NULL));
239    if (args == NULL)
240        return NULL;
241    Py_INCREF(object);
242    PyTuple_SET_ITEM(args,0,object);
243    if (errors) {
244        PyObject *v;
245
246        v = PyUnicode_FromString(errors);
247        if (v == NULL) {
248            Py_DECREF(args);
249            return NULL;
250        }
251        PyTuple_SET_ITEM(args, 1, v);
252    }
253    return args;
254}
255
256/* Helper function to get a codec item */
257
258static
259PyObject *codec_getitem(const char *encoding, int index)
260{
261    PyObject *codecs;
262    PyObject *v;
263
264    codecs = _PyCodec_Lookup(encoding);
265    if (codecs == NULL)
266        return NULL;
267    v = PyTuple_GET_ITEM(codecs, index);
268    Py_DECREF(codecs);
269    Py_INCREF(v);
270    return v;
271}
272
273/* Helper functions to create an incremental codec. */
274static
275PyObject *codec_makeincrementalcodec(PyObject *codec_info,
276                                     const char *errors,
277                                     const char *attrname)
278{
279    PyObject *ret, *inccodec;
280
281    inccodec = PyObject_GetAttrString(codec_info, attrname);
282    if (inccodec == NULL)
283        return NULL;
284    if (errors)
285        ret = PyObject_CallFunction(inccodec, "s", errors);
286    else
287        ret = PyObject_CallFunction(inccodec, NULL);
288    Py_DECREF(inccodec);
289    return ret;
290}
291
292static
293PyObject *codec_getincrementalcodec(const char *encoding,
294                                    const char *errors,
295                                    const char *attrname)
296{
297    PyObject *codec_info, *ret;
298
299    codec_info = _PyCodec_Lookup(encoding);
300    if (codec_info == NULL)
301        return NULL;
302    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
303    Py_DECREF(codec_info);
304    return ret;
305}
306
307/* Helper function to create a stream codec. */
308
309static
310PyObject *codec_getstreamcodec(const char *encoding,
311                               PyObject *stream,
312                               const char *errors,
313                               const int index)
314{
315    PyObject *codecs, *streamcodec, *codeccls;
316
317    codecs = _PyCodec_Lookup(encoding);
318    if (codecs == NULL)
319        return NULL;
320
321    codeccls = PyTuple_GET_ITEM(codecs, index);
322    if (errors != NULL)
323        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
324    else
325        streamcodec = PyObject_CallFunction(codeccls, "O", stream);
326    Py_DECREF(codecs);
327    return streamcodec;
328}
329
330/* Helpers to work with the result of _PyCodec_Lookup
331
332 */
333PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
334                                             const char *errors)
335{
336    return codec_makeincrementalcodec(codec_info, errors,
337                                      "incrementaldecoder");
338}
339
340PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
341                                             const char *errors)
342{
343    return codec_makeincrementalcodec(codec_info, errors,
344                                      "incrementalencoder");
345}
346
347
348/* Convenience APIs to query the Codec registry.
349
350   All APIs return a codec object with incremented refcount.
351
352 */
353
354PyObject *PyCodec_Encoder(const char *encoding)
355{
356    return codec_getitem(encoding, 0);
357}
358
359PyObject *PyCodec_Decoder(const char *encoding)
360{
361    return codec_getitem(encoding, 1);
362}
363
364PyObject *PyCodec_IncrementalEncoder(const char *encoding,
365                                     const char *errors)
366{
367    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
368}
369
370PyObject *PyCodec_IncrementalDecoder(const char *encoding,
371                                     const char *errors)
372{
373    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
374}
375
376PyObject *PyCodec_StreamReader(const char *encoding,
377                               PyObject *stream,
378                               const char *errors)
379{
380    return codec_getstreamcodec(encoding, stream, errors, 2);
381}
382
383PyObject *PyCodec_StreamWriter(const char *encoding,
384                               PyObject *stream,
385                               const char *errors)
386{
387    return codec_getstreamcodec(encoding, stream, errors, 3);
388}
389
/* Annotate the currently active exception with the codec operation
 * ("encoding"/"decoding") and codec name that triggered it, keeping the
 * exception type unchanged.
 *
 * The work is delegated to _PyErr_TrySetFromCause(), which swaps in an
 * updated clone of the exception when it can and otherwise leaves the
 * original exception untouched.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    _PyErr_TrySetFromCause(
        "%s with '%s' codec failed",
        operation,
        encoding);
}
405
406/* Encode an object (e.g. an Unicode object) using the given encoding
407   and return the resulting encoded object (usually a Python string).
408
409   errors is passed to the encoder factory as argument if non-NULL. */
410
411static PyObject *
412_PyCodec_EncodeInternal(PyObject *object,
413                        PyObject *encoder,
414                        const char *encoding,
415                        const char *errors)
416{
417    PyObject *args = NULL, *result = NULL;
418    PyObject *v = NULL;
419
420    args = args_tuple(object, errors);
421    if (args == NULL)
422        goto onError;
423
424    result = PyEval_CallObject(encoder, args);
425    if (result == NULL) {
426        wrap_codec_error("encoding", encoding);
427        goto onError;
428    }
429
430    if (!PyTuple_Check(result) ||
431        PyTuple_GET_SIZE(result) != 2) {
432        PyErr_SetString(PyExc_TypeError,
433                        "encoder must return a tuple (object, integer)");
434        goto onError;
435    }
436    v = PyTuple_GET_ITEM(result,0);
437    Py_INCREF(v);
438    /* We don't check or use the second (integer) entry. */
439
440    Py_DECREF(args);
441    Py_DECREF(encoder);
442    Py_DECREF(result);
443    return v;
444
445 onError:
446    Py_XDECREF(result);
447    Py_XDECREF(args);
448    Py_XDECREF(encoder);
449    return NULL;
450}
451
452/* Decode an object (usually a Python string) using the given encoding
453   and return an equivalent object (e.g. an Unicode object).
454
455   errors is passed to the decoder factory as argument if non-NULL. */
456
457static PyObject *
458_PyCodec_DecodeInternal(PyObject *object,
459                        PyObject *decoder,
460                        const char *encoding,
461                        const char *errors)
462{
463    PyObject *args = NULL, *result = NULL;
464    PyObject *v;
465
466    args = args_tuple(object, errors);
467    if (args == NULL)
468        goto onError;
469
470    result = PyEval_CallObject(decoder,args);
471    if (result == NULL) {
472        wrap_codec_error("decoding", encoding);
473        goto onError;
474    }
475    if (!PyTuple_Check(result) ||
476        PyTuple_GET_SIZE(result) != 2) {
477        PyErr_SetString(PyExc_TypeError,
478                        "decoder must return a tuple (object,integer)");
479        goto onError;
480    }
481    v = PyTuple_GET_ITEM(result,0);
482    Py_INCREF(v);
483    /* We don't check or use the second (integer) entry. */
484
485    Py_DECREF(args);
486    Py_DECREF(decoder);
487    Py_DECREF(result);
488    return v;
489
490 onError:
491    Py_XDECREF(args);
492    Py_XDECREF(decoder);
493    Py_XDECREF(result);
494    return NULL;
495}
496
497/* Generic encoding/decoding API */
498PyObject *PyCodec_Encode(PyObject *object,
499                         const char *encoding,
500                         const char *errors)
501{
502    PyObject *encoder;
503
504    encoder = PyCodec_Encoder(encoding);
505    if (encoder == NULL)
506        return NULL;
507
508    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
509}
510
511PyObject *PyCodec_Decode(PyObject *object,
512                         const char *encoding,
513                         const char *errors)
514{
515    PyObject *decoder;
516
517    decoder = PyCodec_Decoder(encoding);
518    if (decoder == NULL)
519        return NULL;
520
521    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
522}
523
524/* Text encoding/decoding API */
525PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
526                                       const char *alternate_command)
527{
528    _Py_IDENTIFIER(_is_text_encoding);
529    PyObject *codec;
530    PyObject *attr;
531    int is_text_codec;
532
533    codec = _PyCodec_Lookup(encoding);
534    if (codec == NULL)
535        return NULL;
536
537    /* Backwards compatibility: assume any raw tuple describes a text
538     * encoding, and the same for anything lacking the private
539     * attribute.
540     */
541    if (!PyTuple_CheckExact(codec)) {
542        attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
543        if (attr == NULL) {
544            if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
545                PyErr_Clear();
546            } else {
547                Py_DECREF(codec);
548                return NULL;
549            }
550        } else {
551            is_text_codec = PyObject_IsTrue(attr);
552            Py_DECREF(attr);
553            if (!is_text_codec) {
554                Py_DECREF(codec);
555                PyErr_Format(PyExc_LookupError,
556                             "'%.400s' is not a text encoding; "
557                             "use %s to handle arbitrary codecs",
558                             encoding, alternate_command);
559                return NULL;
560            }
561        }
562    }
563
564    /* This appears to be a valid text encoding */
565    return codec;
566}
567
568
569static
570PyObject *codec_getitem_checked(const char *encoding,
571                                const char *alternate_command,
572                                int index)
573{
574    PyObject *codec;
575    PyObject *v;
576
577    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
578    if (codec == NULL)
579        return NULL;
580
581    v = PyTuple_GET_ITEM(codec, index);
582    Py_INCREF(v);
583    Py_DECREF(codec);
584    return v;
585}
586
587static PyObject * _PyCodec_TextEncoder(const char *encoding)
588{
589    return codec_getitem_checked(encoding, "codecs.encode()", 0);
590}
591
592static PyObject * _PyCodec_TextDecoder(const char *encoding)
593{
594    return codec_getitem_checked(encoding, "codecs.decode()", 1);
595}
596
597PyObject *_PyCodec_EncodeText(PyObject *object,
598                              const char *encoding,
599                              const char *errors)
600{
601    PyObject *encoder;
602
603    encoder = _PyCodec_TextEncoder(encoding);
604    if (encoder == NULL)
605        return NULL;
606
607    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
608}
609
610PyObject *_PyCodec_DecodeText(PyObject *object,
611                              const char *encoding,
612                              const char *errors)
613{
614    PyObject *decoder;
615
616    decoder = _PyCodec_TextDecoder(encoding);
617    if (decoder == NULL)
618        return NULL;
619
620    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
621}
622
623/* Register the error handling callback function error under the name
624   name. This function will be called by the codec when it encounters
625   an unencodable characters/undecodable bytes and doesn't know the
626   callback name, when name is specified as the error parameter
627   in the call to the encode/decode function.
628   Return 0 on success, -1 on error */
629int PyCodec_RegisterError(const char *name, PyObject *error)
630{
631    PyInterpreterState *interp = PyThreadState_GET()->interp;
632    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
633        return -1;
634    if (!PyCallable_Check(error)) {
635        PyErr_SetString(PyExc_TypeError, "handler must be callable");
636        return -1;
637    }
638    return PyDict_SetItemString(interp->codec_error_registry,
639                                name, error);
640}
641
642/* Lookup the error handling callback function registered under the
643   name error. As a special case NULL can be passed, in which case
644   the error handling callback for strict encoding will be returned. */
645PyObject *PyCodec_LookupError(const char *name)
646{
647    PyObject *handler = NULL;
648
649    PyInterpreterState *interp = PyThreadState_GET()->interp;
650    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
651        return NULL;
652
653    if (name==NULL)
654        name = "strict";
655    handler = PyDict_GetItemString(interp->codec_error_registry, name);
656    if (!handler)
657        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
658    else
659        Py_INCREF(handler);
660    return handler;
661}
662
663static void wrong_exception_type(PyObject *exc)
664{
665    _Py_IDENTIFIER(__class__);
666    _Py_IDENTIFIER(__name__);
667    PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
668    if (type != NULL) {
669        PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
670        Py_DECREF(type);
671        if (name != NULL) {
672            PyErr_Format(PyExc_TypeError,
673                         "don't know how to handle %S in error callback", name);
674            Py_DECREF(name);
675        }
676    }
677}
678
679PyObject *PyCodec_StrictErrors(PyObject *exc)
680{
681    if (PyExceptionInstance_Check(exc))
682        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
683    else
684        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
685    return NULL;
686}
687
688
689PyObject *PyCodec_IgnoreErrors(PyObject *exc)
690{
691    Py_ssize_t end;
692    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
693        if (PyUnicodeEncodeError_GetEnd(exc, &end))
694            return NULL;
695    }
696    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
697        if (PyUnicodeDecodeError_GetEnd(exc, &end))
698            return NULL;
699    }
700    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
701        if (PyUnicodeTranslateError_GetEnd(exc, &end))
702            return NULL;
703    }
704    else {
705        wrong_exception_type(exc);
706        return NULL;
707    }
708    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
709}
710
711
712PyObject *PyCodec_ReplaceErrors(PyObject *exc)
713{
714    Py_ssize_t start, end, i, len;
715
716    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
717        PyObject *res;
718        int kind;
719        void *data;
720        if (PyUnicodeEncodeError_GetStart(exc, &start))
721            return NULL;
722        if (PyUnicodeEncodeError_GetEnd(exc, &end))
723            return NULL;
724        len = end - start;
725        res = PyUnicode_New(len, '?');
726        if (res == NULL)
727            return NULL;
728        kind = PyUnicode_KIND(res);
729        data = PyUnicode_DATA(res);
730        for (i = 0; i < len; ++i)
731            PyUnicode_WRITE(kind, data, i, '?');
732        assert(_PyUnicode_CheckConsistency(res, 1));
733        return Py_BuildValue("(Nn)", res, end);
734    }
735    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
736        if (PyUnicodeDecodeError_GetEnd(exc, &end))
737            return NULL;
738        return Py_BuildValue("(Cn)",
739                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
740                             end);
741    }
742    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
743        PyObject *res;
744        int kind;
745        void *data;
746        if (PyUnicodeTranslateError_GetStart(exc, &start))
747            return NULL;
748        if (PyUnicodeTranslateError_GetEnd(exc, &end))
749            return NULL;
750        len = end - start;
751        res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
752        if (res == NULL)
753            return NULL;
754        kind = PyUnicode_KIND(res);
755        data = PyUnicode_DATA(res);
756        for (i=0; i < len; i++)
757            PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
758        assert(_PyUnicode_CheckConsistency(res, 1));
759        return Py_BuildValue("(Nn)", res, end);
760    }
761    else {
762        wrong_exception_type(exc);
763        return NULL;
764    }
765}
766
767PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
768{
769    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
770        PyObject *restuple;
771        PyObject *object;
772        Py_ssize_t i;
773        Py_ssize_t start;
774        Py_ssize_t end;
775        PyObject *res;
776        unsigned char *outp;
777        Py_ssize_t ressize;
778        Py_UCS4 ch;
779        if (PyUnicodeEncodeError_GetStart(exc, &start))
780            return NULL;
781        if (PyUnicodeEncodeError_GetEnd(exc, &end))
782            return NULL;
783        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
784            return NULL;
785        if (end - start > PY_SSIZE_T_MAX / (2+7+1))
786            end = start + PY_SSIZE_T_MAX / (2+7+1);
787        for (i = start, ressize = 0; i < end; ++i) {
788            /* object is guaranteed to be "ready" */
789            ch = PyUnicode_READ_CHAR(object, i);
790            if (ch<10)
791                ressize += 2+1+1;
792            else if (ch<100)
793                ressize += 2+2+1;
794            else if (ch<1000)
795                ressize += 2+3+1;
796            else if (ch<10000)
797                ressize += 2+4+1;
798            else if (ch<100000)
799                ressize += 2+5+1;
800            else if (ch<1000000)
801                ressize += 2+6+1;
802            else
803                ressize += 2+7+1;
804        }
805        /* allocate replacement */
806        res = PyUnicode_New(ressize, 127);
807        if (res == NULL) {
808            Py_DECREF(object);
809            return NULL;
810        }
811        outp = PyUnicode_1BYTE_DATA(res);
812        /* generate replacement */
813        for (i = start; i < end; ++i) {
814            int digits;
815            int base;
816            ch = PyUnicode_READ_CHAR(object, i);
817            *outp++ = '&';
818            *outp++ = '#';
819            if (ch<10) {
820                digits = 1;
821                base = 1;
822            }
823            else if (ch<100) {
824                digits = 2;
825                base = 10;
826            }
827            else if (ch<1000) {
828                digits = 3;
829                base = 100;
830            }
831            else if (ch<10000) {
832                digits = 4;
833                base = 1000;
834            }
835            else if (ch<100000) {
836                digits = 5;
837                base = 10000;
838            }
839            else if (ch<1000000) {
840                digits = 6;
841                base = 100000;
842            }
843            else {
844                digits = 7;
845                base = 1000000;
846            }
847            while (digits-->0) {
848                *outp++ = '0' + ch/base;
849                ch %= base;
850                base /= 10;
851            }
852            *outp++ = ';';
853        }
854        assert(_PyUnicode_CheckConsistency(res, 1));
855        restuple = Py_BuildValue("(Nn)", res, end);
856        Py_DECREF(object);
857        return restuple;
858    }
859    else {
860        wrong_exception_type(exc);
861        return NULL;
862    }
863}
864
865PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
866{
867    PyObject *object;
868    Py_ssize_t i;
869    Py_ssize_t start;
870    Py_ssize_t end;
871    PyObject *res;
872    unsigned char *outp;
873    int ressize;
874    Py_UCS4 c;
875
876    if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
877        unsigned char *p;
878        if (PyUnicodeDecodeError_GetStart(exc, &start))
879            return NULL;
880        if (PyUnicodeDecodeError_GetEnd(exc, &end))
881            return NULL;
882        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
883            return NULL;
884        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
885            Py_DECREF(object);
886            return NULL;
887        }
888        res = PyUnicode_New(4 * (end - start), 127);
889        if (res == NULL) {
890            Py_DECREF(object);
891            return NULL;
892        }
893        outp = PyUnicode_1BYTE_DATA(res);
894        for (i = start; i < end; i++, outp += 4) {
895            unsigned char c = p[i];
896            outp[0] = '\\';
897            outp[1] = 'x';
898            outp[2] = Py_hexdigits[(c>>4)&0xf];
899            outp[3] = Py_hexdigits[c&0xf];
900        }
901
902        assert(_PyUnicode_CheckConsistency(res, 1));
903        Py_DECREF(object);
904        return Py_BuildValue("(Nn)", res, end);
905    }
906    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
907        if (PyUnicodeEncodeError_GetStart(exc, &start))
908            return NULL;
909        if (PyUnicodeEncodeError_GetEnd(exc, &end))
910            return NULL;
911        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
912            return NULL;
913    }
914    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
915        if (PyUnicodeTranslateError_GetStart(exc, &start))
916            return NULL;
917        if (PyUnicodeTranslateError_GetEnd(exc, &end))
918            return NULL;
919        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
920            return NULL;
921    }
922    else {
923        wrong_exception_type(exc);
924        return NULL;
925    }
926
927    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
928        end = start + PY_SSIZE_T_MAX / (1+1+8);
929    for (i = start, ressize = 0; i < end; ++i) {
930        /* object is guaranteed to be "ready" */
931        c = PyUnicode_READ_CHAR(object, i);
932        if (c >= 0x10000) {
933            ressize += 1+1+8;
934        }
935        else if (c >= 0x100) {
936            ressize += 1+1+4;
937        }
938        else
939            ressize += 1+1+2;
940    }
941    res = PyUnicode_New(ressize, 127);
942    if (res == NULL) {
943        Py_DECREF(object);
944        return NULL;
945    }
946    outp = PyUnicode_1BYTE_DATA(res);
947    for (i = start; i < end; ++i) {
948        c = PyUnicode_READ_CHAR(object, i);
949        *outp++ = '\\';
950        if (c >= 0x00010000) {
951            *outp++ = 'U';
952            *outp++ = Py_hexdigits[(c>>28)&0xf];
953            *outp++ = Py_hexdigits[(c>>24)&0xf];
954            *outp++ = Py_hexdigits[(c>>20)&0xf];
955            *outp++ = Py_hexdigits[(c>>16)&0xf];
956            *outp++ = Py_hexdigits[(c>>12)&0xf];
957            *outp++ = Py_hexdigits[(c>>8)&0xf];
958        }
959        else if (c >= 0x100) {
960            *outp++ = 'u';
961            *outp++ = Py_hexdigits[(c>>12)&0xf];
962            *outp++ = Py_hexdigits[(c>>8)&0xf];
963        }
964        else
965            *outp++ = 'x';
966        *outp++ = Py_hexdigits[(c>>4)&0xf];
967        *outp++ = Py_hexdigits[c&0xf];
968    }
969
970    assert(_PyUnicode_CheckConsistency(res, 1));
971    Py_DECREF(object);
972    return Py_BuildValue("(Nn)", res, end);
973}
974
975static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
976static int ucnhash_initialized = 0;
977
978PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
979{
980    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
981        PyObject *restuple;
982        PyObject *object;
983        Py_ssize_t i;
984        Py_ssize_t start;
985        Py_ssize_t end;
986        PyObject *res;
987        unsigned char *outp;
988        Py_ssize_t ressize;
989        int replsize;
990        Py_UCS4 c;
991        char buffer[256]; /* NAME_MAXLEN */
992        if (PyUnicodeEncodeError_GetStart(exc, &start))
993            return NULL;
994        if (PyUnicodeEncodeError_GetEnd(exc, &end))
995            return NULL;
996        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
997            return NULL;
998        if (!ucnhash_initialized) {
999            /* load the unicode data module */
1000            ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
1001                                            PyUnicodeData_CAPSULE_NAME, 1);
1002            ucnhash_initialized = 1;
1003        }
1004        for (i = start, ressize = 0; i < end; ++i) {
1005            /* object is guaranteed to be "ready" */
1006            c = PyUnicode_READ_CHAR(object, i);
1007            if (ucnhash_CAPI &&
1008                ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1009                replsize = 1+1+1+(int)strlen(buffer)+1;
1010            }
1011            else if (c >= 0x10000) {
1012                replsize = 1+1+8;
1013            }
1014            else if (c >= 0x100) {
1015                replsize = 1+1+4;
1016            }
1017            else
1018                replsize = 1+1+2;
1019            if (ressize > PY_SSIZE_T_MAX - replsize)
1020                break;
1021            ressize += replsize;
1022        }
1023        end = i;
1024        res = PyUnicode_New(ressize, 127);
1025        if (res==NULL)
1026            return NULL;
1027        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1028            i < end; ++i) {
1029            c = PyUnicode_READ_CHAR(object, i);
1030            *outp++ = '\\';
1031            if (ucnhash_CAPI &&
1032                ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1033                *outp++ = 'N';
1034                *outp++ = '{';
1035                strcpy((char *)outp, buffer);
1036                outp += strlen(buffer);
1037                *outp++ = '}';
1038                continue;
1039            }
1040            if (c >= 0x00010000) {
1041                *outp++ = 'U';
1042                *outp++ = Py_hexdigits[(c>>28)&0xf];
1043                *outp++ = Py_hexdigits[(c>>24)&0xf];
1044                *outp++ = Py_hexdigits[(c>>20)&0xf];
1045                *outp++ = Py_hexdigits[(c>>16)&0xf];
1046                *outp++ = Py_hexdigits[(c>>12)&0xf];
1047                *outp++ = Py_hexdigits[(c>>8)&0xf];
1048            }
1049            else if (c >= 0x100) {
1050                *outp++ = 'u';
1051                *outp++ = Py_hexdigits[(c>>12)&0xf];
1052                *outp++ = Py_hexdigits[(c>>8)&0xf];
1053            }
1054            else
1055                *outp++ = 'x';
1056            *outp++ = Py_hexdigits[(c>>4)&0xf];
1057            *outp++ = Py_hexdigits[c&0xf];
1058        }
1059
1060        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1061        assert(_PyUnicode_CheckConsistency(res, 1));
1062        restuple = Py_BuildValue("(Nn)", res, end);
1063        Py_DECREF(object);
1064        return restuple;
1065    }
1066    else {
1067        wrong_exception_type(exc);
1068        return NULL;
1069    }
1070}
1071
/* Encoding identifiers returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Map an encoding name to one of the ENC_* codes.

   Recognized names are "utf" (any case), an optional '-' or '_', then
   "8", "16" or "32".  "16" and "32" may be followed by an optional '-' or
   '_' and "be" or "le" (any case); without such a suffix the native byte
   order selected by WORDS_BIGENDIAN is used.  The exact name "CP_UTF8" is
   accepted as UTF-8 as well.

   On success *bytelength receives the number of bytes used for one
   surrogate (3 for UTF-8, 2 for UTF-16, 4 for UTF-32).  ENC_UNKNOWN is
   returned for every other name; *bytelength may still have been written
   in that case. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* encoding[0] may be the terminating NUL here (e.g. "utf-16-");
               check it first so encoding[1] is never read past the end. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16: do not look behind the NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1137
1138/* This handler is declared static until someone demonstrates
1139   a need to call it directly. */
1140static PyObject *
1141PyCodec_SurrogatePassErrors(PyObject *exc)
1142{
1143    PyObject *restuple;
1144    PyObject *object;
1145    PyObject *encode;
1146    char *encoding;
1147    int code;
1148    int bytelength;
1149    Py_ssize_t i;
1150    Py_ssize_t start;
1151    Py_ssize_t end;
1152    PyObject *res;
1153    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
1154        unsigned char *outp;
1155        if (PyUnicodeEncodeError_GetStart(exc, &start))
1156            return NULL;
1157        if (PyUnicodeEncodeError_GetEnd(exc, &end))
1158            return NULL;
1159        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1160            return NULL;
1161        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1162            Py_DECREF(object);
1163            return NULL;
1164        }
1165        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1166            Py_DECREF(object);
1167            Py_DECREF(encode);
1168            return NULL;
1169        }
1170        code = get_standard_encoding(encoding, &bytelength);
1171        Py_DECREF(encode);
1172        if (code == ENC_UNKNOWN) {
1173            /* Not supported, fail with original exception */
1174            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1175            Py_DECREF(object);
1176            return NULL;
1177        }
1178
1179        if (end - start > PY_SSIZE_T_MAX / bytelength)
1180            end = start + PY_SSIZE_T_MAX / bytelength;
1181        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1182        if (!res) {
1183            Py_DECREF(object);
1184            return NULL;
1185        }
1186        outp = (unsigned char*)PyBytes_AsString(res);
1187        for (i = start; i < end; i++) {
1188            /* object is guaranteed to be "ready" */
1189            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1190            if (!Py_UNICODE_IS_SURROGATE(ch)) {
1191                /* Not a surrogate, fail with original exception */
1192                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1193                Py_DECREF(res);
1194                Py_DECREF(object);
1195                return NULL;
1196            }
1197            switch (code) {
1198            case ENC_UTF8:
1199                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1200                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1201                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1202                break;
1203            case ENC_UTF16LE:
1204                *outp++ = (unsigned char) ch;
1205                *outp++ = (unsigned char)(ch >> 8);
1206                break;
1207            case ENC_UTF16BE:
1208                *outp++ = (unsigned char)(ch >> 8);
1209                *outp++ = (unsigned char) ch;
1210                break;
1211            case ENC_UTF32LE:
1212                *outp++ = (unsigned char) ch;
1213                *outp++ = (unsigned char)(ch >> 8);
1214                *outp++ = (unsigned char)(ch >> 16);
1215                *outp++ = (unsigned char)(ch >> 24);
1216                break;
1217            case ENC_UTF32BE:
1218                *outp++ = (unsigned char)(ch >> 24);
1219                *outp++ = (unsigned char)(ch >> 16);
1220                *outp++ = (unsigned char)(ch >> 8);
1221                *outp++ = (unsigned char) ch;
1222                break;
1223            }
1224        }
1225        restuple = Py_BuildValue("(On)", res, end);
1226        Py_DECREF(res);
1227        Py_DECREF(object);
1228        return restuple;
1229    }
1230    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
1231        unsigned char *p;
1232        Py_UCS4 ch = 0;
1233        if (PyUnicodeDecodeError_GetStart(exc, &start))
1234            return NULL;
1235        if (PyUnicodeDecodeError_GetEnd(exc, &end))
1236            return NULL;
1237        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1238            return NULL;
1239        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1240            Py_DECREF(object);
1241            return NULL;
1242        }
1243        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1244            Py_DECREF(object);
1245            return NULL;
1246        }
1247        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1248            Py_DECREF(object);
1249            Py_DECREF(encode);
1250            return NULL;
1251        }
1252        code = get_standard_encoding(encoding, &bytelength);
1253        Py_DECREF(encode);
1254        if (code == ENC_UNKNOWN) {
1255            /* Not supported, fail with original exception */
1256            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1257            Py_DECREF(object);
1258            return NULL;
1259        }
1260
1261        /* Try decoding a single surrogate character. If
1262           there are more, let the codec call us again. */
1263        p += start;
1264        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1265            switch (code) {
1266            case ENC_UTF8:
1267                if ((p[0] & 0xf0) == 0xe0 &&
1268                    (p[1] & 0xc0) == 0x80 &&
1269                    (p[2] & 0xc0) == 0x80) {
1270                    /* it's a three-byte code */
1271                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1272                }
1273                break;
1274            case ENC_UTF16LE:
1275                ch = p[1] << 8 | p[0];
1276                break;
1277            case ENC_UTF16BE:
1278                ch = p[0] << 8 | p[1];
1279                break;
1280            case ENC_UTF32LE:
1281                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1282                break;
1283            case ENC_UTF32BE:
1284                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1285                break;
1286            }
1287        }
1288
1289        Py_DECREF(object);
1290        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1291            /* it's not a surrogate - fail */
1292            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1293            return NULL;
1294        }
1295        res = PyUnicode_FromOrdinal(ch);
1296        if (res == NULL)
1297            return NULL;
1298        return Py_BuildValue("(Nn)", res, start + bytelength);
1299    }
1300    else {
1301        wrong_exception_type(exc);
1302        return NULL;
1303    }
1304}
1305
1306static PyObject *
1307PyCodec_SurrogateEscapeErrors(PyObject *exc)
1308{
1309    PyObject *restuple;
1310    PyObject *object;
1311    Py_ssize_t i;
1312    Py_ssize_t start;
1313    Py_ssize_t end;
1314    PyObject *res;
1315    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
1316        char *outp;
1317        if (PyUnicodeEncodeError_GetStart(exc, &start))
1318            return NULL;
1319        if (PyUnicodeEncodeError_GetEnd(exc, &end))
1320            return NULL;
1321        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1322            return NULL;
1323        res = PyBytes_FromStringAndSize(NULL, end-start);
1324        if (!res) {
1325            Py_DECREF(object);
1326            return NULL;
1327        }
1328        outp = PyBytes_AsString(res);
1329        for (i = start; i < end; i++) {
1330            /* object is guaranteed to be "ready" */
1331            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1332            if (ch < 0xdc80 || ch > 0xdcff) {
1333                /* Not a UTF-8b surrogate, fail with original exception */
1334                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1335                Py_DECREF(res);
1336                Py_DECREF(object);
1337                return NULL;
1338            }
1339            *outp++ = ch - 0xdc00;
1340        }
1341        restuple = Py_BuildValue("(On)", res, end);
1342        Py_DECREF(res);
1343        Py_DECREF(object);
1344        return restuple;
1345    }
1346    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
1347        PyObject *str;
1348        unsigned char *p;
1349        Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1350        int consumed = 0;
1351        if (PyUnicodeDecodeError_GetStart(exc, &start))
1352            return NULL;
1353        if (PyUnicodeDecodeError_GetEnd(exc, &end))
1354            return NULL;
1355        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1356            return NULL;
1357        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1358            Py_DECREF(object);
1359            return NULL;
1360        }
1361        while (consumed < 4 && consumed < end-start) {
1362            /* Refuse to escape ASCII bytes. */
1363            if (p[start+consumed] < 128)
1364                break;
1365            ch[consumed] = 0xdc00 + p[start+consumed];
1366            consumed++;
1367        }
1368        Py_DECREF(object);
1369        if (!consumed) {
1370            /* codec complained about ASCII byte. */
1371            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1372            return NULL;
1373        }
1374        str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1375        if (str == NULL)
1376            return NULL;
1377        return Py_BuildValue("(Nn)", str, start+consumed);
1378    }
1379    else {
1380        wrong_exception_type(exc);
1381        return NULL;
1382    }
1383}
1384
1385
1386static PyObject *strict_errors(PyObject *self, PyObject *exc)
1387{
1388    return PyCodec_StrictErrors(exc);
1389}
1390
1391
1392static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1393{
1394    return PyCodec_IgnoreErrors(exc);
1395}
1396
1397
1398static PyObject *replace_errors(PyObject *self, PyObject *exc)
1399{
1400    return PyCodec_ReplaceErrors(exc);
1401}
1402
1403
1404static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1405{
1406    return PyCodec_XMLCharRefReplaceErrors(exc);
1407}
1408
1409
1410static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1411{
1412    return PyCodec_BackslashReplaceErrors(exc);
1413}
1414
1415static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1416{
1417    return PyCodec_NameReplaceErrors(exc);
1418}
1419
1420static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
1421{
1422    return PyCodec_SurrogatePassErrors(exc);
1423}
1424
1425static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
1426{
1427    return PyCodec_SurrogateEscapeErrors(exc);
1428}
1429
1430static int _PyCodecRegistry_Init(void)
1431{
1432    static struct {
1433        char *name;
1434        PyMethodDef def;
1435    } methods[] =
1436    {
1437        {
1438            "strict",
1439            {
1440                "strict_errors",
1441                strict_errors,
1442                METH_O,
1443                PyDoc_STR("Implements the 'strict' error handling, which "
1444                          "raises a UnicodeError on coding errors.")
1445            }
1446        },
1447        {
1448            "ignore",
1449            {
1450                "ignore_errors",
1451                ignore_errors,
1452                METH_O,
1453                PyDoc_STR("Implements the 'ignore' error handling, which "
1454                          "ignores malformed data and continues.")
1455            }
1456        },
1457        {
1458            "replace",
1459            {
1460                "replace_errors",
1461                replace_errors,
1462                METH_O,
1463                PyDoc_STR("Implements the 'replace' error handling, which "
1464                          "replaces malformed data with a replacement marker.")
1465            }
1466        },
1467        {
1468            "xmlcharrefreplace",
1469            {
1470                "xmlcharrefreplace_errors",
1471                xmlcharrefreplace_errors,
1472                METH_O,
1473                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1474                          "which replaces an unencodable character with the "
1475                          "appropriate XML character reference.")
1476            }
1477        },
1478        {
1479            "backslashreplace",
1480            {
1481                "backslashreplace_errors",
1482                backslashreplace_errors,
1483                METH_O,
1484                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1485                          "which replaces malformed data with a backslashed "
1486                          "escape sequence.")
1487            }
1488        },
1489        {
1490            "namereplace",
1491            {
1492                "namereplace_errors",
1493                namereplace_errors,
1494                METH_O,
1495                PyDoc_STR("Implements the 'namereplace' error handling, "
1496                          "which replaces an unencodable character with a "
1497                          "\\N{...} escape sequence.")
1498            }
1499        },
1500        {
1501            "surrogatepass",
1502            {
1503                "surrogatepass",
1504                surrogatepass_errors,
1505                METH_O
1506            }
1507        },
1508        {
1509            "surrogateescape",
1510            {
1511                "surrogateescape",
1512                surrogateescape_errors,
1513                METH_O
1514            }
1515        }
1516    };
1517
1518    PyInterpreterState *interp = PyThreadState_GET()->interp;
1519    PyObject *mod;
1520    unsigned i;
1521
1522    if (interp->codec_search_path != NULL)
1523        return 0;
1524
1525    interp->codec_search_path = PyList_New(0);
1526    interp->codec_search_cache = PyDict_New();
1527    interp->codec_error_registry = PyDict_New();
1528
1529    if (interp->codec_error_registry) {
1530        for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1531            PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1532            int res;
1533            if (!func)
1534                Py_FatalError("can't initialize codec error registry");
1535            res = PyCodec_RegisterError(methods[i].name, func);
1536            Py_DECREF(func);
1537            if (res)
1538                Py_FatalError("can't initialize codec error registry");
1539        }
1540    }
1541
1542    if (interp->codec_search_path == NULL ||
1543        interp->codec_search_cache == NULL ||
1544        interp->codec_error_registry == NULL)
1545        Py_FatalError("can't initialize codec registry");
1546
1547    mod = PyImport_ImportModuleNoBlock("encodings");
1548    if (mod == NULL) {
1549        return -1;
1550    }
1551    Py_DECREF(mod);
1552    interp->codecs_initialized = 1;
1553    return 0;
1554}
1555