1/* ------------------------------------------------------------------------
2
3   Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9   ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include "ucnhash.h"
13#include <ctype.h>
14
/* Lowercase hexadecimal digit characters, indexed by nibble value (0-15).
   Used by the backslashreplace/namereplace handlers in this file to emit
   \x, \u and \U escapes. */
const char *Py_hexdigits = "0123456789abcdef";
16
17/* --- Codec Registry ----------------------------------------------------- */
18
/* Import the standard encodings package, which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not slow
   down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one attempt
   is made.

*/
29
30static int _PyCodecRegistry_Init(void); /* Forward */
31
32int PyCodec_Register(PyObject *search_function)
33{
34    PyInterpreterState *interp = PyThreadState_GET()->interp;
35    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
36        goto onError;
37    if (search_function == NULL) {
38        PyErr_BadArgument();
39        goto onError;
40    }
41    if (!PyCallable_Check(search_function)) {
42        PyErr_SetString(PyExc_TypeError, "argument must be callable");
43        goto onError;
44    }
45    return PyList_Append(interp->codec_search_path, search_function);
46
47 onError:
48    return -1;
49}
50
51/* Convert a string to a normalized Python string: all characters are
52   converted to lower case, spaces are replaced with underscores. */
53
54static
55PyObject *normalizestring(const char *string)
56{
57    size_t i;
58    size_t len = strlen(string);
59    char *p;
60    PyObject *v;
61
62    if (len > PY_SSIZE_T_MAX) {
63        PyErr_SetString(PyExc_OverflowError, "string is too large");
64        return NULL;
65    }
66
67    p = PyMem_Malloc(len + 1);
68    if (p == NULL)
69        return PyErr_NoMemory();
70    for (i = 0; i < len; i++) {
71        char ch = string[i];
72        if (ch == ' ')
73            ch = '-';
74        else
75            ch = Py_TOLOWER(Py_CHARMASK(ch));
76        p[i] = ch;
77    }
78    p[i] = '\0';
79    v = PyUnicode_FromString(p);
80    if (v == NULL)
81        return NULL;
82    PyMem_Free(p);
83    return v;
84}
85
86/* Lookup the given encoding and return a tuple providing the codec
87   facilities.
88
89   The encoding string is looked up converted to all lower-case
90   characters. This makes encodings looked up through this mechanism
91   effectively case-insensitive.
92
93   If no codec is found, a LookupError is set and NULL returned.
94
95   As side effect, this tries to load the encodings package, if not
96   yet done. This is part of the lazy load strategy for the encodings
97   package.
98
99*/
100
101PyObject *_PyCodec_Lookup(const char *encoding)
102{
103    PyInterpreterState *interp;
104    PyObject *result, *args = NULL, *v;
105    Py_ssize_t i, len;
106
107    if (encoding == NULL) {
108        PyErr_BadArgument();
109        goto onError;
110    }
111
112    interp = PyThreadState_GET()->interp;
113    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
114        goto onError;
115
116    /* Convert the encoding to a normalized Python string: all
117       characters are converted to lower case, spaces and hyphens are
118       replaced with underscores. */
119    v = normalizestring(encoding);
120    if (v == NULL)
121        goto onError;
122    PyUnicode_InternInPlace(&v);
123
124    /* First, try to lookup the name in the registry dictionary */
125    result = PyDict_GetItem(interp->codec_search_cache, v);
126    if (result != NULL) {
127        Py_INCREF(result);
128        Py_DECREF(v);
129        return result;
130    }
131
132    /* Next, scan the search functions in order of registration */
133    args = PyTuple_New(1);
134    if (args == NULL)
135        goto onError;
136    PyTuple_SET_ITEM(args,0,v);
137
138    len = PyList_Size(interp->codec_search_path);
139    if (len < 0)
140        goto onError;
141    if (len == 0) {
142        PyErr_SetString(PyExc_LookupError,
143                        "no codec search functions registered: "
144                        "can't find encoding");
145        goto onError;
146    }
147
148    for (i = 0; i < len; i++) {
149        PyObject *func;
150
151        func = PyList_GetItem(interp->codec_search_path, i);
152        if (func == NULL)
153            goto onError;
154        result = PyEval_CallObject(func, args);
155        if (result == NULL)
156            goto onError;
157        if (result == Py_None) {
158            Py_DECREF(result);
159            continue;
160        }
161        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
162            PyErr_SetString(PyExc_TypeError,
163                            "codec search functions must return 4-tuples");
164            Py_DECREF(result);
165            goto onError;
166        }
167        break;
168    }
169    if (i == len) {
170        /* XXX Perhaps we should cache misses too ? */
171        PyErr_Format(PyExc_LookupError,
172                     "unknown encoding: %s", encoding);
173        goto onError;
174    }
175
176    /* Cache and return the result */
177    if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
178        Py_DECREF(result);
179        goto onError;
180    }
181    Py_DECREF(args);
182    return result;
183
184 onError:
185    Py_XDECREF(args);
186    return NULL;
187}
188
189int _PyCodec_Forget(const char *encoding)
190{
191    PyInterpreterState *interp;
192    PyObject *v;
193    int result;
194
195    interp = PyThreadState_GET()->interp;
196    if (interp->codec_search_path == NULL) {
197        return -1;
198    }
199
200    /* Convert the encoding to a normalized Python string: all
201       characters are converted to lower case, spaces and hyphens are
202       replaced with underscores. */
203    v = normalizestring(encoding);
204    if (v == NULL) {
205        return -1;
206    }
207
208    /* Drop the named codec from the internal cache */
209    result = PyDict_DelItem(interp->codec_search_cache, v);
210    Py_DECREF(v);
211
212    return result;
213}
214
215/* Codec registry encoding check API. */
216
217int PyCodec_KnownEncoding(const char *encoding)
218{
219    PyObject *codecs;
220
221    codecs = _PyCodec_Lookup(encoding);
222    if (!codecs) {
223        PyErr_Clear();
224        return 0;
225    }
226    else {
227        Py_DECREF(codecs);
228        return 1;
229    }
230}
231
232static
233PyObject *args_tuple(PyObject *object,
234                     const char *errors)
235{
236    PyObject *args;
237
238    args = PyTuple_New(1 + (errors != NULL));
239    if (args == NULL)
240        return NULL;
241    Py_INCREF(object);
242    PyTuple_SET_ITEM(args,0,object);
243    if (errors) {
244        PyObject *v;
245
246        v = PyUnicode_FromString(errors);
247        if (v == NULL) {
248            Py_DECREF(args);
249            return NULL;
250        }
251        PyTuple_SET_ITEM(args, 1, v);
252    }
253    return args;
254}
255
256/* Helper function to get a codec item */
257
258static
259PyObject *codec_getitem(const char *encoding, int index)
260{
261    PyObject *codecs;
262    PyObject *v;
263
264    codecs = _PyCodec_Lookup(encoding);
265    if (codecs == NULL)
266        return NULL;
267    v = PyTuple_GET_ITEM(codecs, index);
268    Py_DECREF(codecs);
269    Py_INCREF(v);
270    return v;
271}
272
273/* Helper functions to create an incremental codec. */
274static
275PyObject *codec_makeincrementalcodec(PyObject *codec_info,
276                                     const char *errors,
277                                     const char *attrname)
278{
279    PyObject *ret, *inccodec;
280
281    inccodec = PyObject_GetAttrString(codec_info, attrname);
282    if (inccodec == NULL)
283        return NULL;
284    if (errors)
285        ret = PyObject_CallFunction(inccodec, "s", errors);
286    else
287        ret = PyObject_CallFunction(inccodec, NULL);
288    Py_DECREF(inccodec);
289    return ret;
290}
291
292static
293PyObject *codec_getincrementalcodec(const char *encoding,
294                                    const char *errors,
295                                    const char *attrname)
296{
297    PyObject *codec_info, *ret;
298
299    codec_info = _PyCodec_Lookup(encoding);
300    if (codec_info == NULL)
301        return NULL;
302    ret = codec_makeincrementalcodec(codec_info, errors, attrname);
303    Py_DECREF(codec_info);
304    return ret;
305}
306
307/* Helper function to create a stream codec. */
308
309static
310PyObject *codec_getstreamcodec(const char *encoding,
311                               PyObject *stream,
312                               const char *errors,
313                               const int index)
314{
315    PyObject *codecs, *streamcodec, *codeccls;
316
317    codecs = _PyCodec_Lookup(encoding);
318    if (codecs == NULL)
319        return NULL;
320
321    codeccls = PyTuple_GET_ITEM(codecs, index);
322    if (errors != NULL)
323        streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
324    else
325        streamcodec = PyObject_CallFunction(codeccls, "O", stream);
326    Py_DECREF(codecs);
327    return streamcodec;
328}
329
330/* Helpers to work with the result of _PyCodec_Lookup
331
332 */
333PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
334                                             const char *errors)
335{
336    return codec_makeincrementalcodec(codec_info, errors,
337                                      "incrementaldecoder");
338}
339
340PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
341                                             const char *errors)
342{
343    return codec_makeincrementalcodec(codec_info, errors,
344                                      "incrementalencoder");
345}
346
347
348/* Convenience APIs to query the Codec registry.
349
350   All APIs return a codec object with incremented refcount.
351
352 */
353
354PyObject *PyCodec_Encoder(const char *encoding)
355{
356    return codec_getitem(encoding, 0);
357}
358
359PyObject *PyCodec_Decoder(const char *encoding)
360{
361    return codec_getitem(encoding, 1);
362}
363
364PyObject *PyCodec_IncrementalEncoder(const char *encoding,
365                                     const char *errors)
366{
367    return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
368}
369
370PyObject *PyCodec_IncrementalDecoder(const char *encoding,
371                                     const char *errors)
372{
373    return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
374}
375
376PyObject *PyCodec_StreamReader(const char *encoding,
377                               PyObject *stream,
378                               const char *errors)
379{
380    return codec_getstreamcodec(encoding, stream, errors, 2);
381}
382
383PyObject *PyCodec_StreamWriter(const char *encoding,
384                               PyObject *stream,
385                               const char *errors)
386{
387    return codec_getstreamcodec(encoding, stream, errors, 3);
388}
389
/* Try to make the active exception's chain name the codec whose invocation
 * failed, without changing the type of the exception being raised.
 *
 * operation is a short verb phrase (callers pass "encoding" or
 * "decoding"); encoding is the codec name as given by the caller.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* _PyErr_TrySetFromCause swaps in a suitably updated clone of the
     * active exception when it can; otherwise the original exception is
     * left untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
405
406/* Encode an object (e.g. a Unicode object) using the given encoding
407   and return the resulting encoded object (usually a Python string).
408
409   errors is passed to the encoder factory as argument if non-NULL. */
410
411static PyObject *
412_PyCodec_EncodeInternal(PyObject *object,
413                        PyObject *encoder,
414                        const char *encoding,
415                        const char *errors)
416{
417    PyObject *args = NULL, *result = NULL;
418    PyObject *v = NULL;
419
420    args = args_tuple(object, errors);
421    if (args == NULL)
422        goto onError;
423
424    result = PyEval_CallObject(encoder, args);
425    if (result == NULL) {
426        wrap_codec_error("encoding", encoding);
427        goto onError;
428    }
429
430    if (!PyTuple_Check(result) ||
431        PyTuple_GET_SIZE(result) != 2) {
432        PyErr_SetString(PyExc_TypeError,
433                        "encoder must return a tuple (object, integer)");
434        goto onError;
435    }
436    v = PyTuple_GET_ITEM(result,0);
437    Py_INCREF(v);
438    /* We don't check or use the second (integer) entry. */
439
440    Py_DECREF(args);
441    Py_DECREF(encoder);
442    Py_DECREF(result);
443    return v;
444
445 onError:
446    Py_XDECREF(result);
447    Py_XDECREF(args);
448    Py_XDECREF(encoder);
449    return NULL;
450}
451
452/* Decode an object (usually a Python string) using the given encoding
453   and return an equivalent object (e.g. a Unicode object).
454
455   errors is passed to the decoder factory as argument if non-NULL. */
456
457static PyObject *
458_PyCodec_DecodeInternal(PyObject *object,
459                        PyObject *decoder,
460                        const char *encoding,
461                        const char *errors)
462{
463    PyObject *args = NULL, *result = NULL;
464    PyObject *v;
465
466    args = args_tuple(object, errors);
467    if (args == NULL)
468        goto onError;
469
470    result = PyEval_CallObject(decoder,args);
471    if (result == NULL) {
472        wrap_codec_error("decoding", encoding);
473        goto onError;
474    }
475    if (!PyTuple_Check(result) ||
476        PyTuple_GET_SIZE(result) != 2) {
477        PyErr_SetString(PyExc_TypeError,
478                        "decoder must return a tuple (object,integer)");
479        goto onError;
480    }
481    v = PyTuple_GET_ITEM(result,0);
482    Py_INCREF(v);
483    /* We don't check or use the second (integer) entry. */
484
485    Py_DECREF(args);
486    Py_DECREF(decoder);
487    Py_DECREF(result);
488    return v;
489
490 onError:
491    Py_XDECREF(args);
492    Py_XDECREF(decoder);
493    Py_XDECREF(result);
494    return NULL;
495}
496
497/* Generic encoding/decoding API */
498PyObject *PyCodec_Encode(PyObject *object,
499                         const char *encoding,
500                         const char *errors)
501{
502    PyObject *encoder;
503
504    encoder = PyCodec_Encoder(encoding);
505    if (encoder == NULL)
506        return NULL;
507
508    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
509}
510
511PyObject *PyCodec_Decode(PyObject *object,
512                         const char *encoding,
513                         const char *errors)
514{
515    PyObject *decoder;
516
517    decoder = PyCodec_Decoder(encoding);
518    if (decoder == NULL)
519        return NULL;
520
521    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
522}
523
524/* Text encoding/decoding API */
525PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
526                                       const char *alternate_command)
527{
528    _Py_IDENTIFIER(_is_text_encoding);
529    PyObject *codec;
530    PyObject *attr;
531    int is_text_codec;
532
533    codec = _PyCodec_Lookup(encoding);
534    if (codec == NULL)
535        return NULL;
536
537    /* Backwards compatibility: assume any raw tuple describes a text
538     * encoding, and the same for anything lacking the private
539     * attribute.
540     */
541    if (!PyTuple_CheckExact(codec)) {
542        attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
543        if (attr == NULL) {
544            if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
545                PyErr_Clear();
546            } else {
547                Py_DECREF(codec);
548                return NULL;
549            }
550        } else {
551            is_text_codec = PyObject_IsTrue(attr);
552            Py_DECREF(attr);
553            if (is_text_codec <= 0) {
554                Py_DECREF(codec);
555                if (!is_text_codec)
556                    PyErr_Format(PyExc_LookupError,
557                                 "'%.400s' is not a text encoding; "
558                                 "use %s to handle arbitrary codecs",
559                                 encoding, alternate_command);
560                return NULL;
561            }
562        }
563    }
564
565    /* This appears to be a valid text encoding */
566    return codec;
567}
568
569
570static
571PyObject *codec_getitem_checked(const char *encoding,
572                                const char *alternate_command,
573                                int index)
574{
575    PyObject *codec;
576    PyObject *v;
577
578    codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
579    if (codec == NULL)
580        return NULL;
581
582    v = PyTuple_GET_ITEM(codec, index);
583    Py_INCREF(v);
584    Py_DECREF(codec);
585    return v;
586}
587
588static PyObject * _PyCodec_TextEncoder(const char *encoding)
589{
590    return codec_getitem_checked(encoding, "codecs.encode()", 0);
591}
592
593static PyObject * _PyCodec_TextDecoder(const char *encoding)
594{
595    return codec_getitem_checked(encoding, "codecs.decode()", 1);
596}
597
598PyObject *_PyCodec_EncodeText(PyObject *object,
599                              const char *encoding,
600                              const char *errors)
601{
602    PyObject *encoder;
603
604    encoder = _PyCodec_TextEncoder(encoding);
605    if (encoder == NULL)
606        return NULL;
607
608    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
609}
610
611PyObject *_PyCodec_DecodeText(PyObject *object,
612                              const char *encoding,
613                              const char *errors)
614{
615    PyObject *decoder;
616
617    decoder = _PyCodec_TextDecoder(encoding);
618    if (decoder == NULL)
619        return NULL;
620
621    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
622}
623
624/* Register the error handling callback function error under the name
625   name. This function will be called by the codec when it encounters
626   an unencodable characters/undecodable bytes and doesn't know the
627   callback name, when name is specified as the error parameter
628   in the call to the encode/decode function.
629   Return 0 on success, -1 on error */
630int PyCodec_RegisterError(const char *name, PyObject *error)
631{
632    PyInterpreterState *interp = PyThreadState_GET()->interp;
633    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
634        return -1;
635    if (!PyCallable_Check(error)) {
636        PyErr_SetString(PyExc_TypeError, "handler must be callable");
637        return -1;
638    }
639    return PyDict_SetItemString(interp->codec_error_registry,
640                                name, error);
641}
642
643/* Lookup the error handling callback function registered under the
644   name error. As a special case NULL can be passed, in which case
645   the error handling callback for strict encoding will be returned. */
646PyObject *PyCodec_LookupError(const char *name)
647{
648    PyObject *handler = NULL;
649
650    PyInterpreterState *interp = PyThreadState_GET()->interp;
651    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
652        return NULL;
653
654    if (name==NULL)
655        name = "strict";
656    handler = PyDict_GetItemString(interp->codec_error_registry, name);
657    if (!handler)
658        PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
659    else
660        Py_INCREF(handler);
661    return handler;
662}
663
664static void wrong_exception_type(PyObject *exc)
665{
666    PyErr_Format(PyExc_TypeError,
667                 "don't know how to handle %.200s in error callback",
668                 exc->ob_type->tp_name);
669}
670
671PyObject *PyCodec_StrictErrors(PyObject *exc)
672{
673    if (PyExceptionInstance_Check(exc))
674        PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
675    else
676        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
677    return NULL;
678}
679
680
681PyObject *PyCodec_IgnoreErrors(PyObject *exc)
682{
683    Py_ssize_t end;
684
685    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
686        if (PyUnicodeEncodeError_GetEnd(exc, &end))
687            return NULL;
688    }
689    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
690        if (PyUnicodeDecodeError_GetEnd(exc, &end))
691            return NULL;
692    }
693    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
694        if (PyUnicodeTranslateError_GetEnd(exc, &end))
695            return NULL;
696    }
697    else {
698        wrong_exception_type(exc);
699        return NULL;
700    }
701    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
702}
703
704
705PyObject *PyCodec_ReplaceErrors(PyObject *exc)
706{
707    Py_ssize_t start, end, i, len;
708
709    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
710        PyObject *res;
711        int kind;
712        void *data;
713        if (PyUnicodeEncodeError_GetStart(exc, &start))
714            return NULL;
715        if (PyUnicodeEncodeError_GetEnd(exc, &end))
716            return NULL;
717        len = end - start;
718        res = PyUnicode_New(len, '?');
719        if (res == NULL)
720            return NULL;
721        kind = PyUnicode_KIND(res);
722        data = PyUnicode_DATA(res);
723        for (i = 0; i < len; ++i)
724            PyUnicode_WRITE(kind, data, i, '?');
725        assert(_PyUnicode_CheckConsistency(res, 1));
726        return Py_BuildValue("(Nn)", res, end);
727    }
728    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
729        if (PyUnicodeDecodeError_GetEnd(exc, &end))
730            return NULL;
731        return Py_BuildValue("(Cn)",
732                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
733                             end);
734    }
735    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
736        PyObject *res;
737        int kind;
738        void *data;
739        if (PyUnicodeTranslateError_GetStart(exc, &start))
740            return NULL;
741        if (PyUnicodeTranslateError_GetEnd(exc, &end))
742            return NULL;
743        len = end - start;
744        res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
745        if (res == NULL)
746            return NULL;
747        kind = PyUnicode_KIND(res);
748        data = PyUnicode_DATA(res);
749        for (i=0; i < len; i++)
750            PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
751        assert(_PyUnicode_CheckConsistency(res, 1));
752        return Py_BuildValue("(Nn)", res, end);
753    }
754    else {
755        wrong_exception_type(exc);
756        return NULL;
757    }
758}
759
760PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
761{
762    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
763        PyObject *restuple;
764        PyObject *object;
765        Py_ssize_t i;
766        Py_ssize_t start;
767        Py_ssize_t end;
768        PyObject *res;
769        unsigned char *outp;
770        Py_ssize_t ressize;
771        Py_UCS4 ch;
772        if (PyUnicodeEncodeError_GetStart(exc, &start))
773            return NULL;
774        if (PyUnicodeEncodeError_GetEnd(exc, &end))
775            return NULL;
776        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
777            return NULL;
778        if (end - start > PY_SSIZE_T_MAX / (2+7+1))
779            end = start + PY_SSIZE_T_MAX / (2+7+1);
780        for (i = start, ressize = 0; i < end; ++i) {
781            /* object is guaranteed to be "ready" */
782            ch = PyUnicode_READ_CHAR(object, i);
783            if (ch<10)
784                ressize += 2+1+1;
785            else if (ch<100)
786                ressize += 2+2+1;
787            else if (ch<1000)
788                ressize += 2+3+1;
789            else if (ch<10000)
790                ressize += 2+4+1;
791            else if (ch<100000)
792                ressize += 2+5+1;
793            else if (ch<1000000)
794                ressize += 2+6+1;
795            else
796                ressize += 2+7+1;
797        }
798        /* allocate replacement */
799        res = PyUnicode_New(ressize, 127);
800        if (res == NULL) {
801            Py_DECREF(object);
802            return NULL;
803        }
804        outp = PyUnicode_1BYTE_DATA(res);
805        /* generate replacement */
806        for (i = start; i < end; ++i) {
807            int digits;
808            int base;
809            ch = PyUnicode_READ_CHAR(object, i);
810            *outp++ = '&';
811            *outp++ = '#';
812            if (ch<10) {
813                digits = 1;
814                base = 1;
815            }
816            else if (ch<100) {
817                digits = 2;
818                base = 10;
819            }
820            else if (ch<1000) {
821                digits = 3;
822                base = 100;
823            }
824            else if (ch<10000) {
825                digits = 4;
826                base = 1000;
827            }
828            else if (ch<100000) {
829                digits = 5;
830                base = 10000;
831            }
832            else if (ch<1000000) {
833                digits = 6;
834                base = 100000;
835            }
836            else {
837                digits = 7;
838                base = 1000000;
839            }
840            while (digits-->0) {
841                *outp++ = '0' + ch/base;
842                ch %= base;
843                base /= 10;
844            }
845            *outp++ = ';';
846        }
847        assert(_PyUnicode_CheckConsistency(res, 1));
848        restuple = Py_BuildValue("(Nn)", res, end);
849        Py_DECREF(object);
850        return restuple;
851    }
852    else {
853        wrong_exception_type(exc);
854        return NULL;
855    }
856}
857
858PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
859{
860    PyObject *object;
861    Py_ssize_t i;
862    Py_ssize_t start;
863    Py_ssize_t end;
864    PyObject *res;
865    unsigned char *outp;
866    int ressize;
867    Py_UCS4 c;
868
869    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
870        unsigned char *p;
871        if (PyUnicodeDecodeError_GetStart(exc, &start))
872            return NULL;
873        if (PyUnicodeDecodeError_GetEnd(exc, &end))
874            return NULL;
875        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
876            return NULL;
877        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
878            Py_DECREF(object);
879            return NULL;
880        }
881        res = PyUnicode_New(4 * (end - start), 127);
882        if (res == NULL) {
883            Py_DECREF(object);
884            return NULL;
885        }
886        outp = PyUnicode_1BYTE_DATA(res);
887        for (i = start; i < end; i++, outp += 4) {
888            unsigned char c = p[i];
889            outp[0] = '\\';
890            outp[1] = 'x';
891            outp[2] = Py_hexdigits[(c>>4)&0xf];
892            outp[3] = Py_hexdigits[c&0xf];
893        }
894
895        assert(_PyUnicode_CheckConsistency(res, 1));
896        Py_DECREF(object);
897        return Py_BuildValue("(Nn)", res, end);
898    }
899    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
900        if (PyUnicodeEncodeError_GetStart(exc, &start))
901            return NULL;
902        if (PyUnicodeEncodeError_GetEnd(exc, &end))
903            return NULL;
904        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
905            return NULL;
906    }
907    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
908        if (PyUnicodeTranslateError_GetStart(exc, &start))
909            return NULL;
910        if (PyUnicodeTranslateError_GetEnd(exc, &end))
911            return NULL;
912        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
913            return NULL;
914    }
915    else {
916        wrong_exception_type(exc);
917        return NULL;
918    }
919
920    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
921        end = start + PY_SSIZE_T_MAX / (1+1+8);
922    for (i = start, ressize = 0; i < end; ++i) {
923        /* object is guaranteed to be "ready" */
924        c = PyUnicode_READ_CHAR(object, i);
925        if (c >= 0x10000) {
926            ressize += 1+1+8;
927        }
928        else if (c >= 0x100) {
929            ressize += 1+1+4;
930        }
931        else
932            ressize += 1+1+2;
933    }
934    res = PyUnicode_New(ressize, 127);
935    if (res == NULL) {
936        Py_DECREF(object);
937        return NULL;
938    }
939    outp = PyUnicode_1BYTE_DATA(res);
940    for (i = start; i < end; ++i) {
941        c = PyUnicode_READ_CHAR(object, i);
942        *outp++ = '\\';
943        if (c >= 0x00010000) {
944            *outp++ = 'U';
945            *outp++ = Py_hexdigits[(c>>28)&0xf];
946            *outp++ = Py_hexdigits[(c>>24)&0xf];
947            *outp++ = Py_hexdigits[(c>>20)&0xf];
948            *outp++ = Py_hexdigits[(c>>16)&0xf];
949            *outp++ = Py_hexdigits[(c>>12)&0xf];
950            *outp++ = Py_hexdigits[(c>>8)&0xf];
951        }
952        else if (c >= 0x100) {
953            *outp++ = 'u';
954            *outp++ = Py_hexdigits[(c>>12)&0xf];
955            *outp++ = Py_hexdigits[(c>>8)&0xf];
956        }
957        else
958            *outp++ = 'x';
959        *outp++ = Py_hexdigits[(c>>4)&0xf];
960        *outp++ = Py_hexdigits[c&0xf];
961    }
962
963    assert(_PyUnicode_CheckConsistency(res, 1));
964    Py_DECREF(object);
965    return Py_BuildValue("(Nn)", res, end);
966}
967
968static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
969
970PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
971{
972    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
973        PyObject *restuple;
974        PyObject *object;
975        Py_ssize_t i;
976        Py_ssize_t start;
977        Py_ssize_t end;
978        PyObject *res;
979        unsigned char *outp;
980        Py_ssize_t ressize;
981        int replsize;
982        Py_UCS4 c;
983        char buffer[256]; /* NAME_MAXLEN */
984        if (PyUnicodeEncodeError_GetStart(exc, &start))
985            return NULL;
986        if (PyUnicodeEncodeError_GetEnd(exc, &end))
987            return NULL;
988        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
989            return NULL;
990        if (!ucnhash_CAPI) {
991            /* load the unicode data module */
992            ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
993                                            PyUnicodeData_CAPSULE_NAME, 1);
994            if (!ucnhash_CAPI)
995                return NULL;
996        }
997        for (i = start, ressize = 0; i < end; ++i) {
998            /* object is guaranteed to be "ready" */
999            c = PyUnicode_READ_CHAR(object, i);
1000            if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1001                replsize = 1+1+1+(int)strlen(buffer)+1;
1002            }
1003            else if (c >= 0x10000) {
1004                replsize = 1+1+8;
1005            }
1006            else if (c >= 0x100) {
1007                replsize = 1+1+4;
1008            }
1009            else
1010                replsize = 1+1+2;
1011            if (ressize > PY_SSIZE_T_MAX - replsize)
1012                break;
1013            ressize += replsize;
1014        }
1015        end = i;
1016        res = PyUnicode_New(ressize, 127);
1017        if (res==NULL)
1018            return NULL;
1019        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1020            i < end; ++i) {
1021            c = PyUnicode_READ_CHAR(object, i);
1022            *outp++ = '\\';
1023            if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1024                *outp++ = 'N';
1025                *outp++ = '{';
1026                strcpy((char *)outp, buffer);
1027                outp += strlen(buffer);
1028                *outp++ = '}';
1029                continue;
1030            }
1031            if (c >= 0x00010000) {
1032                *outp++ = 'U';
1033                *outp++ = Py_hexdigits[(c>>28)&0xf];
1034                *outp++ = Py_hexdigits[(c>>24)&0xf];
1035                *outp++ = Py_hexdigits[(c>>20)&0xf];
1036                *outp++ = Py_hexdigits[(c>>16)&0xf];
1037                *outp++ = Py_hexdigits[(c>>12)&0xf];
1038                *outp++ = Py_hexdigits[(c>>8)&0xf];
1039            }
1040            else if (c >= 0x100) {
1041                *outp++ = 'u';
1042                *outp++ = Py_hexdigits[(c>>12)&0xf];
1043                *outp++ = Py_hexdigits[(c>>8)&0xf];
1044            }
1045            else
1046                *outp++ = 'x';
1047            *outp++ = Py_hexdigits[(c>>4)&0xf];
1048            *outp++ = Py_hexdigits[c&0xf];
1049        }
1050
1051        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1052        assert(_PyUnicode_CheckConsistency(res, 1));
1053        restuple = Py_BuildValue("(Nn)", res, end);
1054        Py_DECREF(object);
1055        return restuple;
1056    }
1057    else {
1058        wrong_exception_type(exc);
1059        return NULL;
1060    }
1061}
1062
/* Encoding identifiers returned by get_standard_encoding() and switched on
   by PyCodec_SurrogatePassErrors().  ENC_UNKNOWN is the result for any
   encoding name that is not recognised. */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4
1069
1070static int
1071get_standard_encoding(const char *encoding, int *bytelength)
1072{
1073    if (Py_TOLOWER(encoding[0]) == 'u' &&
1074        Py_TOLOWER(encoding[1]) == 't' &&
1075        Py_TOLOWER(encoding[2]) == 'f') {
1076        encoding += 3;
1077        if (*encoding == '-' || *encoding == '_' )
1078            encoding++;
1079        if (encoding[0] == '8' && encoding[1] == '\0') {
1080            *bytelength = 3;
1081            return ENC_UTF8;
1082        }
1083        else if (encoding[0] == '1' && encoding[1] == '6') {
1084            encoding += 2;
1085            *bytelength = 2;
1086            if (*encoding == '\0') {
1087#ifdef WORDS_BIGENDIAN
1088                return ENC_UTF16BE;
1089#else
1090                return ENC_UTF16LE;
1091#endif
1092            }
1093            if (*encoding == '-' || *encoding == '_' )
1094                encoding++;
1095            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1096                if (Py_TOLOWER(encoding[0]) == 'b')
1097                    return ENC_UTF16BE;
1098                if (Py_TOLOWER(encoding[0]) == 'l')
1099                    return ENC_UTF16LE;
1100            }
1101        }
1102        else if (encoding[0] == '3' && encoding[1] == '2') {
1103            encoding += 2;
1104            *bytelength = 4;
1105            if (*encoding == '\0') {
1106#ifdef WORDS_BIGENDIAN
1107                return ENC_UTF32BE;
1108#else
1109                return ENC_UTF32LE;
1110#endif
1111            }
1112            if (*encoding == '-' || *encoding == '_' )
1113                encoding++;
1114            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1115                if (Py_TOLOWER(encoding[0]) == 'b')
1116                    return ENC_UTF32BE;
1117                if (Py_TOLOWER(encoding[0]) == 'l')
1118                    return ENC_UTF32LE;
1119            }
1120        }
1121    }
1122    else if (strcmp(encoding, "CP_UTF8") == 0) {
1123        *bytelength = 3;
1124        return ENC_UTF8;
1125    }
1126    return ENC_UNKNOWN;
1127}
1128
1129/* This handler is declared static until someone demonstrates
1130   a need to call it directly. */
1131static PyObject *
1132PyCodec_SurrogatePassErrors(PyObject *exc)
1133{
1134    PyObject *restuple;
1135    PyObject *object;
1136    PyObject *encode;
1137    char *encoding;
1138    int code;
1139    int bytelength;
1140    Py_ssize_t i;
1141    Py_ssize_t start;
1142    Py_ssize_t end;
1143    PyObject *res;
1144
1145    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1146        unsigned char *outp;
1147        if (PyUnicodeEncodeError_GetStart(exc, &start))
1148            return NULL;
1149        if (PyUnicodeEncodeError_GetEnd(exc, &end))
1150            return NULL;
1151        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1152            return NULL;
1153        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1154            Py_DECREF(object);
1155            return NULL;
1156        }
1157        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1158            Py_DECREF(object);
1159            Py_DECREF(encode);
1160            return NULL;
1161        }
1162        code = get_standard_encoding(encoding, &bytelength);
1163        Py_DECREF(encode);
1164        if (code == ENC_UNKNOWN) {
1165            /* Not supported, fail with original exception */
1166            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1167            Py_DECREF(object);
1168            return NULL;
1169        }
1170
1171        if (end - start > PY_SSIZE_T_MAX / bytelength)
1172            end = start + PY_SSIZE_T_MAX / bytelength;
1173        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1174        if (!res) {
1175            Py_DECREF(object);
1176            return NULL;
1177        }
1178        outp = (unsigned char*)PyBytes_AsString(res);
1179        for (i = start; i < end; i++) {
1180            /* object is guaranteed to be "ready" */
1181            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1182            if (!Py_UNICODE_IS_SURROGATE(ch)) {
1183                /* Not a surrogate, fail with original exception */
1184                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1185                Py_DECREF(res);
1186                Py_DECREF(object);
1187                return NULL;
1188            }
1189            switch (code) {
1190            case ENC_UTF8:
1191                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1192                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1193                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1194                break;
1195            case ENC_UTF16LE:
1196                *outp++ = (unsigned char) ch;
1197                *outp++ = (unsigned char)(ch >> 8);
1198                break;
1199            case ENC_UTF16BE:
1200                *outp++ = (unsigned char)(ch >> 8);
1201                *outp++ = (unsigned char) ch;
1202                break;
1203            case ENC_UTF32LE:
1204                *outp++ = (unsigned char) ch;
1205                *outp++ = (unsigned char)(ch >> 8);
1206                *outp++ = (unsigned char)(ch >> 16);
1207                *outp++ = (unsigned char)(ch >> 24);
1208                break;
1209            case ENC_UTF32BE:
1210                *outp++ = (unsigned char)(ch >> 24);
1211                *outp++ = (unsigned char)(ch >> 16);
1212                *outp++ = (unsigned char)(ch >> 8);
1213                *outp++ = (unsigned char) ch;
1214                break;
1215            }
1216        }
1217        restuple = Py_BuildValue("(On)", res, end);
1218        Py_DECREF(res);
1219        Py_DECREF(object);
1220        return restuple;
1221    }
1222    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1223        unsigned char *p;
1224        Py_UCS4 ch = 0;
1225        if (PyUnicodeDecodeError_GetStart(exc, &start))
1226            return NULL;
1227        if (PyUnicodeDecodeError_GetEnd(exc, &end))
1228            return NULL;
1229        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1230            return NULL;
1231        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1232            Py_DECREF(object);
1233            return NULL;
1234        }
1235        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1236            Py_DECREF(object);
1237            return NULL;
1238        }
1239        if (!(encoding = PyUnicode_AsUTF8(encode))) {
1240            Py_DECREF(object);
1241            Py_DECREF(encode);
1242            return NULL;
1243        }
1244        code = get_standard_encoding(encoding, &bytelength);
1245        Py_DECREF(encode);
1246        if (code == ENC_UNKNOWN) {
1247            /* Not supported, fail with original exception */
1248            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1249            Py_DECREF(object);
1250            return NULL;
1251        }
1252
1253        /* Try decoding a single surrogate character. If
1254           there are more, let the codec call us again. */
1255        p += start;
1256        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1257            switch (code) {
1258            case ENC_UTF8:
1259                if ((p[0] & 0xf0) == 0xe0 &&
1260                    (p[1] & 0xc0) == 0x80 &&
1261                    (p[2] & 0xc0) == 0x80) {
1262                    /* it's a three-byte code */
1263                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1264                }
1265                break;
1266            case ENC_UTF16LE:
1267                ch = p[1] << 8 | p[0];
1268                break;
1269            case ENC_UTF16BE:
1270                ch = p[0] << 8 | p[1];
1271                break;
1272            case ENC_UTF32LE:
1273                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1274                break;
1275            case ENC_UTF32BE:
1276                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1277                break;
1278            }
1279        }
1280
1281        Py_DECREF(object);
1282        if (!Py_UNICODE_IS_SURROGATE(ch)) {
1283            /* it's not a surrogate - fail */
1284            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1285            return NULL;
1286        }
1287        res = PyUnicode_FromOrdinal(ch);
1288        if (res == NULL)
1289            return NULL;
1290        return Py_BuildValue("(Nn)", res, start + bytelength);
1291    }
1292    else {
1293        wrong_exception_type(exc);
1294        return NULL;
1295    }
1296}
1297
1298static PyObject *
1299PyCodec_SurrogateEscapeErrors(PyObject *exc)
1300{
1301    PyObject *restuple;
1302    PyObject *object;
1303    Py_ssize_t i;
1304    Py_ssize_t start;
1305    Py_ssize_t end;
1306    PyObject *res;
1307
1308    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1309        char *outp;
1310        if (PyUnicodeEncodeError_GetStart(exc, &start))
1311            return NULL;
1312        if (PyUnicodeEncodeError_GetEnd(exc, &end))
1313            return NULL;
1314        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1315            return NULL;
1316        res = PyBytes_FromStringAndSize(NULL, end-start);
1317        if (!res) {
1318            Py_DECREF(object);
1319            return NULL;
1320        }
1321        outp = PyBytes_AsString(res);
1322        for (i = start; i < end; i++) {
1323            /* object is guaranteed to be "ready" */
1324            Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1325            if (ch < 0xdc80 || ch > 0xdcff) {
1326                /* Not a UTF-8b surrogate, fail with original exception */
1327                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1328                Py_DECREF(res);
1329                Py_DECREF(object);
1330                return NULL;
1331            }
1332            *outp++ = ch - 0xdc00;
1333        }
1334        restuple = Py_BuildValue("(On)", res, end);
1335        Py_DECREF(res);
1336        Py_DECREF(object);
1337        return restuple;
1338    }
1339    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1340        PyObject *str;
1341        unsigned char *p;
1342        Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1343        int consumed = 0;
1344        if (PyUnicodeDecodeError_GetStart(exc, &start))
1345            return NULL;
1346        if (PyUnicodeDecodeError_GetEnd(exc, &end))
1347            return NULL;
1348        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1349            return NULL;
1350        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1351            Py_DECREF(object);
1352            return NULL;
1353        }
1354        while (consumed < 4 && consumed < end-start) {
1355            /* Refuse to escape ASCII bytes. */
1356            if (p[start+consumed] < 128)
1357                break;
1358            ch[consumed] = 0xdc00 + p[start+consumed];
1359            consumed++;
1360        }
1361        Py_DECREF(object);
1362        if (!consumed) {
1363            /* codec complained about ASCII byte. */
1364            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1365            return NULL;
1366        }
1367        str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1368        if (str == NULL)
1369            return NULL;
1370        return Py_BuildValue("(Nn)", str, start+consumed);
1371    }
1372    else {
1373        wrong_exception_type(exc);
1374        return NULL;
1375    }
1376}
1377
1378
/* Callable wrapper around PyCodec_StrictErrors(); registered under the
   name "strict" by _PyCodecRegistry_Init().  self is not used. */
static PyObject *strict_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_StrictErrors(exc);
}
1383
1384
/* Callable wrapper around PyCodec_IgnoreErrors(); registered under the
   name "ignore" by _PyCodecRegistry_Init().  self is not used. */
static PyObject *ignore_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_IgnoreErrors(exc);
}
1389
1390
/* Callable wrapper around PyCodec_ReplaceErrors(); registered under the
   name "replace" by _PyCodecRegistry_Init().  self is not used. */
static PyObject *replace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_ReplaceErrors(exc);
}
1395
1396
/* Callable wrapper around PyCodec_XMLCharRefReplaceErrors(); registered
   under the name "xmlcharrefreplace" by _PyCodecRegistry_Init().
   self is not used. */
static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1401
1402
/* Callable wrapper around PyCodec_BackslashReplaceErrors(); registered
   under the name "backslashreplace" by _PyCodecRegistry_Init().
   self is not used. */
static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_BackslashReplaceErrors(exc);
}
1407
/* Callable wrapper around PyCodec_NameReplaceErrors(); registered under
   the name "namereplace" by _PyCodecRegistry_Init().  self is not used. */
static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_NameReplaceErrors(exc);
}
1412
/* Callable wrapper around the file-local PyCodec_SurrogatePassErrors();
   registered under the name "surrogatepass" by _PyCodecRegistry_Init().
   self is not used. */
static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_SurrogatePassErrors(exc);
}
1417
/* Callable wrapper around the file-local PyCodec_SurrogateEscapeErrors();
   registered under the name "surrogateescape" by _PyCodecRegistry_Init().
   self is not used. */
static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
{
    return PyCodec_SurrogateEscapeErrors(exc);
}
1422
1423static int _PyCodecRegistry_Init(void)
1424{
1425    static struct {
1426        char *name;
1427        PyMethodDef def;
1428    } methods[] =
1429    {
1430        {
1431            "strict",
1432            {
1433                "strict_errors",
1434                strict_errors,
1435                METH_O,
1436                PyDoc_STR("Implements the 'strict' error handling, which "
1437                          "raises a UnicodeError on coding errors.")
1438            }
1439        },
1440        {
1441            "ignore",
1442            {
1443                "ignore_errors",
1444                ignore_errors,
1445                METH_O,
1446                PyDoc_STR("Implements the 'ignore' error handling, which "
1447                          "ignores malformed data and continues.")
1448            }
1449        },
1450        {
1451            "replace",
1452            {
1453                "replace_errors",
1454                replace_errors,
1455                METH_O,
1456                PyDoc_STR("Implements the 'replace' error handling, which "
1457                          "replaces malformed data with a replacement marker.")
1458            }
1459        },
1460        {
1461            "xmlcharrefreplace",
1462            {
1463                "xmlcharrefreplace_errors",
1464                xmlcharrefreplace_errors,
1465                METH_O,
1466                PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1467                          "which replaces an unencodable character with the "
1468                          "appropriate XML character reference.")
1469            }
1470        },
1471        {
1472            "backslashreplace",
1473            {
1474                "backslashreplace_errors",
1475                backslashreplace_errors,
1476                METH_O,
1477                PyDoc_STR("Implements the 'backslashreplace' error handling, "
1478                          "which replaces malformed data with a backslashed "
1479                          "escape sequence.")
1480            }
1481        },
1482        {
1483            "namereplace",
1484            {
1485                "namereplace_errors",
1486                namereplace_errors,
1487                METH_O,
1488                PyDoc_STR("Implements the 'namereplace' error handling, "
1489                          "which replaces an unencodable character with a "
1490                          "\\N{...} escape sequence.")
1491            }
1492        },
1493        {
1494            "surrogatepass",
1495            {
1496                "surrogatepass",
1497                surrogatepass_errors,
1498                METH_O
1499            }
1500        },
1501        {
1502            "surrogateescape",
1503            {
1504                "surrogateescape",
1505                surrogateescape_errors,
1506                METH_O
1507            }
1508        }
1509    };
1510
1511    PyInterpreterState *interp = PyThreadState_GET()->interp;
1512    PyObject *mod;
1513    unsigned i;
1514
1515    if (interp->codec_search_path != NULL)
1516        return 0;
1517
1518    interp->codec_search_path = PyList_New(0);
1519    interp->codec_search_cache = PyDict_New();
1520    interp->codec_error_registry = PyDict_New();
1521
1522    if (interp->codec_error_registry) {
1523        for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1524            PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1525            int res;
1526            if (!func)
1527                Py_FatalError("can't initialize codec error registry");
1528            res = PyCodec_RegisterError(methods[i].name, func);
1529            Py_DECREF(func);
1530            if (res)
1531                Py_FatalError("can't initialize codec error registry");
1532        }
1533    }
1534
1535    if (interp->codec_search_path == NULL ||
1536        interp->codec_search_cache == NULL ||
1537        interp->codec_error_registry == NULL)
1538        Py_FatalError("can't initialize codec registry");
1539
1540    mod = PyImport_ImportModuleNoBlock("encodings");
1541    if (mod == NULL) {
1542        return -1;
1543    }
1544    Py_DECREF(mod);
1545    interp->codecs_initialized = 1;
1546    return 0;
1547}
1548