codecs.c revision 7e47402264cf87b9bbb61fc9ff610af08add7c7b
1/* ------------------------------------------------------------------------
2
3   Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9   ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13#ifdef HAVE_LIMITS_H
14#include <limits.h>
15#endif
16
17/* --- Globals ------------------------------------------------------------ */
18
19static PyObject *_PyCodec_SearchPath;
20static PyObject *_PyCodec_SearchCache;
21
22/* Flag used for lazy import of the standard encodings package */
23static int import_encodings_called = 0;
24
25/* --- Codec Registry ----------------------------------------------------- */
26
27/* Import the standard encodings package which will register the first
28   codec search function.
29
30   This is done in a lazy way so that the Unicode implementation does
31   not downgrade startup time of scripts not needing it.
32
33   ImportErrors are silently ignored by this function. Only one try is
34   made.
35
36*/
37
38static
39int import_encodings()
40{
41    PyObject *mod;
42
43    import_encodings_called = 1;
44    mod = PyImport_ImportModule("encodings");
45    if (mod == NULL) {
46	if (PyErr_ExceptionMatches(PyExc_ImportError)) {
47	    /* Ignore ImportErrors... this is done so that
48	       distributions can disable the encodings package. Note
49	       that other errors are not masked, e.g. SystemErrors
50	       raised to inform the user of an error in the Python
51	       configuration are still reported back to the user. */
52	    PyErr_Clear();
53	    return 0;
54	}
55	return -1;
56    }
57    Py_DECREF(mod);
58    return 0;
59}
60
61int PyCodec_Register(PyObject *search_function)
62{
63    if (!import_encodings_called) {
64	if (import_encodings())
65	    goto onError;
66    }
67    if (search_function == NULL) {
68	PyErr_BadArgument();
69	goto onError;
70    }
71    if (!PyCallable_Check(search_function)) {
72	PyErr_SetString(PyExc_TypeError,
73			"argument must be callable");
74	goto onError;
75    }
76    return PyList_Append(_PyCodec_SearchPath, search_function);
77
78 onError:
79    return -1;
80}
81
82/* Convert a string to a normalized Python string: all characters are
83   converted to lower case, spaces are replaced with underscores. */
84
85static
86PyObject *normalizestring(const char *string)
87{
88    register size_t i;
89    size_t len = strlen(string);
90    char *p;
91    PyObject *v;
92
93	if (len > INT_MAX) {
94		PyErr_SetString(PyExc_OverflowError, "string is too large");
95		return NULL;
96	}
97
98    v = PyString_FromStringAndSize(NULL, (int)len);
99    if (v == NULL)
100	return NULL;
101    p = PyString_AS_STRING(v);
102    for (i = 0; i < len; i++) {
103        register char ch = string[i];
104        if (ch == ' ')
105            ch = '-';
106        else
107            ch = tolower(ch);
108	p[i] = ch;
109    }
110    return v;
111}
112
113/* Lookup the given encoding and return a tuple providing the codec
114   facilities.
115
116   The encoding string is looked up converted to all lower-case
117   characters. This makes encodings looked up through this mechanism
118   effectively case-insensitive.
119
120   If no codec is found, a LookupError is set and NULL returned.
121
122   As side effect, this tries to load the encodings package, if not
123   yet done. This is part of the lazy load strategy for the encodings
124   package.
125
126*/
127
128PyObject *_PyCodec_Lookup(const char *encoding)
129{
130    PyObject *result, *args = NULL, *v;
131    int i, len;
132
133    if (encoding == NULL) {
134	PyErr_BadArgument();
135	goto onError;
136    }
137    if (_PyCodec_SearchCache == NULL ||
138	_PyCodec_SearchPath == NULL) {
139	PyErr_SetString(PyExc_SystemError,
140			"codec module not properly initialized");
141	goto onError;
142    }
143    if (!import_encodings_called) {
144	if (import_encodings())
145	    goto onError;
146    }
147
148    /* Convert the encoding to a normalized Python string: all
149       characters are converted to lower case, spaces and hyphens are
150       replaced with underscores. */
151    v = normalizestring(encoding);
152    if (v == NULL)
153	goto onError;
154    PyString_InternInPlace(&v);
155
156    /* First, try to lookup the name in the registry dictionary */
157    result = PyDict_GetItem(_PyCodec_SearchCache, v);
158    if (result != NULL) {
159	Py_INCREF(result);
160	Py_DECREF(v);
161	return result;
162    }
163
164    /* Next, scan the search functions in order of registration */
165    args = PyTuple_New(1);
166    if (args == NULL)
167	goto onError;
168    PyTuple_SET_ITEM(args,0,v);
169
170    len = PyList_Size(_PyCodec_SearchPath);
171    if (len < 0)
172	goto onError;
173    if (len == 0) {
174	PyErr_SetString(PyExc_LookupError,
175			"no codec search functions registered: "
176			"can't find encoding");
177	goto onError;
178    }
179
180    for (i = 0; i < len; i++) {
181	PyObject *func;
182
183	func = PyList_GetItem(_PyCodec_SearchPath, i);
184	if (func == NULL)
185	    goto onError;
186	result = PyEval_CallObject(func, args);
187	if (result == NULL)
188	    goto onError;
189	if (result == Py_None) {
190	    Py_DECREF(result);
191	    continue;
192	}
193	if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
194	    PyErr_SetString(PyExc_TypeError,
195			    "codec search functions must return 4-tuples");
196	    Py_DECREF(result);
197	    goto onError;
198	}
199	break;
200    }
201    if (i == len) {
202	/* XXX Perhaps we should cache misses too ? */
203	PyErr_SetString(PyExc_LookupError,
204			"unknown encoding");
205	goto onError;
206    }
207
208    /* Cache and return the result */
209    PyDict_SetItem(_PyCodec_SearchCache, v, result);
210    Py_DECREF(args);
211    return result;
212
213 onError:
214    Py_XDECREF(args);
215    return NULL;
216}
217
218static
219PyObject *args_tuple(PyObject *object,
220		     const char *errors)
221{
222    PyObject *args;
223
224    args = PyTuple_New(1 + (errors != NULL));
225    if (args == NULL)
226	return NULL;
227    Py_INCREF(object);
228    PyTuple_SET_ITEM(args,0,object);
229    if (errors) {
230	PyObject *v;
231
232	v = PyString_FromString(errors);
233	if (v == NULL) {
234	    Py_DECREF(args);
235	    return NULL;
236	}
237	PyTuple_SET_ITEM(args, 1, v);
238    }
239    return args;
240}
241
242/* Build a codec by calling factory(stream[,errors]) or just
243   factory(errors) depending on whether the given parameters are
244   non-NULL. */
245
246static
247PyObject *build_stream_codec(PyObject *factory,
248			     PyObject *stream,
249			     const char *errors)
250{
251    PyObject *args, *codec;
252
253    args = args_tuple(stream, errors);
254    if (args == NULL)
255	return NULL;
256
257    codec = PyEval_CallObject(factory, args);
258    Py_DECREF(args);
259    return codec;
260}
261
262/* Convenience APIs to query the Codec registry.
263
264   All APIs return a codec object with incremented refcount.
265
266 */
267
268PyObject *PyCodec_Encoder(const char *encoding)
269{
270    PyObject *codecs;
271    PyObject *v;
272
273    codecs = _PyCodec_Lookup(encoding);
274    if (codecs == NULL)
275	goto onError;
276    v = PyTuple_GET_ITEM(codecs,0);
277    Py_INCREF(v);
278    return v;
279
280 onError:
281    return NULL;
282}
283
284PyObject *PyCodec_Decoder(const char *encoding)
285{
286    PyObject *codecs;
287    PyObject *v;
288
289    codecs = _PyCodec_Lookup(encoding);
290    if (codecs == NULL)
291	goto onError;
292    v = PyTuple_GET_ITEM(codecs,1);
293    Py_INCREF(v);
294    return v;
295
296 onError:
297    return NULL;
298}
299
300PyObject *PyCodec_StreamReader(const char *encoding,
301			       PyObject *stream,
302			       const char *errors)
303{
304    PyObject *codecs;
305
306    codecs = _PyCodec_Lookup(encoding);
307    if (codecs == NULL)
308	goto onError;
309    return build_stream_codec(PyTuple_GET_ITEM(codecs,2),stream,errors);
310
311 onError:
312    return NULL;
313}
314
315PyObject *PyCodec_StreamWriter(const char *encoding,
316			       PyObject *stream,
317			       const char *errors)
318{
319    PyObject *codecs;
320
321    codecs = _PyCodec_Lookup(encoding);
322    if (codecs == NULL)
323	goto onError;
324    return build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors);
325
326 onError:
327    return NULL;
328}
329
330/* Encode an object (e.g. an Unicode object) using the given encoding
331   and return the resulting encoded object (usually a Python string).
332
333   errors is passed to the encoder factory as argument if non-NULL. */
334
335PyObject *PyCodec_Encode(PyObject *object,
336			 const char *encoding,
337			 const char *errors)
338{
339    PyObject *encoder = NULL;
340    PyObject *args = NULL, *result;
341    PyObject *v;
342
343    encoder = PyCodec_Encoder(encoding);
344    if (encoder == NULL)
345	goto onError;
346
347    args = args_tuple(object, errors);
348    if (args == NULL)
349	goto onError;
350
351    result = PyEval_CallObject(encoder,args);
352    if (result == NULL)
353	goto onError;
354
355    if (!PyTuple_Check(result) ||
356	PyTuple_GET_SIZE(result) != 2) {
357	PyErr_SetString(PyExc_TypeError,
358			"encoder must return a tuple (object,integer)");
359	goto onError;
360    }
361    v = PyTuple_GET_ITEM(result,0);
362    Py_INCREF(v);
363    /* We don't check or use the second (integer) entry. */
364
365    Py_DECREF(args);
366    Py_DECREF(encoder);
367    Py_DECREF(result);
368    return v;
369
370 onError:
371    Py_XDECREF(args);
372    Py_XDECREF(encoder);
373    return NULL;
374}
375
376/* Decode an object (usually a Python string) using the given encoding
377   and return an equivalent object (e.g. an Unicode object).
378
379   errors is passed to the decoder factory as argument if non-NULL. */
380
381PyObject *PyCodec_Decode(PyObject *object,
382			 const char *encoding,
383			 const char *errors)
384{
385    PyObject *decoder = NULL;
386    PyObject *args = NULL, *result = NULL;
387    PyObject *v;
388
389    decoder = PyCodec_Decoder(encoding);
390    if (decoder == NULL)
391	goto onError;
392
393    args = args_tuple(object, errors);
394    if (args == NULL)
395	goto onError;
396
397    result = PyEval_CallObject(decoder,args);
398    if (result == NULL)
399	goto onError;
400    if (!PyTuple_Check(result) ||
401	PyTuple_GET_SIZE(result) != 2) {
402	PyErr_SetString(PyExc_TypeError,
403			"decoder must return a tuple (object,integer)");
404	goto onError;
405    }
406    v = PyTuple_GET_ITEM(result,0);
407    Py_INCREF(v);
408    /* We don't check or use the second (integer) entry. */
409
410    Py_DECREF(args);
411    Py_DECREF(decoder);
412    Py_DECREF(result);
413    return v;
414
415 onError:
416    Py_XDECREF(args);
417    Py_XDECREF(decoder);
418    Py_XDECREF(result);
419    return NULL;
420}
421
422void _PyCodecRegistry_Init()
423{
424    if (_PyCodec_SearchPath == NULL)
425	_PyCodec_SearchPath = PyList_New(0);
426    if (_PyCodec_SearchCache == NULL)
427	_PyCodec_SearchCache = PyDict_New();
428    if (_PyCodec_SearchPath == NULL ||
429	_PyCodec_SearchCache == NULL)
430	Py_FatalError("can't initialize codec registry");
431}
432
433void _PyCodecRegistry_Fini()
434{
435    Py_XDECREF(_PyCodec_SearchPath);
436    _PyCodec_SearchPath = NULL;
437    Py_XDECREF(_PyCodec_SearchCache);
438    _PyCodec_SearchCache = NULL;
439}
440