codecs.c revision 7e47402264cf87b9bbb61fc9ff610af08add7c7b
1/* ------------------------------------------------------------------------ 2 3 Python Codec Registry and support functions 4 5Written by Marc-Andre Lemburg (mal@lemburg.com). 6 7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 8 9 ------------------------------------------------------------------------ */ 10 11#include "Python.h" 12#include <ctype.h> 13#ifdef HAVE_LIMITS_H 14#include <limits.h> 15#endif 16 17/* --- Globals ------------------------------------------------------------ */ 18 19static PyObject *_PyCodec_SearchPath; 20static PyObject *_PyCodec_SearchCache; 21 22/* Flag used for lazy import of the standard encodings package */ 23static int import_encodings_called = 0; 24 25/* --- Codec Registry ----------------------------------------------------- */ 26 27/* Import the standard encodings package which will register the first 28 codec search function. 29 30 This is done in a lazy way so that the Unicode implementation does 31 not downgrade startup time of scripts not needing it. 32 33 ImportErrors are silently ignored by this function. Only one try is 34 made. 35 36*/ 37 38static 39int import_encodings() 40{ 41 PyObject *mod; 42 43 import_encodings_called = 1; 44 mod = PyImport_ImportModule("encodings"); 45 if (mod == NULL) { 46 if (PyErr_ExceptionMatches(PyExc_ImportError)) { 47 /* Ignore ImportErrors... this is done so that 48 distributions can disable the encodings package. Note 49 that other errors are not masked, e.g. SystemErrors 50 raised to inform the user of an error in the Python 51 configuration are still reported back to the user. */ 52 PyErr_Clear(); 53 return 0; 54 } 55 return -1; 56 } 57 Py_DECREF(mod); 58 return 0; 59} 60 61int PyCodec_Register(PyObject *search_function) 62{ 63 if (!import_encodings_called) { 64 if (import_encodings()) 65 goto onError; 66 } 67 if (search_function == NULL) { 68 PyErr_BadArgument(); 69 goto onError; 70 } 71 if (!PyCallable_Check(search_function)) { 72 PyErr_SetString(PyExc_TypeError, 73 "argument must be callable"); 74 goto onError; 75 } 76 return PyList_Append(_PyCodec_SearchPath, search_function); 77 78 onError: 79 return -1; 80} 81 82/* Convert a string to a normalized Python string: all characters are 83 converted to lower case, spaces are replaced with underscores. */ 84 85static 86PyObject *normalizestring(const char *string) 87{ 88 register size_t i; 89 size_t len = strlen(string); 90 char *p; 91 PyObject *v; 92 93 if (len > INT_MAX) { 94 PyErr_SetString(PyExc_OverflowError, "string is too large"); 95 return NULL; 96 } 97 98 v = PyString_FromStringAndSize(NULL, (int)len); 99 if (v == NULL) 100 return NULL; 101 p = PyString_AS_STRING(v); 102 for (i = 0; i < len; i++) { 103 register char ch = string[i]; 104 if (ch == ' ') 105 ch = '-'; 106 else 107 ch = tolower(ch); 108 p[i] = ch; 109 } 110 return v; 111} 112 113/* Lookup the given encoding and return a tuple providing the codec 114 facilities. 115 116 The encoding string is looked up converted to all lower-case 117 characters. This makes encodings looked up through this mechanism 118 effectively case-insensitive. 119 120 If no codec is found, a LookupError is set and NULL returned. 121 122 As side effect, this tries to load the encodings package, if not 123 yet done. This is part of the lazy load strategy for the encodings 124 package. 125 126*/ 127 128PyObject *_PyCodec_Lookup(const char *encoding) 129{ 130 PyObject *result, *args = NULL, *v; 131 int i, len; 132 133 if (encoding == NULL) { 134 PyErr_BadArgument(); 135 goto onError; 136 } 137 if (_PyCodec_SearchCache == NULL || 138 _PyCodec_SearchPath == NULL) { 139 PyErr_SetString(PyExc_SystemError, 140 "codec module not properly initialized"); 141 goto onError; 142 } 143 if (!import_encodings_called) { 144 if (import_encodings()) 145 goto onError; 146 } 147 148 /* Convert the encoding to a normalized Python string: all 149 characters are converted to lower case, spaces and hyphens are 150 replaced with underscores. */ 151 v = normalizestring(encoding); 152 if (v == NULL) 153 goto onError; 154 PyString_InternInPlace(&v); 155 156 /* First, try to lookup the name in the registry dictionary */ 157 result = PyDict_GetItem(_PyCodec_SearchCache, v); 158 if (result != NULL) { 159 Py_INCREF(result); 160 Py_DECREF(v); 161 return result; 162 } 163 164 /* Next, scan the search functions in order of registration */ 165 args = PyTuple_New(1); 166 if (args == NULL) 167 goto onError; 168 PyTuple_SET_ITEM(args,0,v); 169 170 len = PyList_Size(_PyCodec_SearchPath); 171 if (len < 0) 172 goto onError; 173 if (len == 0) { 174 PyErr_SetString(PyExc_LookupError, 175 "no codec search functions registered: " 176 "can't find encoding"); 177 goto onError; 178 } 179 180 for (i = 0; i < len; i++) { 181 PyObject *func; 182 183 func = PyList_GetItem(_PyCodec_SearchPath, i); 184 if (func == NULL) 185 goto onError; 186 result = PyEval_CallObject(func, args); 187 if (result == NULL) 188 goto onError; 189 if (result == Py_None) { 190 Py_DECREF(result); 191 continue; 192 } 193 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) { 194 PyErr_SetString(PyExc_TypeError, 195 "codec search functions must return 4-tuples"); 196 Py_DECREF(result); 197 goto onError; 198 } 199 break; 200 } 201 if (i == len) { 202 /* XXX Perhaps we should cache misses too ? */ 203 PyErr_SetString(PyExc_LookupError, 204 "unknown encoding"); 205 goto onError; 206 } 207 208 /* Cache and return the result */ 209 PyDict_SetItem(_PyCodec_SearchCache, v, result); 210 Py_DECREF(args); 211 return result; 212 213 onError: 214 Py_XDECREF(args); 215 return NULL; 216} 217 218static 219PyObject *args_tuple(PyObject *object, 220 const char *errors) 221{ 222 PyObject *args; 223 224 args = PyTuple_New(1 + (errors != NULL)); 225 if (args == NULL) 226 return NULL; 227 Py_INCREF(object); 228 PyTuple_SET_ITEM(args,0,object); 229 if (errors) { 230 PyObject *v; 231 232 v = PyString_FromString(errors); 233 if (v == NULL) { 234 Py_DECREF(args); 235 return NULL; 236 } 237 PyTuple_SET_ITEM(args, 1, v); 238 } 239 return args; 240} 241 242/* Build a codec by calling factory(stream[,errors]) or just 243 factory(errors) depending on whether the given parameters are 244 non-NULL. */ 245 246static 247PyObject *build_stream_codec(PyObject *factory, 248 PyObject *stream, 249 const char *errors) 250{ 251 PyObject *args, *codec; 252 253 args = args_tuple(stream, errors); 254 if (args == NULL) 255 return NULL; 256 257 codec = PyEval_CallObject(factory, args); 258 Py_DECREF(args); 259 return codec; 260} 261 262/* Convenience APIs to query the Codec registry. 263 264 All APIs return a codec object with incremented refcount. 265 266 */ 267 268PyObject *PyCodec_Encoder(const char *encoding) 269{ 270 PyObject *codecs; 271 PyObject *v; 272 273 codecs = _PyCodec_Lookup(encoding); 274 if (codecs == NULL) 275 goto onError; 276 v = PyTuple_GET_ITEM(codecs,0); 277 Py_INCREF(v); 278 return v; 279 280 onError: 281 return NULL; 282} 283 284PyObject *PyCodec_Decoder(const char *encoding) 285{ 286 PyObject *codecs; 287 PyObject *v; 288 289 codecs = _PyCodec_Lookup(encoding); 290 if (codecs == NULL) 291 goto onError; 292 v = PyTuple_GET_ITEM(codecs,1); 293 Py_INCREF(v); 294 return v; 295 296 onError: 297 return NULL; 298} 299 300PyObject *PyCodec_StreamReader(const char *encoding, 301 PyObject *stream, 302 const char *errors) 303{ 304 PyObject *codecs; 305 306 codecs = _PyCodec_Lookup(encoding); 307 if (codecs == NULL) 308 goto onError; 309 return build_stream_codec(PyTuple_GET_ITEM(codecs,2),stream,errors); 310 311 onError: 312 return NULL; 313} 314 315PyObject *PyCodec_StreamWriter(const char *encoding, 316 PyObject *stream, 317 const char *errors) 318{ 319 PyObject *codecs; 320 321 codecs = _PyCodec_Lookup(encoding); 322 if (codecs == NULL) 323 goto onError; 324 return build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors); 325 326 onError: 327 return NULL; 328} 329 330/* Encode an object (e.g. an Unicode object) using the given encoding 331 and return the resulting encoded object (usually a Python string). 332 333 errors is passed to the encoder factory as argument if non-NULL. */ 334 335PyObject *PyCodec_Encode(PyObject *object, 336 const char *encoding, 337 const char *errors) 338{ 339 PyObject *encoder = NULL; 340 PyObject *args = NULL, *result; 341 PyObject *v; 342 343 encoder = PyCodec_Encoder(encoding); 344 if (encoder == NULL) 345 goto onError; 346 347 args = args_tuple(object, errors); 348 if (args == NULL) 349 goto onError; 350 351 result = PyEval_CallObject(encoder,args); 352 if (result == NULL) 353 goto onError; 354 355 if (!PyTuple_Check(result) || 356 PyTuple_GET_SIZE(result) != 2) { 357 PyErr_SetString(PyExc_TypeError, 358 "encoder must return a tuple (object,integer)"); 359 goto onError; 360 } 361 v = PyTuple_GET_ITEM(result,0); 362 Py_INCREF(v); 363 /* We don't check or use the second (integer) entry. */ 364 365 Py_DECREF(args); 366 Py_DECREF(encoder); 367 Py_DECREF(result); 368 return v; 369 370 onError: 371 Py_XDECREF(args); 372 Py_XDECREF(encoder); 373 return NULL; 374} 375 376/* Decode an object (usually a Python string) using the given encoding 377 and return an equivalent object (e.g. an Unicode object). 378 379 errors is passed to the decoder factory as argument if non-NULL. */ 380 381PyObject *PyCodec_Decode(PyObject *object, 382 const char *encoding, 383 const char *errors) 384{ 385 PyObject *decoder = NULL; 386 PyObject *args = NULL, *result = NULL; 387 PyObject *v; 388 389 decoder = PyCodec_Decoder(encoding); 390 if (decoder == NULL) 391 goto onError; 392 393 args = args_tuple(object, errors); 394 if (args == NULL) 395 goto onError; 396 397 result = PyEval_CallObject(decoder,args); 398 if (result == NULL) 399 goto onError; 400 if (!PyTuple_Check(result) || 401 PyTuple_GET_SIZE(result) != 2) { 402 PyErr_SetString(PyExc_TypeError, 403 "decoder must return a tuple (object,integer)"); 404 goto onError; 405 } 406 v = PyTuple_GET_ITEM(result,0); 407 Py_INCREF(v); 408 /* We don't check or use the second (integer) entry. */ 409 410 Py_DECREF(args); 411 Py_DECREF(decoder); 412 Py_DECREF(result); 413 return v; 414 415 onError: 416 Py_XDECREF(args); 417 Py_XDECREF(decoder); 418 Py_XDECREF(result); 419 return NULL; 420} 421 422void _PyCodecRegistry_Init() 423{ 424 if (_PyCodec_SearchPath == NULL) 425 _PyCodec_SearchPath = PyList_New(0); 426 if (_PyCodec_SearchCache == NULL) 427 _PyCodec_SearchCache = PyDict_New(); 428 if (_PyCodec_SearchPath == NULL || 429 _PyCodec_SearchCache == NULL) 430 Py_FatalError("can't initialize codec registry"); 431} 432 433void _PyCodecRegistry_Fini() 434{ 435 Py_XDECREF(_PyCodec_SearchPath); 436 _PyCodec_SearchPath = NULL; 437 Py_XDECREF(_PyCodec_SearchCache); 438 _PyCodec_SearchCache = NULL; 439} 440