# codecs.py revision e99d5ea25ba994491c773d9b5872332334ccd1c5
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import struct
import types

# The module holding the builtin open() is named __builtin__ on
# Python 2 and builtins on Python 3; bind whichever exists to the name
# the rest of this file uses.
try:
    import __builtin__
except ImportError:
    import builtins as __builtin__

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError:
    # The exception is fetched via sys.exc_info() instead of being bound
    # in the except clause, so that this statement parses on every
    # Python version.
    import sys
    raise SystemError(
        'Failed to load the builtin codecs: %s' % (sys.exc_info()[1],))

__all__ = ["register","lookup","open","EncodedFile","BOM","BOM_BE",
           "BOM_LE","BOM32_BE","BOM32_LE","BOM64_BE","BOM64_LE",
           "Codec","StreamWriter","StreamReader",
           "StreamReaderWriter","StreamRecoder"]

### Constants

#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#
# BOM is U+FEFF packed as a 2-byte value in the native byte order of
# the running platform.
BOM = struct.pack('=H',0xFEFF)
#
BOM_BE = BOM32_BE = '\376\377'
#       corresponds to Unicode U+FEFF in UTF-16 on big endian
#       platforms == ZERO WIDTH NO-BREAK SPACE
BOM_LE = BOM32_LE = '\377\376'
#       corresponds to Unicode U+FFFE in UTF-16 on little endian
#       platforms == defined as being an illegal Unicode character

#
# 4-byte (UCS-4) Byte Order Marks.  Despite the "64" in their names
# these literals are four bytes long.
#
BOM64_BE = '\000\000\376\377'
#       corresponds to Unicode U+0000FEFF in UCS-4
BOM64_LE = '\377\376\000\000'
#       corresponds to Unicode U+0000FFFE in UCS-4


### Codec base classes (defining the API)

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may implement different error
        handling schemes by providing the errors argument. These
        string values are defined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs.

    """
    def encode(self,input,errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

    def decode(self,input,errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encodings submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is passed through self.encode() using the
            error handling given to the constructor; the consumed
            length reported by the encoder is not used.
        """
        data, consumed = self.encode(object,self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation keeps no state and does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            The getattr default argument only binds the builtin as a
            local name; it is not meant to be passed by callers.
        """
        return getattr(self.stream,name)

###

class StreamReader(Codec):

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            If decoding the sliced data raises a ValueError, one more
            byte at a time is read and decoding is retried; the error
            is re-raised once the stream is exhausted or after more
            than 10 failed attempts.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        i = 0
        while 1:
            try:
                object, decodedbytes = decode(data, self.errors)
            except ValueError:
                # The slice may have ended in the middle of a multi-byte
                # sequence.  This method is slow but should work under
                # pretty much all conditions; at most 10 tries are made
                i = i + 1
                newdata = read(1)
                if not newdata or i > 10:
                    raise
                data = data + newdata
            else:
                return object

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line,self.errors)[0]


    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.  When it is omitted (None) the
            whole remaining stream is read.

        """
        # The default must be None: a default of 0 would be passed on as
        # stream.read(0) and no data would ever be returned.
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data,self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base implementation keeps no state and does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            The getattr default argument only binds the builtin as a
            local name; it is not meant to be passed by callers.
        """
        return getattr(self.stream,name)

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self,stream,Reader,Writer,errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Delegates to the wrapped reader's .read() method.
        """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Delegates to the wrapped reader's .readline() method.
        """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegates to the wrapped reader's .readlines() method.
        """
        return self.reader.readlines(sizehint)

    def write(self,data):

        """ Delegates to the wrapped writer's .write() method.
        """
        return self.writer.write(data)

    def writelines(self,list):

        """ Delegates to the wrapped writer's .writelines() method.
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Resets both the wrapped reader and the wrapped writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            The getattr default argument only binds the builtin as a
            local name; it is not meant to be passed by callers.
        """
        return getattr(self.stream,name)

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Reads via the backend reader and returns the data
            re-encoded with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self,size=None):

        """ Reads one line via the backend reader and returns it
            re-encoded with the frontend encode function.

            size, if given, is passed to the reader's .readline().
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self,sizehint=None):

        """ Reads via the backend reader's .read(), re-encodes the
            result with the frontend encode function and returns it
            split into a list of lines (line breaks included).

            sizehint, if given, is passed to the reader's .read().
        """
        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def write(self,data):

        """ Decodes data with the frontend decode function and writes
            the result using the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self,list):

        """ Concatenates the list of strings, decodes the result with
            the frontend decode function and writes it using the
            backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Resets both the backend reader and the backend writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            The getattr default argument only binds the builtin as a
            local name; it is not meant to be passed by callers.
        """
        return getattr(self.stream,name)

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    (e,d,sr,sw) = lookup(encoding)
    srw = StreamReaderWriter(file, sr, sw, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # lookup() returns (encoder, decoder, stream reader, stream writer):
    # the stateless pair serves the caller-facing side, the stream pair
    # the file-facing side.
    encode, decode = lookup(data_encoding)[:2]
    Reader, Writer = lookup(file_encoding)[2:]
    sr = StreamRecoder(file,
                       encode,decode,Reader,Writer,
                       errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i]=i
    return res

### Tests

if __name__ == '__main__':

    import sys

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')