# codecs.py revision 3aeb632c3152fa082132ce55b9a880e0d16b04ae
1""" codecs -- Python Codec Registry, API and helpers. 2 3 4Written by Marc-Andre Lemburg (mal@lemburg.com). 5 6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8"""#" 9 10import struct, __builtin__ 11 12### Registry and builtin stateless codec functions 13 14try: 15 from _codecs import * 16except ImportError, why: 17 raise SystemError,\ 18 'Failed to load the builtin codecs: %s' % why 19 20__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", 21 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", 22 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", 23 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", 24 "strict_errors", "ignore_errors", "replace_errors", 25 "xmlcharrefreplace_errors", 26 "register_error", "lookup_error"] 27 28### Constants 29 30# 31# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) 32# and its possible byte string values 33# for UTF8/UTF16/UTF32 output and little/big endian machines 34# 35 36# UTF-8 37BOM_UTF8 = '\xef\xbb\xbf' 38 39# UTF-16, little endian 40BOM_LE = BOM_UTF16_LE = '\xff\xfe' 41 42# UTF-16, big endian 43BOM_BE = BOM_UTF16_BE = '\xfe\xff' 44 45# UTF-32, little endian 46BOM_UTF32_LE = '\xff\xfe\x00\x00' 47 48# UTF-32, big endian 49BOM_UTF32_BE = '\x00\x00\xfe\xff' 50 51# UTF-16, native endianness 52BOM = BOM_UTF16 = struct.pack('=H', 0xFEFF) 53 54# UTF-32, native endianness 55BOM_UTF32 = struct.pack('=L', 0x0000FEFF) 56 57# Old broken names (don't use in new code) 58BOM32_LE = BOM_UTF16_LE 59BOM32_BE = BOM_UTF16_BE 60BOM64_LE = BOM_UTF32_LE 61BOM64_BE = BOM_UTF32_BE 62 63 64### Codec base classes (defining the API) 65 66class Codec: 67 68 """ Defines the interface for stateless encoders/decoders. 69 70 The .encode()/.decode() methods may implement different error 71 handling schemes by providing the errors argument. 
These 72 string values are defined: 73 74 'strict' - raise a ValueError error (or a subclass) 75 'ignore' - ignore the character and continue with the next 76 'replace' - replace with a suitable replacement character; 77 Python will use the official U+FFFD REPLACEMENT 78 CHARACTER for the builtin Unicode codecs. 79 80 """ 81 def encode(self, input, errors='strict'): 82 83 """ Encodes the object input and returns a tuple (output 84 object, length consumed). 85 86 errors defines the error handling to apply. It defaults to 87 'strict' handling. 88 89 The method may not store state in the Codec instance. Use 90 StreamCodec for codecs which have to keep state in order to 91 make encoding/decoding efficient. 92 93 The encoder must be able to handle zero length input and 94 return an empty object of the output object type in this 95 situation. 96 97 """ 98 raise NotImplementedError 99 100 def decode(self, input, errors='strict'): 101 102 """ Decodes the object input and returns a tuple (output 103 object, length consumed). 104 105 input must be an object which provides the bf_getreadbuf 106 buffer slot. Python strings, buffer objects and memory 107 mapped files are examples of objects providing this slot. 108 109 errors defines the error handling to apply. It defaults to 110 'strict' handling. 111 112 The method may not store state in the Codec instance. Use 113 StreamCodec for codecs which have to keep state in order to 114 make encoding/decoding efficient. 115 116 The decoder must be able to handle zero length input and 117 return an empty object of the output object type in this 118 situation. 119 120 """ 121 raise NotImplementedError 122 123# 124# The StreamWriter and StreamReader class provide generic working 125# interfaces which can be used to implement new encoding submodules 126# very easily. See encodings/utf_8.py for an example on how this is 127# done. 
#

class StreamWriter(Codec):

    """ Wraps a writable stream and encodes everything written to it
        using the .encode() method of the (sub)class.

    """
    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The encoding is done by self.encode() using the error
            handling scheme stored in self.errors.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamReader(Codec):

    """ Wraps a readable stream and decodes everything read from it
        using the .decode() method of the (sub)class.

    """
    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            If a sliced read ends in the middle of an encoded
            sequence, additional single bytes are read from the stream
            and decoding is retried; the ValueError is re-raised if
            the stream is exhausted or the retry limit is exceeded.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        tries = 0
        while True:
            try:
                decoded, consumed = decode(data, self.errors)
            except ValueError:
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made
                tries = tries + 1
                newdata = read(1)
                if not newdata or tries > 10:
                    # Re-raise the ValueError from the decoder.
                    raise
                data = data + newdata
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]


    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.

        """
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        # Decode first, then split; the true argument keeps the line
        # ends in the returned entries.
        return self.decode(data, self.errors)[0].splitlines(True)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base implementation does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Delegates to the .read() method of the reader.
        """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Delegates to the .readline() method of the reader.
        """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegates to the .readlines() method of the reader.
        """
        return self.reader.readlines(sizehint)

    def write(self, data):

        """ Delegates to the .write() method of the writer.
        """
        return self.writer.write(data)

    def writelines(self, list):

        """ Delegates to the .writelines() method of the writer.
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Resets both the reader and the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned as encoded data to the
        caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Reads via the reader and returns the result re-encoded
            with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Reads one line via the reader and returns it re-encoded
            with the frontend encode function.

            size, if given, is passed on to the reader's .readline().
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Reads via the reader's .read() method, re-encodes the data
            with the frontend encode function and returns it split
            into a list of lines (line ends included).

            sizehint, if given, is passed on to the reader's .read().
        """
        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def write(self, data):

        """ Decodes data with the frontend decode function and writes
            the result via the writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenates the list of strings, decodes the result with
            the frontend decode function and writes it via the writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Resets both the reader and the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is passed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed again before the exception
        is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        # Only the stream reader/writer factories are needed here.
        sr, sw = lookup(encoding)[2:]
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # Cleanup only: do not leak the open file handle when the
        # encoding is unknown; the exception is always re-raised.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: stateless encoder/decoder of the data encoding.
    encode, decode = lookup(data_encoding)[:2]
    # Backend: stream reader/writer factories of the file encoding.
    Reader, Writer = lookup(file_encoding)[2:]
    sr = StreamRecoder(file,
                       encode, decode, Reader, Writer,
                       errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[0]

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[1]

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[2]

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[3]

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            # Ambiguous reverse mapping: mark the target as undefined.
            m[v] = None
    return m

### error handlers

strict_errors = lookup_error("strict")
ignore_errors = lookup_error("ignore")
replace_errors = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    import sys

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')