# codecs.py revision 3fed0870a6fec72665068e09200c674b574dabdb
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import builtins
import sys

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

# Every name listed here must exist at module level, otherwise
# "from codecs import *" fails.  register, lookup, register_error and
# lookup_error come from the _codecs star-import above.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class CodecInfo(tuple):

    """ Describes one codec.

        The object is a 4-tuple (encode, decode, streamreader,
        streamwriter) that additionally exposes every component,
        plus the incremental classes and the codec name, as
        attributes.

    """
    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
               (self.__class__.__module__, self.__class__.__name__,
                self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encodes the buffered leftovers plus input and returns the output.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0, like the base class state
        return self.buffer or 0

    def setstate(self, state):
        # the inverse of getstate(): a falsy state means "empty buffer"
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decodes the buffered leftovers plus input and returns the output.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    """ Encodes objects with self.encode() and writes the result to
        an underlying stream.

    """
    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReader(Codec):

    """ Reads bytes from an underlying stream and returns them
        decoded with self.decode().

    """
    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = b""
        # decoded characters that have not been handed out yet; set to
        # None by readline() while self.linebuffer holds the pending data
        self.charbuffer = ""
        # list of already split lines cached by readline(), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the bytes in front of the error; this
                    # is good enough when they contain a complete first line
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the read size for long lines, up to a cap
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position.

            The reader's buffers are discarded; the writer is reset
            when the stream is rewound to its very beginning.

        """
        # Reader and writer share one stream, so it must be moved exactly
        # once: seeking through both of them would apply a relative
        # offset (whence=1) twice.
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        # The frontend works on encoded data, i.e. bytes, so the pieces
        # have to be joined with a bytes separator.
        data = b''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
    except BaseException:
        # Don't leak the open file when the codec lookup or the wrapper
        # construction fails.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codecs doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codecs doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Lookup up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Lookup up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using a IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    # flush whatever the encoder is still holding back
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using a IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    # flush whatever the decoder is still holding back
    output = decoder.decode(b"", True)
    if output:
        yield output

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return {i: i for i in rng}

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            # ambiguous target: mark it as undefined
            m[v] = None
    return m

### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')