# codecs.py revision e84b6336db4a2521de91aa916676bdf494aa8205
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import builtins
import sys

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class CodecInfo(tuple):

    """ Bundles the entry points of one codec.

        The tuple itself holds (encode, decode, streamreader,
        streamwriter); every constructor argument is additionally
        stored as an attribute of the same name.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0, like the base class state
        return self.buffer or 0

    def setstate(self, state):
        # a falsy state (e.g. the 0 returned by getstate()) means "empty"
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder. This must be a
        (buffered_input, additional_state_info) tuple.  By convention,
        additional_state_info should represent the state of the decoder
        WITHOUT yet having processed the contents of buffered_input.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()

###

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = b""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the bytes in front of the error and
                    # accept the result if it contains a complete line
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Delegates to the reader's read()."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Delegates to the reader's readline()."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegates to the reader's readlines()."""
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Delegates to the writer's write()."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Delegates to the writer's writelines()."""
        return self.writer.writelines(list)

    def reset(self):

        """ Resets both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Reads through the backend reader and returns the data
            re-encoded with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Reads one line through the backend reader and returns it
            re-encoded with the frontend encode function.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Reads everything through the backend reader, re-encodes it
            with the frontend encode function and returns the result
            split into lines (line ends included). sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decodes data with the frontend decode function and writes
            the result through the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenates the chunks in list, decodes the result with the
            frontend decode function and writes it through the backend
            writer.

            The chunks are joined with an empty object of their own type,
            so both bytes chunks (the usual frontend format) and str
            chunks (for str->str codecs) are accepted.
        """
        chunks = tuple(list)
        # Joining with '' unconditionally raised TypeError for bytes
        # chunks; derive the separator from the first chunk instead.
        # An empty sequence keeps the historical '' so nothing changes
        # for callers that relied on it.
        separator = chunks[0][:0] if chunks else ''
        data = separator.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Resets both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Don't leak the file handle when the codec cannot be set up
        file.close()
        raise

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codecs doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codecs doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    # flush whatever the encoder is still holding back
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    # flush whatever the decoder is still holding back
    output = decoder.decode(b"", True)
    if output:
        yield output

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return {i: i for i in rng}

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \u001a.

    """
    m = {}
    for k,v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m

### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')