codecs.py revision b9fdb7a452c2b6f7a628118b5f695bd061b62cc8
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import builtins, sys

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class CodecInfo(tuple):
    """Codec details when looking up the codec registry"""

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish test encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None,
        *, _is_text_encoding=None):
        # The underlying 4-tuple keeps the legacy (encode, decode,
        # streamreader, streamwriter) layout so callers that index the
        # result of lookup() like the old plain tuple keep working.
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0, the initial-state marker
        return self.buffer or 0

    def setstate(self, state):
        # 0 (the initial-state marker) maps back to an empty buffer
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create a IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        """ Set the output stream's position; a seek back to the start
            also resets the codec buffers (a rewind invalidates any
            pending shift state).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReader(Codec):

    # type used for the internal character buffer (subclasses may
    # override, e.g. with bytes for byte-to-byte codecs)
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # salvage everything up to the error; if that still
                    # yields at most one line, re-raise the original error
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # no line ending found yet: grow the read size (capped) and retry
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Delegate to the wrapped StreamReader. """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Delegate to the wrapped StreamReader. """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegate to the wrapped StreamReader. """
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Delegate to the wrapped StreamWriter. """
        return self.writer.write(data)

    def writelines(self, list):

        """ Delegate to the wrapped StreamWriter. """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader's and the writer's codec buffers. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Reposition the stream; the reader state is always reset,
            the writer only on a rewind to the start of the stream.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend Reader and re-encode to data_encoding. """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend Reader, re-encoded to
            data_encoding. """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read all remaining data, re-encode it and split into lines.
            sizehint is ignored. """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data from data_encoding and write it via the backend
            Writer. """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the strings and write them via .write()'s decode/encode
            path. """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset both the reader's and the writer's codec buffers. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codecs doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codecs doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Lookup up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Lookup up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using a IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    # flush: final=True lets the encoder emit any buffered tail
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using a IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    # flush: final=True lets the decoder emit any buffered tail
    output = decoder.decode(b"", True)
    if output:
        yield output

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return {i:i for i in rng}

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \u001a.

    """
    m = {}
    for k,v in decoding_map.items():
        if not v in m:
            m[v] = k
        else:
            m[v] = None
    return m

### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')