codecs.py revision b9fdb7a452c2b6f7a628118b5f695bd061b62cc8
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import builtins, sys

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

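# Illustrative sketch only: code that receives raw UTF-16 data of unknown
# byte order usually consumes a leading BOM before decoding (the 'utf-16'
# codec already does this internally), along the lines of:
#
#     raw = BOM_UTF16_LE + 'hi'.encode('utf-16-le')
#     if raw.startswith(BOM_UTF16_LE):
#         text = raw[len(BOM_UTF16_LE):].decode('utf-16-le')
#     elif raw.startswith(BOM_UTF16_BE):
#         text = raw[len(BOM_UTF16_BE):].decode('utf-16-be')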

### Codec base classes (defining the API)

class CodecInfo(tuple):
    """Codec details when looking up the codec registry"""

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish test encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None,
        *, _is_text_encoding=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))

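# Usage sketch (illustrative): registry entries are CodecInfo instances,
# which behave both as 4-tuples and as attribute records:
#
#     info = lookup('utf-8')
#     encoder, decoder, reader, writer = info
#     data, consumed = info.encode('caf\xe9')    # (b'caf\xc3\xa9', 4)
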
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

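# Error handling sketch (illustrative): the same text and bytes run through
# several of the predefined handlers listed above.
#
#     'caf\xe9'.encode('ascii', 'replace')            # b'caf?'
#     'caf\xe9'.encode('ascii', 'xmlcharrefreplace')  # b'caf&#233;'
#     'caf\xe9'.encode('ascii', 'backslashreplace')   # b'caf\\xe9'
#     b'caf\xe9'.decode('ascii', 'ignore')            # 'caf'
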
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the input in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Override this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""

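# Incremental encoding sketch (illustrative): text arrives in pieces and the
# encoder is flushed with final=True on the last piece.
#
#     enc = getincrementalencoder('utf-8')()
#     out = enc.encode('Euro: ') + enc.encode('\u20ac', final=True)
#     # out == b'Euro: \xe2\x82\xac'
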
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Override this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]

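# Incremental decoding sketch (illustrative): a multi-byte sequence split
# across chunks is buffered until it can be decoded completely.
#
#     dec = getincrementaldecoder('utf-8')()
#     dec.decode(b'\xe2\x82')             # '' (incomplete, buffered)
#     dec.decode(b'\xac', final=True)     # '\u20ac'
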
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

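# StreamWriter sketch (illustrative): wrap a binary stream so that str
# objects written to it are encoded on the way out.
#
#     import io
#     buf = io.BytesIO()
#     writer = getwriter('utf-8')(buf)
#     writer.write('\u20ac10')
#     # buf.getvalue() == b'\xe2\x82\xac10'
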
###

class StreamReader(Codec):

    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().
            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

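# StreamReader sketch (illustrative): wrap a binary stream so that bytes
# read from it are decoded on the way in.
#
#     import io
#     reader = getreader('utf-8')(io.BytesIO(b'line 1\nline 2\n'))
#     reader.readline()      # 'line 1\n'
#     reader.read()          # 'line 2\n'
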
###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interfaces,
            respectively.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

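# StreamReaderWriter sketch (illustrative): combine a reader and a writer
# around a single read/write stream; open() below returns one of these.
#
#     import io
#     info = lookup('utf-8')
#     srw = StreamReaderWriter(io.BytesIO(), info.streamreader,
#                              info.streamwriter)
#     srw.write('abc')
#     srw.seek(0)
#     srw.read()             # 'abc'
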
###

class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces, respectively.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

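# open() sketch (illustrative; the path is hypothetical):
#
#     f = open('example.txt', 'w', encoding='utf-8')
#     f.write('caf\xe9 \u20ac')    # encoded as UTF-8 on the way out
#     f.close()
#     # f.encoding == 'utf-8'
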
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

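# EncodedFile sketch (illustrative): bytes written in latin-1 are stored as
# UTF-8 in the underlying file (a StreamRecoder does the work).
#
#     import io
#     backing = io.BytesIO()
#     ef = EncodedFile(backing, 'latin-1', 'utf-8')
#     ef.write(b'caf\xe9')
#     # backing.getvalue() == b'caf\xc3\xa9'
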
### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode(b"", True)
    if output:
        yield output

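# iterencode()/iterdecode() sketch (illustrative): chunk boundaries need not
# respect character boundaries.
#
#     chunks = [b'\xe2', b'\x82', b'\xac']
#     ''.join(iterdecode(chunks, 'utf-8'))       # '\u20ac'
#     b''.join(iterencode(['\u20ac'], 'utf-8'))  # b'\xe2\x82\xac'
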
### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return {i:i for i in rng}

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py, which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k,v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m

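# Charmap helper sketch (illustrative): build the maps for a toy codec that
# maps byte 0x80 to U+20AC and leaves ASCII untouched.
#
#     decoding_map = make_identity_dict(range(128))
#     decoding_map[0x80] = 0x20ac
#     encoding_map = make_encoding_map(decoding_map)
#     # encoding_map[0x20ac] == 0x80
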
### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
