codecs.py revision e84b6336db4a2521de91aa916676bdf494aa8205
1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import builtins, sys
11
12### Registry and builtin stateless codec functions
13
14try:
15    from _codecs import *
16except ImportError as why:
17    raise SystemError('Failed to load the builtin codecs: %s' % why)
18
# Names exported by "from codecs import *".  open, EncodedFile and the BOM
# constants are defined further down in this file; register, lookup,
# register_error, lookup_error and the *_errors handlers are not defined
# in this part of the file -- presumably they are provided by the
# "from _codecs import *" above (TODO confirm).
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
26
27### Constants
28
29#
30# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
31# and its possible byte string values
32# for UTF8/UTF16/UTF32 output and little/big endian machines
33#
34
35# UTF-8
# UTF-8 encoded byte order mark
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16 byte order marks, one per byte order
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 byte order marks, one per byte order
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Native endianness: select the pair matching sys.byteorder
if sys.byteorder == 'little':
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_LE, BOM_UTF32_LE
else:
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_BE, BOM_UTF32_BE

# BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the 32/64 in the
# names they alias the UTF-16 and UTF-32 marks respectively
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
71
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Bundle of a codec's entry points.

        The object is a 4-tuple (encode, decode, streamreader,
        streamwriter) and additionally exposes those four items, the
        incremental encoder/decoder factories and the codec name as
        attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None):
        info = super().__new__(
            cls, (encode, decode, streamreader, streamwriter))
        # Attributes are set in a fixed order so the instance __dict__
        # always has the same layout.
        for attr, value in (
                ("name", name),
                ("encode", encode),
                ("decode", decode),
                ("incrementalencoder", incrementalencoder),
                ("incrementaldecoder", incrementaldecoder),
                ("streamwriter", streamwriter),
                ("streamreader", streamreader)):
            setattr(info, attr, value)
        return info

    def __repr__(self):
        klass = self.__class__
        details = (klass.__module__, klass.__name__, self.name, id(self))
        return "<%s.%s object for encoding %s at 0x%x>" % details
93
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme. Predefined values:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - skip the offending character and carry on
         'replace' - substitute a suitable replacement character;
                    the builtin Unicode codecs use U+FFFD REPLACEMENT
                    CHARACTER when decoding and '?' when encoding.
         'xmlcharrefreplace' - substitute the appropriate XML
                               character reference (encoding only).
         'backslashreplace'  - substitute backslashed escape sequences
                               (encoding only).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling; the default is
            'strict'.

            Implementations may not keep state on the Codec
            instance. Codecs that need state should be built on the
            stream classes (StreamWriter/StreamReader) instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input is expected to be an object exposing a readable
            buffer (for example bytes or a memory mapped file).

            errors selects the error handling; the default is
            'strict'.

            Implementations may not keep state on the Codec
            instance. Codecs that need state should be built on the
            stream classes (StreamWriter/StreamReader) instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()
157
class IncrementalEncoder(object):
    """
    Base class for encoders that are fed their input piece by piece.

    Each piece is handed to encode(); the encoder keeps whatever state it
    needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme; the predefined values
        are listed in the docstring of the Codec class.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the encoder back into its initial state (a no-op here).
        """
        return None

    def getstate(self):
        """
        Return the current encoder state; this base class reports 0.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate() (a no-op
        here).
        """
        return None
197
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always consume
    all of their input: whatever _buffer_encode() leaves unconsumed is
    kept in self.buffer and prepended to the input of the next encode()
    call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet consumed by _buffer_encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError()

    def encode(self, input, final=False):
        # leftover from the previous call goes in front of the new input
        pending = self.buffer + input
        (output, used) = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # any false state (such as the 0 from getstate()) means "empty"
        self.buffer = state if state else ""
231
class IncrementalDecoder(object):
    """
    Base class for decoders that are fed their input piece by piece.

    Each piece is handed to decode(); the decoder keeps whatever state it
    needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme; the predefined values
        are listed in the docstring of the Codec class.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the decoder back into its initial state (a no-op here).
        """
        return None

    def getstate(self):
        """
        Return the current decoder state as a
        (buffered_input, additional_state_info) tuple.  By convention,
        additional_state_info describes the decoder state BEFORE the
        contents of buffered_input have been processed.  This base class
        reports no buffered input and 0 as additional state.
        """
        no_pending_input = b""
        return (no_pending_input, 0)

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate() (a no-op
        here).
        """
        return None
273
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: whatever _buffer_decode() leaves
    unconsumed is kept in self.buffer and prepended to the input of the
    next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input not yet consumed by _buffer_decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError()

    def decode(self, input, final=False):
        # leftover from the previous call goes in front of the new input
        pending = self.buffer + input
        (output, used) = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # the additional state info slot is always 0
        pending = self.buffer
        return (pending, 0)

    def setstate(self, state):
        # only the buffered input is restored; the additional state info
        # is ignored
        pending = state[0]
        self.buffer = pending
309
310#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
315#
316
class StreamWriter(Codec):

    """ Codec base class that encodes objects and writes the result to
        an underlying stream. Attributes not found on the writer are
        looked up on that stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme. Predefined
            values:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the offending character and carry on
             'replace'- substitute a suitable replacement character
             'xmlcharrefreplace' - substitute the appropriate XML
                                   character reference.
             'backslashreplace'  - substitute backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.stream, self.errors = stream, errors

    def write(self, object):

        """ Encode object with self.encode() and write the encoded
            data to self.stream.
        """
        # the consumed length reported by the encoder is not needed here
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and hand the result to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state
            that allows appending fresh data without rescanning the
            whole stream to recover state. This base implementation
            does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every other attribute to the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        # the writer is its own context manager target
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
382
383###
384
class StreamReader(Codec):

    """ Codec base class that reads bytes from an underlying stream
        and hands out decoded data via read(), readline(),
        readlines() and iteration. Subclasses must supply decode().
        Attributes not found on the reader are looked up on the
        stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that decode() has not consumed yet
        self.bytebuffer = b""
        # decoded characters that have not been handed out yet
        self.charbuffer = ""
        # list of decoded lines cached by readline(); while it is set,
        # charbuffer is None and the pending data lives in this list
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        """ Decode input and return an (output, length consumed) tuple.

            Must be overridden by subclasses; this base implementation
            raises NotImplementedError.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            When both size and chars are negative the stream is read
            until it returns no more data, even if characters are
            still buffered from an earlier readline() or read() call;
            the buffered characters are returned in front of the newly
            decoded ones.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # Can the request be satisfied from the character buffer?
            # Only a bounded request can be: with neither chars nor size
            # given the caller asked for everything, so leftovers in the
            # buffer must not stop us from draining the stream.
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the bytes in front of the error; the
                    # error is tolerated if that yields more than one line
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; read() is then called only once.

            If keepends is false the line terminator is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        # start with small reads; the size doubles on every retry below
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
628
629###
630
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

        Reading is delegated to a reader object, writing to a writer
        object; both wrap the same stream. Attributes not found on
        the wrapper are looked up on that stream.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return decoded data obtained from the reader's read(size)."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return one decoded line obtained from the reader."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the list of decoded lines obtained from the reader."""
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Encode data and write it to the stream via the writer."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Encode and write the strings in list via the writer."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the buffers/state of both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position and return whatever the
            stream's own seek() returns.

            The reader's buffers are discarded, since data buffered
            before the seek no longer matches the new position. The
            writer is reset only when seeking to the very start of
            the stream.

        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
709
710###
711
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.

        Attributes not found on the recoder are looked up on the
        underlying stream.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encoder."""
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it encoded
            with the frontend encoder."""
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, encode it with the
            frontend encoder and return it split into lines (line ends
            kept). sizehint is ignored."""
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next line from the input stream, encoded with
            the frontend encoder."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the result
            via the backend writer."""
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the chunks in list, decode the result with the
            frontend decoder and write it via the backend writer.

            The chunks are joined using an empty object of their own
            type, so both bytes chunks (what byte oriented decoders
            expect) and str chunks are accepted. An empty list is
            treated as empty bytes.

        """
        chunks = tuple(list)
        # A '' separator would raise TypeError for bytes chunks; derive
        # the separator from the first chunk instead.
        separator = chunks[0][:0] if chunks else b''
        data = separator.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the buffers/state of both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position and return whatever the
            stream's own seek() returns.

            The reader's buffers are discarded, since data buffered
            before the seek no longer matches the new position. The
            writer is reset only when seeking to the very start of
            the stream.

        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
825
826### Shortcuts
827
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the file object returned by the builtin
        open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed on to the builtin open() API. It defaults
        to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Nobody else holds a reference to the file yet, so close it
        # here instead of leaking the handle.
        file.close()
        raise
871
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Data written to the wrapper is interpreted according to
        data_encoding and written to the original file using
        file_encoding. Data read from the file is interpreted
        according to file_encoding and handed back to the caller in
        data_encoding. The intermediate format depends on the
        specified codecs.

        When file_encoding is omitted, data_encoding is used for
        both sides.

        errors selects the error handling; it defaults to 'strict'.

        The returned wrapper carries the attributes .data_encoding
        and .file_encoding holding the encoding names that were used,
        which programs can use for introspection.

    """
    backend_encoding = data_encoding if file_encoding is None else file_encoding
    # the frontend codec is looked up first, then the backend codec
    frontend = lookup(data_encoding)
    backend = lookup(backend_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Record the encodings on the wrapper to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_encoding
    return recoder
907
908### Helpers for codec lookup
909
def getencoder(encoding):

    """ Look up the codec for the given encoding and return its
        encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
919
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return its
        decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
929
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return its
        IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder registered.
    raise LookupError(encoding)
943
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return its
        IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder registered.
    raise LookupError(encoding)
957
def getreader(encoding):

    """ Look up the codec for the given encoding and return its
        StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
967
def getwriter(encoding):

    """ Look up the codec for the given encoding and return its
        StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
977
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields each non-empty piece of output.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            # Nothing was produced for this chunk; emit nothing.
            continue
        yield encoded
    # Final call tells the encoder that the input is exhausted so it
    # can emit whatever it still has to output.
    tail = encoder.encode("", True)
    if tail:
        yield tail
995
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input chunks from the iterator using an
    IncrementalDecoder and yields each non-empty piece of output.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            # Nothing was produced for this chunk; emit nothing.
            continue
        yield decoded
    # Final call tells the decoder that the input is exhausted so it
    # can deal with whatever it still holds.
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
1013
1014### Helpers for charmap-based codecs
1015
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of the
        rng sequence is both a key and that key's value.

    """
    return {item: item for item in rng}
1028
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        Every value of the decoding map becomes a key that maps back
        to the key it was stored under. If a target occurs more than
        once in the decoding map, it is mapped to None (undefined
        mapping) instead, causing an exception when encountered by
        the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # The first sighting maps back to its source; any repeat marks
        # the target as ambiguous, and it stays None from then on.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
1049
1050### error handlers
1051
try:
    # Bind the registered error handlers to module-level names.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = [
        lookup_error(handler_name)
        for handler_name in ("strict", "ignore", "replace",
                             "xmlcharrefreplace", "backslashreplace")
    ]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
1065
# Tell modulefinder that using codecs probably needs the encodings
# package
# The guard is a module-level name bound to 0, so the import below is
# never executed at runtime; the statement is kept only so that an import
# of encodings is present in this module's code.
# NOTE(review): presumably a plain name is used instead of a literal so
# the branch is not discarded at compile time -- confirm before changing.
_false = 0
if _false:
    import encodings
1071
1072### Tests
1073
if __name__ == '__main__':

    # Runs only when this module is executed as a script: it rebinds
    # sys.stdout and sys.stdin to EncodedFile wrappers around the
    # original stream objects.
    # NOTE(review): this passes the sys.stdout / sys.stdin objects
    # themselves to EncodedFile -- confirm these streams accept and return
    # the data types the recoder exchanges with the wrapped file.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
1081