codecs.py revision 3fed0870a6fec72665068e09200c674b574dabdb
1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import builtins, sys
11
12### Registry and builtin stateless codec functions
13
14try:
15    from _codecs import *
16except ImportError as why:
17    raise SystemError('Failed to load the builtin codecs: %s' % why)
18
# Public names exported by ``from codecs import *``.
__all__ = [
    # registry access and file helpers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # error handlers and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors",
    "register_error", "lookup_error",
]
26
27### Constants
28
29#
30# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
31# and its possible byte string values
32# for UTF8/UTF16/UTF32 output and little/big endian machines
33#
34
# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian and big endian
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian and big endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Native endianness: pick the variant matching the running machine
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the "32"/"64" in
# their names they alias the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
71
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Bundle of the callables and classes that make up one codec.

        An instance is the 4-tuple (encode, decode, streamreader,
        streamwriter) and additionally exposes every constructor
        argument as an attribute of the same name.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the four classic entries are part of the tuple itself;
        # the incremental classes and the name are attributes only.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.streamreader = streamreader
        info.streamwriter = streamwriter
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
93
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme. Predefined values:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling and defaults to
            'strict'.

            Implementations may not keep state on the Codec
            instance; codecs that need state to work efficiently
            should be built on the stream or incremental classes
            instead.

            Zero length input must be accepted and must yield an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input is expected to expose its data through the buffer
            interface; bytes objects and memory mapped files are
            examples of such objects.

            errors selects the error handling and defaults to
            'strict'.

            Implementations may not keep state on the Codec
            instance; codecs that need state to work efficiently
            should be built on the stream or incremental classes
            instead.

            Zero length input must be accepted and must yield an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()
157
class IncrementalEncoder(object):
    """
    Base class for encoders that work on their input piece by piece.

    Pieces are handed to encode() one after another; the encoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Set up the encoder.

        errors names the error handling scheme and is stored on the
        instance as self.errors. An empty string buffer is stored as
        self.buffer.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the result.

        Must be provided by subclasses; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the encoder back into its initial state.

        The base implementation has nothing to reset.
        """
        return None

    def getstate(self):
        """
        Return the current encoder state.

        The base implementation is stateless and always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().

        The base implementation ignores state.
        """
        return None
197
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always
    encode everything they are given: input that _buffer_encode()
    reports as not consumed is kept in self.buffer and prepended to
    the input of the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input carried over from earlier encode() calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return a tuple
        # (output, length consumed).
        raise NotImplementedError()

    def encode(self, input, final=False):
        # Prepend whatever was left over from the previous call.
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # Everything past the consumed prefix waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0.
        return self.buffer if self.buffer else 0

    def setstate(self, state):
        # Any false state (such as 0 from getstate()) clears the buffer.
        self.buffer = state if state else ""
231
class IncrementalDecoder(object):
    """
    Base class for decoders that work on their input piece by piece.

    Pieces are handed to decode() one after another; the decoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Set up the decoder.

        errors names the error handling scheme and is stored on the
        instance as self.errors.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the result.

        Must be provided by subclasses; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the decoder back into its initial state.

        The base implementation has nothing to reset.
        """
        return None

    def getstate(self):
        """
        Return the current decoder state.

        The state is a tuple (buffered_input, additional_state_info):
        buffered_input is a bytes object holding the bytes that were
        passed to decode() but have not been converted yet, and
        additional_state_info is a non-negative integer describing the
        decoder state BEFORE buffered_input is processed.  In the
        initial state and after reset() the result must be (b"", 0),
        which is what this base implementation always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().

        setstate((b"", 0)) must have the same effect as reset().  The
        base implementation ignores state.
        """
        return None
280
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: bytes that _buffer_decode() reports as
    not consumed are kept in self.buffer and prepended to the input of
    the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # bytes carried over from earlier decode() calls
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return a tuple
        # (output, length consumed).
        raise NotImplementedError()

    def decode(self, input, final=False):
        # Prepend whatever was left over from the previous call.
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # Everything past the consumed prefix waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The buffer is the whole state; the additional state info
        # slot is always 0.
        return (self.buffer, 0)

    def setstate(self, state):
        # Only the buffered input is restored; the additional state
        # info in state[1] is ignored.
        self.buffer = state[0]
316
317#
318# The StreamWriter and StreamReader class provide generic working
319# interfaces which can be used to implement new encoding submodules
320# very easily. See encodings/utf_8.py for an example on how this is
321# done.
322#
323
class StreamWriter(Codec):

    """ Encoding wrapper around a stream opened for writing.

        Objects passed to .write() are run through self.encode()
        (to be supplied by a subclass) and the encoded result is
        written to the wrapped stream. Attributes not defined here
        are looked up on the wrapped stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme that is passed
            to .encode(). Predefined values:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object and write the result to self.stream.
        """
        # The "length consumed" part of the encoder result is not needed.
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenate the strings in list and write them with a
            single .write() call.
        """
        text = ''.join(list)
        self.write(text)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state,
            so that fresh data can be appended without rescanning
            the whole stream to recover state.

            The base implementation does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the writer to the
            underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
389
390###
391
class StreamReader(Codec):

    """ Decoding wrapper around a stream opened for reading.

        Bytes read from the wrapped stream are run through
        self.decode() (to be supplied by a subclass) and handed out
        as decoded characters or lines. Attributes not defined here
        are looked up on the wrapped stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Create a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            errors selects the error handling scheme that is passed
            to .decode(). Predefined values:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = b""
        # decoded characters that have not been handed out yet
        self.charbuffer = ""
        # list of decoded lines cached by readline(); None when unused
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError()

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decode data from self.stream and return the resulting
            object.

            chars is the number of characters to read. read() never
            returns more than chars characters, but may return fewer
            when not enough are available.

            size is the approximate maximum number of bytes to read
            from the stream for decoding purposes. The decoder can
            modify this setting as appropriate. The default of -1
            means to read and decode as much as possible. size is
            meant to avoid having to decode huge files in one step.

            If firstline is true and a UnicodeDecodeError happens
            after the first line terminator in the input, only the
            data before the error is decoded and buffered; the rest
            of the input is kept until the next call to read().

            A greedy read strategy should be used: as much data as
            the encoding definition and size allow should be read,
            e.g. optional encoding endings or state markers on the
            stream should be read too.
        """
        # Lines cached by readline() go back into the character buffer.
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # Keep reading until the request can be served (if possible).
        while True:
            # Can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            elif self.charbuffer:
                break
            # No: fetch more bytes from the stream.
            chunk = self.stream.read() if size < 0 else self.stream.read(size)
            # Bytes left undecoded by the previous round come first.
            pending = self.bytebuffer + chunk
            try:
                decoded, used = self.decode(pending, self.errors)
            except UnicodeDecodeError as exc:
                if not firstline:
                    raise
                # Retry with only the data in front of the error; that
                # is acceptable only if it holds more than one line.
                decoded, used = self.decode(pending[:exc.start], self.errors)
                if len(decoded.splitlines(True)) <= 1:
                    raise
            # Undecoded bytes wait for the next round / next call.
            self.bytebuffer = pending[used:]
            self.charbuffer += decoded
            # The stream had nothing more to offer.
            if not chunk:
                break
        if chars < 0:
            # Hand out everything that is buffered.
            result, self.charbuffer = self.charbuffer, ""
        else:
            # Hand out the first chars characters, keep the rest.
            result, self.charbuffer = \
                self.charbuffer[:chars], self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; in that case read() is called only once
            for the bulk data.

            If keepends is false the line terminator is stripped.

        """
        # Lines cached by an earlier call are returned unconditionally.
        if self.linebuffer:
            line = self.linebuffer.pop(0)
            if len(self.linebuffer) == 1:
                # Back to charbuffer mode: the last cached line might
                # need more data next time.
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        while True:
            data = self.read(readsize, firstline=True)
            # A trailing "\r" may be the first half of "\r\n": read one
            # more character to get a proper line ending. If the stream
            # is temporarily exhausted the wrong line ending is returned.
            if data and data.endswith("\r"):
                data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if len(lines) > 1:
                # Several lines: the first one is complete.
                line = lines.pop(0)
                if len(lines) > 1:
                    # Cache the remaining lines, with any buffered
                    # characters appended to the last one.
                    lines[-1] += self.charbuffer
                    self.linebuffer = lines
                    self.charbuffer = None
                else:
                    # A single remaining line goes back into charbuffer.
                    self.charbuffer = lines[0] + self.charbuffer
                if not keepends:
                    line = line.splitlines(False)[0]
                break
            if lines:
                with_end = lines[0]
                without_end = with_end.splitlines(False)[0]
                if with_end != without_end:
                    # The single line really is terminated; keep the
                    # rest until the next call.
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    line = with_end if keepends else without_end
                    break
            # Nothing was read, or this was the only permitted try.
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # Read bigger chunks while hunting for the line end.
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream and return
            them as a list of lines.

            The data is obtained with a single read() call and then
            split into lines; line terminators are kept in the list
            entries unless keepends is false.

            sizehint is ignored.

        """
        return self.read().splitlines(keepends)

    def reset(self):

        """ Reset the codec buffers used for keeping state.

            No stream repositioning takes place. This method is
            primarily intended to allow recovering from decoding
            errors.

        """
        self.linebuffer = None
        self.charbuffer = ""
        self.bytebuffer = b""

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            The codec buffers used for keeping state are reset first.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the reader to the
            underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
635
636###
637
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

        Read calls are forwarded to self.reader, write calls to
        self.writer; any other attribute is looked up on the wrapped
        stream.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return decoded data obtained from self.reader.read(size)."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return one decoded line from self.reader.readline(size)."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the decoded lines from self.reader.readlines(sizehint)."""
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Encode data and write it via self.writer.write()."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Encode and write the strings in list via
            self.writer.writelines().
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the codec state of both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position.

            The underlying stream is repositioned exactly once. Asking
            both the reader and the writer to seek would move the
            stream twice for relative seeks (whence=1), because each
            of them ends up seeking the same shared stream.

            The reader's buffers are always reset, since their content
            belongs to the old position. The writer is reset only when
            rewinding to the very start of the stream (offset 0,
            whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
720
721###
722
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp. Each factory is
            called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data
            re-encoded with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encoder.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, re-encode it with
            the frontend encoder and return it split into lines (line
            terminators kept). sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encoder.
        """
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the
            result via the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the items of list, decode the result with the
            frontend decoder and write it via the backend writer.

            The items may be str or bytes-like objects; the joining
            separator is chosen to match the first item, so that lists
            of bytes work as well as lists of str. An empty list
            writes nothing and returns None.
        """
        chunks = tuple(list)
        if not chunks:
            # Nothing to decode or write.
            return None
        # ''.join() raises TypeError on bytes items, so pick the empty
        # separator of the matching type.
        joiner = '' if isinstance(chunks[0], str) else b''
        data = joiner.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the codec state of both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position and return the result
            of the underlying stream's seek().

            The reader's buffers are reset because data decoded at the
            old position must not be returned after the seek. The
            writer is reset only when rewinding to the very start of
            the stream (offset 0, whence 0).
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
836
837### Shortcuts
838
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the object returned by the builtin
        open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed through to the builtin open() API. It
        defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        propagated.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
    except BaseException:
        # Do not leak the open file when the encoding is unknown or
        # the wrapper cannot be built.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
882
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings transparently.

        Data written to the wrapper is decoded with the decoder of
        data_encoding and then written to file through the stream
        writer of file_encoding.

        Data read from the wrapper is read from file through the
        stream reader of file_encoding and handed to the caller
        encoded with the encoder of data_encoding.

        file_encoding defaults to data_encoding when not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned object carries the two extra attributes
        .data_encoding and .file_encoding, set to the encodings
        that are in effect, for introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend (caller side) codec first, then backend (file side).
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file, frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Record the encodings on the wrapper to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
918
919### Helpers for codec lookup
920
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
930
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
940
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder registered.
    raise LookupError(encoding)
954
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder registered.
    raise LookupError(encoding)
968
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
978
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
988
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields each non-empty encoded chunk.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            # Nothing produced for this chunk; don't yield empty output.
            continue
        yield encoded
    # Signal end of input so the encoder can emit any pending output.
    remainder = encoder.encode("", True)
    if remainder:
        yield remainder
1006
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input bytes from the iterator using an
    IncrementalDecoder and yields each non-empty decoded chunk.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            # Nothing produced for this chunk; don't yield empty output.
            continue
        yield decoded
    # Signal end of input so the decoder can emit any pending output.
    remainder = decoder.decode(b"", True)
    if remainder:
        yield remainder
1024
1025### Helpers for charmap-based codecs
1026
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of the rng
        iterable is mapped to itself.

    """
    return {item: item for item in rng}
1039
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Ambiguous reverse mapping: mark the target as undefined.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
1060
1061### error handlers
1062
# Expose the standard error handlers as module-level names.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # define every name as None so the module attributes still exist.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
1076
# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is 0, so the import below never executes at run
# time; it is only here so that an import statement for encodings
# appears in this module.
# NOTE(review): presumably a name is tested instead of a literal so
# the compiler does not drop the dead branch -- confirm before
# simplifying this.
_false = 0
if _false:
    import encodings
1082
1083### Tests
1084
if __name__ == '__main__':

    # EncodedFile passes bytes to (and expects bytes from) the stream
    # it wraps, while sys.stdout and sys.stdin are text streams.  Wrap
    # their underlying binary buffers instead, falling back to the
    # stream object itself when it has no .buffer attribute (e.g. the
    # stream was replaced by some other file-like object).

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(getattr(sys.stdout, 'buffer', sys.stdout),
                             'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(getattr(sys.stdin, 'buffer', sys.stdin),
                            'utf-8', 'latin-1')
1092