1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import __builtin__, sys
11
12### Registry and builtin stateless codec functions
13
try:
    from _codecs import *
except ImportError as why:
    # The registry and the builtin stateless codec functions are taken from
    # the _codecs module; without it this module cannot provide anything
    # useful, so the failure is reported as a SystemError.
    raise SystemError('Failed to load the builtin codecs: %s' % why)
18
# Names exported by "from codecs import *".  Entries that are not defined
# in this file (e.g. "register", "lookup", the *_errors handlers) are
# presumably supplied by the "from _codecs import *" above -- NOTE(review):
# confirm against the _codecs module.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
26
27### Constants
28
29#
30# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
31# and its possible byte string values
32# for UTF8/UTF16/UTF32 output and little/big endian machines
33#
34
# Fixed byte sequences of the BOM for each encoding form / byte order
BOM_UTF8 = '\xef\xbb\xbf'
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Short aliases for the two UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Native-endian variants, selected by the byte order of the running machine
if sys.byteorder == 'little':
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
else:
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE

# Plain BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
71
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Bundle of the entry points that make up one codec.

        The instance is a 4-tuple (encode, decode, streamreader,
        streamwriter); every entry point, including the incremental
        ones and the codec name, is also available as an attribute.

    """
    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the four classic entry points are part of the tuple itself
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.encode = encode
        info.decode = decode
        info.streamreader = streamreader
        info.streamwriter = streamwriter
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.name = name
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
91
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme. These string values are
        predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not store state in the Codec
            instance. Use StreamWriter for codecs which have to keep
            state in order to make encoding efficient.

            Zero length input must be handled; the result is then an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not store state in the Codec
            instance. Use StreamReader for codecs which have to keep
            state in order to make decoding efficient.

            Zero length input must be handled; the result is then an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
155
class IncrementalEncoder(object):
    """
    Base class for encoders that work in multiple steps: the input is handed
    to encode() piece by piece and the encoder keeps whatever state it needs
    between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the Codec class docstring for the predefined values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses; this base implementation always
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """
195
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always consume
    all of their input: whatever _buffer_encode() leaves unconsumed is kept
    in self.buffer and prepended to the input of the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input that has not been encoded yet, carried over between calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return a tuple
        # (output, length consumed)
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # any false state (e.g. the 0 from getstate()) means "empty buffer"
        if state:
            self.buffer = state
        else:
            self.buffer = ""
228
class IncrementalDecoder(object):
    """
    Base class for decoders that work in multiple steps: the input is handed
    to decode() piece by piece and the decoder keeps whatever state it needs
    between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the Codec class docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses; this base implementation always
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The result must be a (buffered_input, additional_state_info)
        tuple. buffered_input must be a bytes object holding the bytes
        that were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed. In the initial state and after reset(),
        getstate() must return (b"", 0), which is what this base
        implementation always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset(). The base
        implementation ignores state.
        """
277
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that must cope with
    incomplete byte sequences: whatever _buffer_decode() leaves unconsumed
    is kept in self.buffer and prepended to the input of the next decode()
    call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input that has not been decoded yet, carried over between calls
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return a tuple
        # (output, length consumed)
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # the second item (additional state info) is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored; the second item is ignored
        buffered = state[0]
        self.buffer = buffered
312
313#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
318#
319
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result to
            self.stream.
        """
        # the "length consumed" part of the encode() result is not needed
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenate the strings in list and write the result to
            the stream using .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream; a seek to the absolute
            start of the stream also resets the writer.
        """
        self.stream.seek(offset, whence)
        at_start = (whence == 0 and offset == 0)
        if at_start:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the writer on the
            underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the underlying stream
        self.stream.close()
390
391###
392
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = ""
        # decoded characters that have not been handed to the caller yet
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # list of already split lines cached by readline(); while it is
        # set, charbuffer is None and the lines hold all pending characters
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        """ Decode input and return a tuple (output object, length
            consumed). Must be provided by subclasses; this
            implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the bytes before the error; give up
                    # unless that yields at least one complete line
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method. In that case read() is only called once,
            so the result may be an incomplete line.

            If keepends is false, the line terminator is stripped
            from the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # read in growing chunks while no line end has been found
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries if keepends
            is true.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # Use the same empty value as __init__: an empty unicode string
        # here would promote the output of str->str codecs to unicode
        # after every reset()/seek().
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when readline() returns an empty
            line.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the underlying stream
        self.stream.close()
635
636###
637
638class StreamReaderWriter:
639
640    """ StreamReaderWriter instances allow wrapping streams which
641        work in both read and write modes.
642
643        The design is such that one can use the factory functions
644        returned by the codec.lookup() function to construct the
645        instance.
646
647    """
648    # Optional attributes set by the file wrappers below
649    encoding = 'unknown'
650
651    def __init__(self, stream, Reader, Writer, errors='strict'):
652
653        """ Creates a StreamReaderWriter instance.
654
655            stream must be a Stream-like object.
656
657            Reader, Writer must be factory functions or classes
658            providing the StreamReader, StreamWriter interface resp.
659
660            Error handling is done in the same way as defined for the
661            StreamWriter/Readers.
662
663        """
664        self.stream = stream
665        self.reader = Reader(stream, errors)
666        self.writer = Writer(stream, errors)
667        self.errors = errors
668
669    def read(self, size=-1):
670
671        return self.reader.read(size)
672
673    def readline(self, size=None):
674
675        return self.reader.readline(size)
676
677    def readlines(self, sizehint=None):
678
679        return self.reader.readlines(sizehint)
680
681    def next(self):
682
683        """ Return the next decoded line from the input stream."""
684        return self.reader.next()
685
686    def __iter__(self):
687        return self
688
689    def write(self, data):
690
691        return self.writer.write(data)
692
693    def writelines(self, list):
694
695        return self.writer.writelines(list)
696
697    def reset(self):
698
699        self.reader.reset()
700        self.writer.reset()
701
702    def seek(self, offset, whence=0):
703        self.stream.seek(offset, whence)
704        self.reader.reset()
705        if whence == 0 and offset == 0:
706            self.writer.reset()
707
708    def __getattr__(self, name,
709                    getattr=getattr):
710
711        """ Inherit all other methods from the underlying stream.
712        """
713        return getattr(self.stream, name)
714
715    # these are needed to make "with codecs.open(...)" work properly
716
717    def __enter__(self):
718        return self
719
720    def __exit__(self, type, value, tb):
721        self.stream.close()
722
723###
724
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data returned by .read() and passed to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the reader and return the data re-encoded with
            the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the reader and return it re-encoded
            with the frontend encode function.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the reader, re-encode it and return
            it as a list of lines including their line ends.
            sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and pass
            the result to the writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the strings in list, decode the result with
            the frontend decode function and pass it to the writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Reposition the stream and drop stale codec state.

            Without this method a seek would be forwarded by
            __getattr__ straight to the stream, leaving data buffered
            by the reader in place. The reader is always reset; the
            writer only for a seek to the absolute start of the
            stream (same policy as StreamReaderWriter.seek).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the underlying stream
        self.stream.close()
838
839### Shortcuts
840
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified, and a 'U' in the
        mode is dropped. This is done to avoid data loss due to
        encodings using 8-bit values. The default file mode is 'rb'
        meaning to open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Errors from the builtin open() and from the codec lookup
        (LookupError for an unknown encoding) are passed on to the
        caller; in the latter case the file is closed first.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Deliberately catches everything: whatever goes wrong while
        # building the wrapper, the file opened above must not be leaked.
        # The exception is re-raised unchanged.
        file.close()
        raise
889
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which transparently translates
        between two encodings.

        Strings written to the wrapped file are interpreted according
        to data_encoding and then written to the original file as
        string using file_encoding. The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Strings read from the file are interpreted according to
        file_encoding and handed to the caller as string using
        data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned object carries the two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name and can be used for introspection.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend codec: what the caller reads and writes
    frontend = lookup(data_encoding)
    # backend codec: what is stored in the file
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
925
926### Helpers for codec lookup
927
def getencoder(encoding):

    """ Return the stateless encoder function of the codec
        registered for the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
937
def getdecoder(encoding):

    """ Return the stateless decoder function of the codec
        registered for the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
947
def getincrementalencoder(encoding):

    """ Return the IncrementalEncoder class or factory function of
        the codec registered for the given encoding.

        A LookupError is raised if the encoding cannot be found or
        if its codec does not provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder registered.
    raise LookupError(encoding)
961
def getincrementaldecoder(encoding):

    """ Return the IncrementalDecoder class or factory function of
        the codec registered for the given encoding.

        A LookupError is raised if the encoding cannot be found or
        if its codec does not provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder registered.
    raise LookupError(encoding)
975
def getreader(encoding):

    """ Return the StreamReader class or factory function of the
        codec registered for the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
985
def getwriter(encoding):

    """ Return the StreamWriter class or factory function of the
        codec registered for the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
995
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string produced by iterator through an
    IncrementalEncoder for the given encoding and yields each
    non-empty piece of encoded output. Once the iterator is
    exhausted the encoder is flushed with a final call, and that
    result is yielded too if it is non-empty.

    errors and kwargs are handed on unchanged to the
    IncrementalEncoder constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    enc = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = enc.encode(chunk)
        if not encoded:
            # Nothing produced for this chunk; do not yield empties.
            continue
        yield encoded
    # Final call (final=True) flushes whatever the encoder still holds.
    tail = enc.encode("", True)
    if tail:
        yield tail
1013
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every string produced by iterator through an
    IncrementalDecoder for the given encoding and yields each
    non-empty piece of decoded output. Once the iterator is
    exhausted the decoder is flushed with a final call, and that
    result is yielded too if it is non-empty.

    errors and kwargs are handed on unchanged to the
    IncrementalDecoder constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    dec = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = dec.decode(chunk)
        if not decoded:
            # Nothing produced for this chunk; do not yield empties.
            continue
        yield decoded
    # Final call (final=True) flushes whatever the decoder still holds.
    tail = dec.decode("", True)
    if tail:
        yield tail
1031
1032### Helpers for charmap-based codecs
1033
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both a key and its own value.

    """
    return {item: item for item in rng}
1046
def make_encoding_map(decoding_map):

    """ Invert a decoding map to obtain an encoding map.

        Each value of decoding_map becomes a key that maps back to
        the key it was stored under. When the same value appears
        more than once in decoding_map, the inverse is ambiguous and
        the value is mapped to None (undefined mapping) instead, so
        that the charmap codec raises an exception when it meets
        that value during translation.

        cp875.py is one example: it decodes several characters to
        U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous and stays None from then on.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
1067
1068### error handlers
1069
try:
    # Resolve every standard error handler in one go; map() performs
    # all lookups before any of the names below is bound.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, (
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace",
    ))
except LookupError:
    # In --disable-unicode builds these error handlers are missing,
    # so every name is defined as None instead.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
1083
# Tell modulefinder that using codecs probably needs the encodings
# package.
#
# The import below never executes (_false is 0, so the branch is
# not taken); it is present only so that the statement appears in
# this module for tools that scan for imports.
# NOTE(review): presumably a named flag is used rather than a literal
# so that the branch is not optimised away at compile time -- confirm.
_false = 0
if _false:
    import encodings
1089
1090### Tests
1091
if __name__ == '__main__':

    # Writes to stdout: the program supplies Latin-1 data and the
    # underlying stream receives it re-encoded as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Reads from stdin: the underlying stream is interpreted as
    # Latin-1 and handed to the program re-encoded as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')
1099