1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import __builtin__, sys
11
12### Registry and builtin stateless codec functions
13
14try:
15    from _codecs import *
16except ImportError, why:
17    raise SystemError('Failed to load the builtin codecs: %s' % why)
18
19__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
20           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
22           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
24           "StreamReader", "StreamWriter",
25           "StreamReaderWriter", "StreamRecoder",
26           "getencoder", "getdecoder", "getincrementalencoder",
27           "getincrementaldecoder", "getreader", "getwriter",
28           "encode", "decode", "iterencode", "iterdecode",
29           "strict_errors", "ignore_errors", "replace_errors",
30           "xmlcharrefreplace_errors", "backslashreplace_errors",
31           "register_error", "lookup_error"]
32
33### Constants
34
35#
36# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
37# and its possible byte string values
38# for UTF8/UTF16/UTF32 output and little/big endian machines
39#
40
# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_UTF16_LE = '\xff\xfe'
BOM_LE = BOM_UTF16_LE

# UTF-16, big endian
BOM_UTF16_BE = '\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# UTF-16, native endianness (selected from the byte order of this machine)
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM = BOM_UTF16

# UTF-32, native endianness
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE

# Old broken names (don't use in new code): despite the "32"/"64" in
# their names they are plain aliases of the UTF-16/UTF-32 marks above.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
77
78
79### Codec base classes (defining the API)
80
class CodecInfo(tuple):

    """ Bundle of the callables and factories that make up one codec.

        The object itself is the 4-tuple (encode, decode, streamreader,
        streamwriter). Every constructor argument, including the ones
        that are not part of the tuple, is also stored as an attribute
        of the same name.

    """
    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the first four entries form the tuple value; the
        # incremental factories and the name are attribute-only.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.encode = encode
        info.decode = decode
        info.streamreader = streamreader
        info.streamwriter = streamwriter
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.name = name
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
97
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations must not keep state on the Codec
            instance. Codecs that need state to encode efficiently
            should be built on the stream or incremental classes of
            this module instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot, e.g. Python strings, buffer objects and
            memory mapped files.

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations must not keep state on the Codec
            instance. Codecs that need state to decode efficiently
            should be built on the stream or incremental classes of
            this module instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
161
class IncrementalEncoder(object):
    """
    Base class for encoders that are fed their input piece by piece.

    Each piece is handed to encode(); the encoder keeps whatever state
    it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme; the predefined values
        are listed in the docstring of the Codec class.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to its initial state (a no-op in this base class).
        """

    def getstate(self):
        """
        Return the current state of the encoder (always 0 in this base class).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder to state, which must have
        been returned by getstate() (a no-op in this base class).
        """
201
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always
    consume all of their input at once: whatever _buffer_encode() leaves
    unconsumed is kept in self.buffer and prepended to the next input.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend what the previous call could not encode
        pending = self.buffer + input
        (output, used) = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # a false state (such as the 0 reported for an empty buffer)
        # maps back to the empty buffer
        self.buffer = state if state else ""
234
class IncrementalDecoder(object):
    """
    Base class for decoders that are fed their input piece by piece.

    Each piece is handed to decode(); the decoder keeps whatever state
    it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme; the predefined values
        are listed in the docstring of the Codec class.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to its initial state (a no-op in this base class).
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple:
        buffered_input is a bytes object holding the bytes that were
        passed to decode() but have not been converted yet, and
        additional_state_info is a non-negative integer describing the
        decoder state WITHOUT the contents of buffered_input having
        been processed.  In the initial state and after reset() the
        result must be (b"", 0), which is what this base class returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder to state, which must have
        been returned by getstate().  setstate((b"", 0)) must have the
        same effect as reset() (a no-op in this base class).
        """
283
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: whatever _buffer_decode() leaves
    unconsumed is kept in self.buffer and prepended to the next input.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend what the previous call could not decode
        pending = self.buffer + input
        (output, used) = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # the buffer is the whole state; the additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored; the additional state
        # info is ignored
        self.buffer = state[0]
318
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which make it very easy to implement new encoding
# submodules. See encodings/utf_8.py for an example of how this is
# done.
#
325
class StreamWriter(Codec):

    """ Codec that encodes the objects written to it and passes the
        result on to an underlying stream. Attributes not defined
        here are looked up on that stream.

    """
    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used by
            write(). These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() using self.errors and
            write the encoded data to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given strings and hand the result to .write()
            in a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state
            which allows appending fresh data without having to
            rescan the whole stream to recover state.

            This base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream; the writer is reset
            only when seeking to the absolute position 0.
        """
        self.stream.seek(offset, whence)
        rewound = (whence == 0 and offset == 0)
        if rewound:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the writer to the
            underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the underlying stream
        self.stream.close()
396
397###
398
399class StreamReader(Codec):
400
401    def __init__(self, stream, errors='strict'):
402
403        """ Creates a StreamReader instance.
404
405            stream must be a file-like object open for reading
406            (binary) data.
407
408            The StreamReader may use different error handling
409            schemes by providing the errors keyword argument. These
410            parameters are predefined:
411
412             'strict' - raise a ValueError (or a subclass)
413             'ignore' - ignore the character and continue with the next
414             'replace'- replace with a suitable replacement character;
415
416            The set of allowed parameter values can be extended via
417            register_error.
418        """
419        self.stream = stream
420        self.errors = errors
421        self.bytebuffer = ""
422        # For str->str decoding this will stay a str
423        # For str->unicode decoding the first read will promote it to unicode
424        self.charbuffer = ""
425        self.linebuffer = None
426
427    def decode(self, input, errors='strict'):
428        raise NotImplementedError
429
430    def read(self, size=-1, chars=-1, firstline=False):
431
432        """ Decodes data from the stream self.stream and returns the
433            resulting object.
434
435            chars indicates the number of characters to read from the
436            stream. read() will never return more than chars
437            characters, but it might return less, if there are not enough
438            characters available.
439
440            size indicates the approximate maximum number of bytes to
441            read from the stream for decoding purposes. The decoder
442            can modify this setting as appropriate. The default value
443            -1 indicates to read and decode as much as possible.  size
444            is intended to prevent having to decode huge files in one
445            step.
446
447            If firstline is true, and a UnicodeDecodeError happens
448            after the first line terminator in the input only the first line
449            will be returned, the rest of the input will be kept until the
450            next call to read().
451
452            The method should use a greedy read strategy meaning that
453            it should read as much data as is allowed within the
454            definition of the encoding and the given size, e.g.  if
455            optional encoding endings or state markers are available
456            on the stream, these should be read too.
457        """
458        # If we have lines cached, first merge them back into characters
459        if self.linebuffer:
460            self.charbuffer = "".join(self.linebuffer)
461            self.linebuffer = None
462
463        # read until we get the required number of characters (if available)
464        while True:
465            # can the request be satisfied from the character buffer?
466            if chars >= 0:
467                if len(self.charbuffer) >= chars:
468                    break
469            elif size >= 0:
470                if len(self.charbuffer) >= size:
471                    break
472            # we need more data
473            if size < 0:
474                newdata = self.stream.read()
475            else:
476                newdata = self.stream.read(size)
477            # decode bytes (those remaining from the last call included)
478            data = self.bytebuffer + newdata
479            try:
480                newchars, decodedbytes = self.decode(data, self.errors)
481            except UnicodeDecodeError, exc:
482                if firstline:
483                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
484                    lines = newchars.splitlines(True)
485                    if len(lines)<=1:
486                        raise
487                else:
488                    raise
489            # keep undecoded bytes until the next call
490            self.bytebuffer = data[decodedbytes:]
491            # put new characters in the character buffer
492            self.charbuffer += newchars
493            # there was no data available
494            if not newdata:
495                break
496        if chars < 0:
497            # Return everything we've got
498            result = self.charbuffer
499            self.charbuffer = ""
500        else:
501            # Return the first chars characters
502            result = self.charbuffer[:chars]
503            self.charbuffer = self.charbuffer[chars:]
504        return result
505
506    def readline(self, size=None, keepends=True):
507
508        """ Read one line from the input stream and return the
509            decoded data.
510
511            size, if given, is passed as size argument to the
512            read() method.
513
514        """
515        # If we have lines cached from an earlier read, return
516        # them unconditionally
517        if self.linebuffer:
518            line = self.linebuffer[0]
519            del self.linebuffer[0]
520            if len(self.linebuffer) == 1:
521                # revert to charbuffer mode; we might need more data
522                # next time
523                self.charbuffer = self.linebuffer[0]
524                self.linebuffer = None
525            if not keepends:
526                line = line.splitlines(False)[0]
527            return line
528
529        readsize = size or 72
530        line = ""
531        # If size is given, we call read() only once
532        while True:
533            data = self.read(readsize, firstline=True)
534            if data:
535                # If we're at a "\r" read one extra character (which might
536                # be a "\n") to get a proper line ending. If the stream is
537                # temporarily exhausted we return the wrong line ending.
538                if data.endswith("\r"):
539                    data += self.read(size=1, chars=1)
540
541            line += data
542            lines = line.splitlines(True)
543            if lines:
544                if len(lines) > 1:
545                    # More than one line result; the first line is a full line
546                    # to return
547                    line = lines[0]
548                    del lines[0]
549                    if len(lines) > 1:
550                        # cache the remaining lines
551                        lines[-1] += self.charbuffer
552                        self.linebuffer = lines
553                        self.charbuffer = None
554                    else:
555                        # only one remaining line, put it back into charbuffer
556                        self.charbuffer = lines[0] + self.charbuffer
557                    if not keepends:
558                        line = line.splitlines(False)[0]
559                    break
560                line0withend = lines[0]
561                line0withoutend = lines[0].splitlines(False)[0]
562                if line0withend != line0withoutend: # We really have a line end
563                    # Put the rest back together and keep it until the next call
564                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
565                    if keepends:
566                        line = line0withend
567                    else:
568                        line = line0withoutend
569                    break
570            # we didn't get anything or this was our only try
571            if not data or size is not None:
572                if line and not keepends:
573                    line = line.splitlines(False)[0]
574                break
575            if readsize<8000:
576                readsize *= 2
577        return line
578
579    def readlines(self, sizehint=None, keepends=True):
580
581        """ Read all lines available on the input stream
582            and return them as list of lines.
583
584            Line breaks are implemented using the codec's decoder
585            method and are included in the list entries.
586
587            sizehint, if given, is ignored since there is no efficient
588            way to finding the true end-of-line.
589
590        """
591        data = self.read()
592        return data.splitlines(keepends)
593
594    def reset(self):
595
596        """ Resets the codec buffers used for keeping state.
597
598            Note that no stream repositioning should take place.
599            This method is primarily intended to be able to recover
600            from decoding errors.
601
602        """
603        self.bytebuffer = ""
604        self.charbuffer = u""
605        self.linebuffer = None
606
607    def seek(self, offset, whence=0):
608        """ Set the input stream's current position.
609
610            Resets the codec buffers used for keeping state.
611        """
612        self.stream.seek(offset, whence)
613        self.reset()
614
615    def next(self):
616
617        """ Return the next decoded line from the input stream."""
618        line = self.readline()
619        if line:
620            return line
621        raise StopIteration
622
623    def __iter__(self):
624        return self
625
626    def __getattr__(self, name,
627                    getattr=getattr):
628
629        """ Inherit all other methods from the underlying stream.
630        """
631        return getattr(self.stream, name)
632
633    def __enter__(self):
634        return self
635
636    def __exit__(self, type, value, tb):
637        self.stream.close()
638
639###
640
641class StreamReaderWriter:
642
643    """ StreamReaderWriter instances allow wrapping streams which
644        work in both read and write modes.
645
646        The design is such that one can use the factory functions
647        returned by the codec.lookup() function to construct the
648        instance.
649
650    """
651    # Optional attributes set by the file wrappers below
652    encoding = 'unknown'
653
654    def __init__(self, stream, Reader, Writer, errors='strict'):
655
656        """ Creates a StreamReaderWriter instance.
657
658            stream must be a Stream-like object.
659
660            Reader, Writer must be factory functions or classes
661            providing the StreamReader, StreamWriter interface resp.
662
663            Error handling is done in the same way as defined for the
664            StreamWriter/Readers.
665
666        """
667        self.stream = stream
668        self.reader = Reader(stream, errors)
669        self.writer = Writer(stream, errors)
670        self.errors = errors
671
672    def read(self, size=-1):
673
674        return self.reader.read(size)
675
676    def readline(self, size=None):
677
678        return self.reader.readline(size)
679
680    def readlines(self, sizehint=None):
681
682        return self.reader.readlines(sizehint)
683
684    def next(self):
685
686        """ Return the next decoded line from the input stream."""
687        return self.reader.next()
688
689    def __iter__(self):
690        return self
691
692    def write(self, data):
693
694        return self.writer.write(data)
695
696    def writelines(self, list):
697
698        return self.writer.writelines(list)
699
700    def reset(self):
701
702        self.reader.reset()
703        self.writer.reset()
704
705    def seek(self, offset, whence=0):
706        self.stream.seek(offset, whence)
707        self.reader.reset()
708        if whence == 0 and offset == 0:
709            self.writer.reset()
710
711    def __getattr__(self, name,
712                    getattr=getattr):
713
714        """ Inherit all other methods from the underlying stream.
715        """
716        return getattr(self.stream, name)
717
718    # these are needed to make "with codecs.open(...)" work properly
719
720    def __enter__(self):
721        return self
722
723    def __exit__(self, type, value, tb):
724        self.stream.close()
725
726###
727
728class StreamRecoder:
729
730    """ StreamRecoder instances provide a frontend - backend
731        view of encoding data.
732
733        They use the complete set of APIs returned by the
734        codecs.lookup() function to implement their task.
735
736        Data written to the stream is first decoded into an
737        intermediate format (which is dependent on the given codec
738        combination) and then written to the stream using an instance
739        of the provided Writer class.
740
741        In the other direction, data is read from the stream using a
742        Reader instance and then return encoded data to the caller.
743
744    """
745    # Optional attributes set by the file wrappers below
746    data_encoding = 'unknown'
747    file_encoding = 'unknown'
748
749    def __init__(self, stream, encode, decode, Reader, Writer,
750                 errors='strict'):
751
752        """ Creates a StreamRecoder instance which implements a two-way
753            conversion: encode and decode work on the frontend (the
754            input to .read() and output of .write()) while
755            Reader and Writer work on the backend (reading and
756            writing to the stream).
757
758            You can use these objects to do transparent direct
759            recodings from e.g. latin-1 to utf-8 and back.
760
761            stream must be a file-like object.
762
763            encode, decode must adhere to the Codec interface, Reader,
764            Writer must be factory functions or classes providing the
765            StreamReader, StreamWriter interface resp.
766
767            encode and decode are needed for the frontend translation,
768            Reader and Writer for the backend translation. Unicode is
769            used as intermediate encoding.
770
771            Error handling is done in the same way as defined for the
772            StreamWriter/Readers.
773
774        """
775        self.stream = stream
776        self.encode = encode
777        self.decode = decode
778        self.reader = Reader(stream, errors)
779        self.writer = Writer(stream, errors)
780        self.errors = errors
781
782    def read(self, size=-1):
783
784        data = self.reader.read(size)
785        data, bytesencoded = self.encode(data, self.errors)
786        return data
787
788    def readline(self, size=None):
789
790        if size is None:
791            data = self.reader.readline()
792        else:
793            data = self.reader.readline(size)
794        data, bytesencoded = self.encode(data, self.errors)
795        return data
796
797    def readlines(self, sizehint=None):
798
799        data = self.reader.read()
800        data, bytesencoded = self.encode(data, self.errors)
801        return data.splitlines(1)
802
803    def next(self):
804
805        """ Return the next decoded line from the input stream."""
806        data = self.reader.next()
807        data, bytesencoded = self.encode(data, self.errors)
808        return data
809
810    def __iter__(self):
811        return self
812
813    def write(self, data):
814
815        data, bytesdecoded = self.decode(data, self.errors)
816        return self.writer.write(data)
817
818    def writelines(self, list):
819
820        data = ''.join(list)
821        data, bytesdecoded = self.decode(data, self.errors)
822        return self.writer.write(data)
823
824    def reset(self):
825
826        self.reader.reset()
827        self.writer.reset()
828
829    def __getattr__(self, name,
830                    getattr=getattr):
831
832        """ Inherit all other methods from the underlying stream.
833        """
834        return getattr(self.stream, name)
835
836    def __enter__(self):
837        return self
838
839    def __exit__(self, type, value, tb):
840        self.stream.close()
841
842### Shortcuts
843
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified, and a 'U' in mode
        is dropped. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the file is closed again before the exception is propagated.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            # set() is required here: with a plain string the test
            # would also succeed for an empty mode ('' in 'rwa' is true)
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Deliberately catches everything (the exception is re-raised
        # unchanged): the file opened above must not be left open when
        # e.g. the encoding is unknown.
        file.close()
        raise
892
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which transparently translates
        between two encodings.

        Strings written to the wrapped file are interpreted according
        to data_encoding and then written to the original file as
        string using file_encoding. The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Strings read from the file are interpreted according to
        file_encoding and handed to the caller as string using
        data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        For introspection the returned object carries the two extra
        attributes .data_encoding and .file_encoding which reflect
        the given parameters of the same name.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend codec: what the caller sees; backend codec: what is
    # stored in the file
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.file_encoding = file_encoding
    recoder.data_encoding = data_encoding
    return recoder
928
929### Helpers for codec lookup
930
def getencoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its encoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
940
def getdecoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its decoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
950
def getincrementalencoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its IncrementalEncoder class or factory function.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder.
    raise LookupError(encoding)
964
def getincrementaldecoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its IncrementalDecoder class or factory function.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder.
    raise LookupError(encoding)
978
def getreader(encoding):

    """ Look up the codec registered for the given encoding and
        return its StreamReader class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
988
def getwriter(encoding):

    """ Look up the codec registered for the given encoding and
        return its StreamWriter class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
998
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string produced by iterator through a single
    IncrementalEncoder for the given encoding and yields each
    non-empty piece of output as it becomes available. Once the
    iterator is exhausted the encoder is flushed and any remaining
    non-empty output is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encoder = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Signal end of input so the encoder can emit any pending output.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1016
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every string produced by iterator through a single
    IncrementalDecoder for the given encoding and yields each
    non-empty piece of output as it becomes available. Once the
    iterator is exhausted the decoder is flushed and any remaining
    non-empty output is yielded as well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decoder = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Signal end of input so the decoder can emit any pending output.
    tail = decoder.decode("", True)
    if tail:
        yield tail
1034
1035### Helpers for charmap-based codecs
1036
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a new dictionary in which every element
        of rng is used as a key whose value is that same element.

    """
    return dict((item, item) for item in rng)
1049
def make_encoding_map(decoding_map):

    """ Invert a decoding map into an encoding map.

        Every value of decoding_map becomes a key of the result,
        mapped to the key it was reached from. A value that is
        reached from more than one key is ambiguous and is mapped
        to None (undefined mapping) instead, which causes an
        exception when encountered by the charmap codec during
        translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for code, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous.
            encoding_map[target] = None
        else:
            encoding_map[target] = code
    return encoding_map
1070
1071### error handlers
1072
try:
    # Bind the builtin error handlers to module-level names; either
    # all five lookups succeed or the fallback below is used.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, (
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
1086
# Tell modulefinder that using codecs probably needs the encodings
# package.
#
# The import below is never executed: _false is 0, so the guarded
# branch is not taken at runtime. It is here only so that an
# "import encodings" statement is present in this module.
# NOTE(review): presumably a named flag is used rather than a literal
# 0 so that the statement is not compiled away -- confirm.
_false = 0
if _false:
    import encodings
1092
1093### Tests
1094
if __name__ == '__main__':

    # Data written to stdout is taken to be Latin-1 and is emitted
    # as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data arriving on stdin is taken to be Latin-1 and is handed to
    # the program as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')
1102