183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh""" codecs -- Python Codec Registry, API and helpers.
283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew HsiehWritten by Marc-Andre Lemburg (mal@lemburg.com).
583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh"""#"
983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
1083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsiehimport __builtin__, sys
1183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
1283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh### Registry and builtin stateless codec functions
1383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
# Pull the builtin registry and stateless codec functions into this
# namespace; without them the module cannot work, so a failed import is
# reported as a SystemError.
try:
    from _codecs import *
except ImportError as why:
    # "except ... as ..." is accepted by Python 2.6+ as well as Python 3,
    # unlike the older comma form.
    raise SystemError('Failed to load the builtin codecs: %s' % why)
1883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
# Names exported by "from codecs import *".
__all__ = [
    # registry functions and helpers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8",
    "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # error handlers and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors",
    "register_error", "lookup_error",
]
2683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
2783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh### Constants
2883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
2983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh#
3083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
3183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh# and its possible byte string values
3283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh# for UTF8/UTF16/UTF32 output and little/big endian machines
3383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh#
3483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 in both byte orders; BOM_LE / BOM_BE are short aliases
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 in both byte orders
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native endianness variants, selected from the byte order of the
# machine the module is imported on
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# BOM is the UTF-16 mark in native byte order
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
7183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
7283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
7383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh### Codec base classes (defining the API)
7483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class CodecInfo(tuple):

    """ Describes one codec.

        As a tuple the object is (encode, decode, streamreader,
        streamwriter); the same four values plus the incremental
        encoder/decoder and the encoding name are also available as
        attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the first four entries are stored in the tuple itself
        info = tuple.__new__(
            cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
9183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class Codec:

    """ Base interface for stateless encoders and decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        More values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance.
            Codecs that need state to encode efficiently should use
            StreamWriter instead.

            Zero length input must be supported; in that case an
            empty object of the output object type is returned.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance.
            Codecs that need state to decode efficiently should use
            StreamReader instead.

            Zero length input must be supported; in that case an
            empty object of the output object type is returned.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
15583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class IncrementalEncoder(object):
    """
    Base class for encoders that work on their input in several steps.

    Pieces of the input are handed to encode() one after the other; the
    encoder keeps track of the state of the encoding process between
    those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme; see the docstring of
        the Codec class for the predefined values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Subclasses must override this method; the base implementation
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the encoder back into its initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder.

        state must be a value previously returned by getstate(). The
        base implementation ignores it.
        """
19583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always encode
    everything they are given: input left over by one encode() call is
    buffered and prepended to the input of the next call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input received so far that has not been encoded yet
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return a tuple
        # (output, length consumed)
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend whatever the previous call left over
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # everything after the consumed part waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # any false state (such as 0) stands for an empty buffer
        if state:
            self.buffer = state
        else:
            self.buffer = ""
22883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class IncrementalDecoder(object):
    """
    Base class for decoders that work on their input in several steps.

    Pieces of the input are handed to decode() one after the other; the
    decoder keeps track of the state of the decoding process between
    those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme; see the docstring of
        the Codec class for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Subclasses must override this method; the base implementation
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the decoder back into its initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that
        were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer that
        describes the decoder state WITHOUT the contents of
        buffered_input having been processed.  In the initial state and
        after reset(), getstate() must return (b"", 0), which is what
        the base implementation always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must be a value previously returned by getstate().
        setstate((b"", 0)) must have the same effect as reset().  The
        base implementation ignores state.
        """
27783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: input left over by one decode() call is
    buffered and prepended to the input of the next call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input received so far that has not been decoded yet
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return a tuple
        # (output, length consumed)
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend whatever the previous call left over
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # everything after the consumed part waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # the buffer is the whole state; the additional info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored, the additional state
        # info is ignored
        self.buffer = state[0]
31283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
31983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding. The following values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            More values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() and write the result
            to self.stream.
        """
        # the consumed length reported by encode() is not used here
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and write the result to the
            stream with a single .write() call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the data on the output should be in a
            clean state, so that new fresh data can be appended
            without having to rescan the whole stream to recover
            state.

            The base implementation does nothing.

        """

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # only an absolute seek to offset 0 resets the codec state
        if whence == 0:
            if offset == 0:
                self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate the lookup of all other attributes to the
            underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the underlying stream
        self.stream.close()
39083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
39183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh###
39283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class StreamReader(Codec):

    """ Base class for readers that decode data read from a wrapped
        file-like object.

        Subclasses have to provide .decode(); this class adds the
        buffering, line splitting and iteration logic on top of it.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Data read from the stream that .decode() has not consumed yet
        self.bytebuffer = ""
        # Decoded data that has not been handed to the caller yet.
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # List of already split lines cached by readline(), or None.
        # While it is set, the pending data lives here and charbuffer
        # is None.
        self.linebuffer = None

    def decode(self, input, errors='strict'):

        """ Decodes input; has to be overridden by subclasses.

            read() calls this as self.decode(data, self.errors) and
            unpacks the result into the decoded object and the number
            of input items that were consumed.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the data in front of the offending
                    # position; the error is swallowed only if that part
                    # contains more than one line.
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; read() is then called only once, so the
            result may be an incomplete line.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer.pop(0)
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        # Without an explicit size start with a small read and double
        # it on every retry (see the end of the loop).
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            The complete input is decoded with read() and then split
            into lines; line endings are included in the list entries
            unless keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # Use the same empty str as __init__() and read() do: a unicode
        # buffer would force an implicit ASCII promotion of the data
        # returned by str->str decoders on the next read.
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            offset and whence are passed on to the seek() method of
            the underlying stream.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration once readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):

        """ The reader is its own iterator; see next()."""
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):

        """ Context manager entry; returns the reader itself."""
        return self

    def __exit__(self, type, value, tb):

        """ Context manager exit; closes the underlying stream."""
        self.stream.close()
63583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
63683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh###
63783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
63883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsiehclass StreamReaderWriter:
63983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
64083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    """ StreamReaderWriter instances allow wrapping streams which
64183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        work in both read and write modes.
64283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
64383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        The design is such that one can use the factory functions
64483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        returned by the codec.lookup() function to construct the
64583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        instance.
64683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
64783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    """
64883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    # Optional attributes set by the file wrappers below
64983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    encoding = 'unknown'
65083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
65183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def __init__(self, stream, Reader, Writer, errors='strict'):
65283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
65383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        """ Creates a StreamReaderWriter instance.
65483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
65583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh            stream must be a Stream-like object.
65683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
65783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh            Reader, Writer must be factory functions or classes
65883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh            providing the StreamReader, StreamWriter interface resp.
65983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
66083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh            Error handling is done in the same way as defined for the
66183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh            StreamWriter/Readers.
66283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
66383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        """
66483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        self.stream = stream
66583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        self.reader = Reader(stream, errors)
66683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        self.writer = Writer(stream, errors)
66783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        self.errors = errors
66883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
66983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def read(self, size=-1):
67083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
67183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        return self.reader.read(size)
67283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
67383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def readline(self, size=None):
67483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
67583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        return self.reader.readline(size)
67683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
67783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def readlines(self, sizehint=None):
67883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
67983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        return self.reader.readlines(sizehint)
68083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
68183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def next(self):
68283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
68383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        """ Return the next decoded line from the input stream."""
68483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        return self.reader.next()
68583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
68683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def __iter__(self):
68783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        return self
68883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
68983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def write(self, data):
69083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
69183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        return self.writer.write(data)
69283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
69383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def writelines(self, list):
69483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
69583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        return self.writer.writelines(list)
69683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
69783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def reset(self):
69883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
69983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        self.reader.reset()
70083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        self.writer.reset()
70183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
70283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def seek(self, offset, whence=0):
70383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        self.stream.seek(offset, whence)
70483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        self.reader.reset()
70583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        if whence == 0 and offset == 0:
70683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh            self.writer.reset()
70783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
70883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def __getattr__(self, name,
70983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh                    getattr=getattr):
71083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
71183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        """ Inherit all other methods from the underlying stream.
71283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        """
71383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        return getattr(self.stream, name)
71483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
71583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    # these are needed to make "with codecs.open(...)" work properly
71683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
71783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def __enter__(self):
71883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        return self
71983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
72083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh    def __exit__(self, type, value, tb):
72183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh        self.stream.close()
72283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
72383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh###
72483760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the underlying stream using
        an instance of the provided Writer class.

        In the other direction, data is read from the underlying
        stream using a Reader instance and then returned to the
        caller in encoded form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data returned by .read() and passed to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        # Reader and Writer both wrap the same underlying stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend Reader and return the data encoded
            with the frontend encode function.

            size is passed through to the Reader's .read() method.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend Reader and return it encoded
            with the frontend encode function.

            size, if given, is passed through to the Reader's
            .readline() method.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the backend Reader delivers, encode it with
            the frontend encode function and return it as a list of
            lines with the line ends kept.

            sizehint is accepted for file API compatibility only; it
            is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the backend Reader, encoded with
            the frontend encode function."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        """ The recoder is its own iterator; see .next(). """
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write the
            result via the backend Writer.

            Returns whatever the Writer's .write() method returns.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the given strings, decode the result with the frontend
            decode function and write it via the backend Writer.
        """
        # The strings are joined before decoding, so the decode
        # function sees the data as one piece.
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset both the backend Reader and the backend Writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream yet.
        """
        if name == 'stream':
            # Only reached when self.stream is not set (e.g. on an
            # instance created without running __init__).  Looking up
            # self.stream below would then re-enter __getattr__ without
            # end, so report the missing attribute instead.
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        """ Enter a "with" block; returns the recoder itself. """
        return self

    def __exit__(self, type, value, tb):
        """ Leave a "with" block by closing the underlying stream. """
        self.stream.close()
83883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
83983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh### Shortcuts
84083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified, and a 'U' in the
        mode is dropped. This is done to avoid data loss due to
        encodings using 8-bit values. The default file mode is 'rb'
        meaning to open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the creation of the wrapper fails, the
        file is closed again before the exception is propagated.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
    except BaseException:
        # Do not leak the freshly opened file handle when the encoding is
        # unknown or the wrapper cannot be built.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
88983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings on the fly.

        Strings written to the wrapper are taken to be in
        data_encoding and end up in the original file in
        file_encoding. Strings read through the wrapper come from the
        file in file_encoding and are handed to the caller in
        data_encoding. The intermediate format is usually Unicode but
        depends on the codecs involved.

        file_encoding defaults to data_encoding when it is not given.

        errors selects the error handling; the default 'strict'
        causes ValueErrors to be raised on encoding errors.

        The wrapper carries the attributes .data_encoding and
        .file_encoding, holding the values used, for introspection by
        Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend codec first, then backend codec
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors)
    # Record the encodings on the wrapper for introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
92583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
92683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh### Helpers for codec lookup
92783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def getencoder(encoding):

    """ Look up the codec registered for encoding and return its
        encoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
93783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def getdecoder(encoding):

    """ Look up the codec registered for encoding and return its
        decoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
94783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def getincrementalencoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalEncoder class or factory function.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
96183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def getincrementaldecoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalDecoder class or factory function.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
97583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def getreader(encoding):

    """ Look up the codec registered for encoding and return its
        StreamReader class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
98583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def getwriter(encoding):

    """ Look up the codec registered for encoding and return its
        StreamWriter class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
99583760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds each string from the iterator to an IncrementalEncoder for
    the given encoding and yields every non-empty result, followed by
    whatever the encoder still returns for the final call.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encoder = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            continue
        yield encoded
    # Final call: empty input with the "final" flag set
    tail = encoder.encode("", True)
    if tail:
        yield tail
101383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds each string from the iterator to an IncrementalDecoder for
    the given encoding and yields every non-empty result, followed by
    whatever the decoder still returns for the final call.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decoder = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            continue
        yield decoded
    # Final call: empty input with the "final" flag set
    tail = decoder.decode("", True)
    if tail:
        yield tail
103183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
103283760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh### Helpers for charmap-based codecs
103383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary that maps every element of the rng
        sequence to itself.

    """
    return dict((item, item) for item in rng)
104683760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
def make_encoding_map(decoding_map):

    """ Build an encoding map by inverting a decoding map.

        A target that occurs more than once in the decoding map is
        mapped to None (undefined mapping), which makes the charmap
        codec raise an exception when it meets that target during
        translation.

        cp875.py is one example: it decodes several characters to
        U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
106783760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
106883760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh### error handlers
106983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
try:
    # Fetch the standard error handlers, in order, from the registry
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error,
                                    ("strict",
                                     "ignore",
                                     "replace",
                                     "xmlcharrefreplace",
                                     "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
108383760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below sits behind a flag that is always false,
# so it is never executed when this module is loaded; it is only there
# to be seen by tools that scan the module for import statements.
# NOTE(review): presumably a variable is tested instead of a literal 0
# so that the branch is not dropped at compile time -- confirm.
_false = 0
if _false:
    import encodings
108983760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
109083760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh### Tests
109183760d213fb3bec7b4117d266fcfbf6fe2ba14abAndrew Hsieh
if __name__ == '__main__':

    # Data written to stdout is taken as Latin-1 and emitted as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data arriving on stdin is taken as Latin-1 and handed to the
    # program as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')
1099