codecs.py revision 3aeb632c3152fa082132ce55b9a880e0d16b04ae
1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import struct, __builtin__
11
12### Registry and builtin stateless codec functions
13
14try:
15    from _codecs import *
16except ImportError, why:
17    raise SystemError,\
18          'Failed to load the builtin codecs: %s' % why
19
# Names exported by "from codecs import *".  Every error handler
# shortcut bound at the bottom of this module is listed here,
# including backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
27
28### Constants
29
30#
31# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
34#
35
# BOM as encoded by UTF-8 (byte order does not apply)
BOM_UTF8 = '\xef\xbb\xbf'

# BOM as encoded by UTF-16 with an explicit byte order; the short
# BOM_LE / BOM_BE names refer to the very same string objects
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# BOM as encoded by UTF-32 with an explicit byte order
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# BOMs in the byte order of the running machine; '=' makes struct
# use native byte order with standard sizes (H = 2 bytes, L = 4 bytes)
BOM_UTF16 = struct.pack('=H', 0xFEFF)
BOM = BOM_UTF16
BOM_UTF32 = struct.pack('=L', 0x0000FEFF)

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks
BOM32_BE = BOM_UTF16_BE
BOM32_LE = BOM_UTF16_LE
BOM64_BE = BOM_UTF32_BE
BOM64_LE = BOM_UTF32_LE
62
63
64### Codec base classes (defining the API)
65
class Codec:

    """ Interface definition for stateless encoders and decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme.  The following string
        values are defined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs.

        This base class implements neither method; both raise
        NotImplementedError.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance;
            codecs which need state to encode efficiently should use
            StreamWriter instead.

            Zero length input must be accepted; an empty object of
            the output object type is returned in that case.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot, e.g. Python strings, buffer objects and
            memory mapped files.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance;
            codecs which need state to decode efficiently should use
            StreamReader instead.

            Zero length input must be accepted; an empty object of
            the output object type is returned in that case.

        """
        raise NotImplementedError()
122
123#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
128#
129
class StreamWriter(Codec):

    """ Codec which encodes objects and writes the result to a stream.

        The encoding itself is done by the .encode() method, which
        subclasses have to provide.  Attributes not defined on the
        writer are looked up on the wrapped stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is passed to .encode() together with
            self.errors; only the encoded data is written, the
            consumed length reported by the encoder is not used.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the stream itself has not been
            set on the instance.
        """
        # __getattr__ only runs when normal lookup failed.  If the
        # missing attribute is 'stream' itself (instance created
        # without running __init__), evaluating self.stream below
        # would call __getattr__ again without end.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
183
184###
185
class StreamReader(Codec):

    """ Codec which reads encoded data from a stream and decodes it.

        The decoding itself is done by the .decode() method, which
        subclasses have to provide.  Attributes not defined on the
        reader are looked up on the wrapped stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            If the size bytes read cannot be decoded, up to 10 more
            bytes are read one at a time and decoding is retried
            after each of them.  The ValueError from the decoder is
            re-raised when the stream is exhausted or the retries are
            used up.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        retries = 0
        while True:
            try:
                decoded = decode(data, self.errors)[0]
            except ValueError:
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made.  The limit
                # is checked before reading so that no byte is taken
                # from the stream only to be thrown away.
                retries = retries + 1
                if retries > 10:
                    raise
                newdata = read(1)
                if not newdata:
                    raise
                data = data + newdata
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]


    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.

        """
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(True)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base implementation does nothing.

        """
        pass

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration once .readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    # Same method under the other spelling of the iterator protocol.
    __next__ = next

    def __iter__(self):

        """ Return the reader itself; iterating yields decoded lines
            obtained through .next().
        """
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the stream itself has not been
            set on the instance.
        """
        # __getattr__ only runs when normal lookup failed.  If the
        # missing attribute is 'stream' itself (instance created
        # without running __init__), evaluating self.stream below
        # would call __getattr__ again without end.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
308
309###
310
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

        Reading is delegated to a reader object, writing to a writer
        object, both created on the same stream.  Attributes not
        defined here are looked up on the wrapped stream.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of the reader's .read(size).
        """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's .readline(size).
        """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's .readlines(sizehint).
        """
        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next line as produced by .readline().

            Raises StopIteration once .readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    # Same method under the other spelling of the iterator protocol.
    __next__ = next

    def __iter__(self):

        """ Return the wrapper itself; iterating yields the lines
            obtained through .next(), i.e. data which went through
            the reader rather than raw data from the stream.
        """
        return self

    def write(self, data):

        """ Pass data to the writer's .write() and return its result.
        """
        return self.writer.write(data)

    def writelines(self, list):

        """ Pass list to the writer's .writelines() and return its
            result.
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the stream itself has not been
            set on the instance.
        """
        # __getattr__ only runs when normal lookup failed.  If the
        # missing attribute is 'stream' itself (instance created
        # without running __init__), evaluating self.stream below
        # would call __getattr__ again without end.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
373
374###
375
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

        Attributes not defined here are looked up on the wrapped
        stream.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the reader's .read(size) and return the data
            after passing it through the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the reader's .readline() and return it
            after passing it through the frontend encode function.

            size is only handed to the reader if it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read via the reader's .read(), encode the result with the
            frontend encode function and return it split into a list
            of lines with the line ends kept.

            sizehint is only handed to the reader if it is not None.
        """
        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def next(self):

        """ Return the next line as produced by .readline().

            Raises StopIteration once .readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    # Same method under the other spelling of the iterator protocol.
    __next__ = next

    def __iter__(self):

        """ Return the recoder itself; iterating yields the recoded
            lines obtained through .next() rather than raw data from
            the stream.
        """
        return self

    def write(self, data):

        """ Decode data with the frontend decode function, pass the
            result to the writer's .write() and return its result.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the strings in list, decode the result with the
            frontend decode function, pass it to the writer's
            .write() and return its result.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the reader first, then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the stream itself has not been
            set on the instance.
        """
        # __getattr__ only runs when normal lookup failed.  If the
        # missing attribute is 'stream' itself (instance created
        # without running __init__), evaluating self.stream below
        # would call __getattr__ again without end.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
476
477### Shortcuts
478
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the encoding or creating the wrapper fails, the
        file opened here is closed again before the exception is
        passed on to the caller.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e, d, sr, sw) = lookup(encoding)
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # The caller never gets to see the file object, so nobody else
        # could close it: release it here, then re-raise unchanged.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
522
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates between two
        encodings and return the wrapper.

        Strings written to the wrapper are interpreted according to
        data_encoding and written to the original file as strings
        using file_encoding. The intermediate encoding will usually
        be Unicode but depends on the specified codecs.

        Strings read through the wrapper are read from the file using
        file_encoding and handed to the caller as strings using
        data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The wrapper carries two extra attributes, .data_encoding and
        .file_encoding, holding the values of the parameters of the
        same name for introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding

    # Frontend: the stateless encode/decode pair of the data encoding
    frontend = lookup(data_encoding)
    encode, decode = frontend[:2]

    # Backend: the stream reader/writer factories of the file encoding
    backend = lookup(file_encoding)
    Reader, Writer = backend[2:]

    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)

    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
559
560### Helpers for codec lookup
561
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (the first entry of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[0]
571
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (the second entry of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[1]
581
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (the third entry
        of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[2]
591
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (the fourth entry
        of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[3]
601
602### Helpers for charmap-based codecs
603
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both key and value of one entry.

    """
    # dict() accepts a list of (key, value) pairs
    return dict([(item, item) for item in rng])
616
def make_encoding_map(decoding_map):

    """ Invert a decoding map into an encoding map.

        Every value of decoding_map becomes a key of the result,
        mapped to the key it was stored under.  A value which occurs
        more than once in the decoding map is mapped to None
        (undefined mapping) instead, causing an exception when
        encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Second or later occurrence: the reverse mapping is ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
637
638### error handlers
639
# Module-level shortcuts for the error handling callbacks registered
# under the names below; each one is fetched once at import time.
backslashreplace_errors = lookup_error("backslashreplace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
replace_errors = lookup_error("replace")
ignore_errors = lookup_error("ignore")
strict_errors = lookup_error("strict")

# Tell modulefinder that using codecs probably needs the encodings
# package.  The condition is always false, so the import statement
# is present in the module but never executed.
_false = 0
if _false:
    import encodings
651
652### Tests
653
if __name__ == '__main__':

    import sys

    # Wrap stdin: data is read from it as Latin-1 and handed to the
    # program as UTF-8
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')

    # Wrap stdout: strings written by the program are taken to be
    # Latin-1 and end up on the real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
663