codecs.py revision 7f82f7955efb5ad32e142a3164341c53565c7df0
1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import struct, __builtin__
11
12### Registry and builtin stateless codec functions
13
14try:
15    from _codecs import *
16except ImportError, why:
17    raise SystemError,\
18          'Failed to load the builtin codecs: %s' % why
19
# Names exported by "from codecs import *".  register, lookup,
# register_error and lookup_error are not defined in this file; they are
# expected to be provided by the "from _codecs import *" above.
# All five predefined error handler callbacks bound at the end of this
# module are listed, including backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
27
28### Constants
29
30#
31# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
34#
35
# UTF-8 signature
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 with an explicit byte order
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'

# UTF-32 with an explicit byte order
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Short spellings of the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Native endianness variants: the '=' prefix asks struct for the byte
# order of the running machine, so the value is computed at import time
BOM_UTF16 = struct.pack('=H', 0xFEFF)
BOM = BOM_UTF16
BOM_UTF32 = struct.pack('=L', 0x0000FEFF)

# Old broken names (don't use in new code): despite the 32/64 in the
# names they are plain aliases for the UTF-16 and UTF-32 marks
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
62
63
64### Codec base classes (defining the API)
65
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument which
        selects the error handling scheme.  The following string
        values are predefined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and carry on with
                     the next one
         'replace' - substitute a suitable replacement character;
                     the builtin Unicode codecs use the official
                     U+FFFD REPLACEMENT CHARACTER when decoding and
                     '?' when encoding
         'xmlcharrefreplace' - substitute the appropriate XML
                               character reference (encoding only)
         'backslashreplace'  - substitute backslashed escape
                               sequences (encoding only)

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance;
            codecs which need state to work efficiently should use
            the StreamWriter/StreamReader classes instead.

            Zero length input must be accepted and has to result in
            an empty object of the output object type.

            This base implementation only raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object providing the bf_getreadbuf
            buffer slot; Python strings, buffer objects and memory
            mapped files are examples of such objects.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance;
            codecs which need state to work efficiently should use
            the StreamWriter/StreamReader classes instead.

            Zero length input must be accepted and has to result in
            an empty object of the output object type.

            This base implementation only raises NotImplementedError.

        """
        raise NotImplementedError
129
130#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
135#
136
class StreamWriter(Codec):

    """ Writes objects to a stream after passing them through the
        .encode() method.

        .encode() is inherited from Codec, where it only raises
        NotImplementedError, so a working writer has to provide its
        own implementation.  Attributes which are not found on the
        writer itself are looked up on the wrapped stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme which is handed
            to .encode() on every write.  Predefined values are:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the character and carry on with the next
             'replace'- substitute a suitable replacement character
             'xmlcharrefreplace' - substitute the appropriate XML
                                   character reference
             'backslashreplace'  - substitute backslashed escape
                                   sequences (only for encoding)

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object using self.errors and write the result to
            self.stream.  Returns None.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given strings with '' and hand the result to
            .write() in one piece.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state
            which allows appending fresh data without rescanning the
            whole stream to recover state.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the underlying stream for every attribute
            which is not defined on the writer.
        """
        target = self.stream
        return getattr(target, name)
196
197###
198
class StreamReader(Codec):

    """ Reads data from a stream and passes it through the .decode()
        method.

        .decode() is inherited from Codec, where it only raises
        NotImplementedError, so a working reader has to provide its
        own implementation.  Attributes which are not found on the
        reader itself are looked up on the wrapped stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Create a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            errors selects the error handling scheme which is handed
            to .decode() on every read.  Predefined values are:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the character and carry on with the next
             'replace'- substitute a suitable replacement character

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def read(self, size=-1):

        """ Decode data from self.stream and return the resulting
            object.

            size is the approximate maximum number of bytes to read
            from the stream for decoding; it exists so that huge
            files need not be decoded in one step.  A negative value
            (the default is -1) reads and decodes everything the
            stream returns.

            For non-negative sizes, size bytes are read first.  If
            decoding them raises a ValueError, one more byte is read
            and decoding is retried; this is repeated at most ten
            times.  The ValueError is re-raised once the stream has
            no more data or the retries are used up.

            Subclasses should use a greedy read strategy, i.e. read
            as much data as the encoding and the given size allow,
            including optional encoding endings or state markers.

        """
        # Unsliced reading: decode whatever the stream hands out
        if size < 0:
            everything = self.stream.read()
            return self.decode(everything, self.errors)[0]

        # Sliced reading
        fetch = self.stream.read
        decoder = self.decode
        pending = fetch(size)
        attempts = 0
        while 1:
            try:
                result, _consumed = decoder(pending, self.errors)
            except ValueError:
                # Slow, but should work under pretty much all
                # conditions: grow the input by a single byte and
                # try again, giving up after 10 retries
                attempts += 1
                extra = fetch(1)
                if not extra or attempts > 10:
                    raise
                pending = pending + extra
                continue
            return result

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike .readlines(), this method relies on the
            line breaking knowledge of the underlying stream's
            .readline() method -- line breaking by means of the
            codec decoder is currently not supported due to lack of
            line buffering.  Subclasses should, if possible,
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the
            stream's .readline() method.

        """
        readline = self.stream.readline
        if size is not None:
            raw = readline(size)
        else:
            raw = readline()
        decoded = self.decode(raw, self.errors)
        return decoded[0]


    def readlines(self, sizehint=None):

        """ Read the data available on the input stream, decode it
            and return it as list of lines.

            The data is decoded in one piece and then split with
            .splitlines(1), so the line ends are kept in the list
            entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.

        """
        fetch = self.stream.read
        if sizehint is not None:
            raw = fetch(sizehint)
        else:
            raw = fetch()
        text = self.decode(raw, self.errors)[0]
        return text.splitlines(1)

    def reset(self):

        """ Reset the codec buffers used for keeping state.

            No stream repositioning should take place.  The method
            is primarily meant for recovering from decoding errors.

            This base implementation does nothing.

        """
        pass

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when .readline() returns an empty
            result.
        """
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    def __iter__(self):
        # The reader is its own iterator; see .next()
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the underlying stream for every attribute
            which is not defined on the reader.
        """
        target = self.stream
        return getattr(target, name)
334
335###
336
class StreamReaderWriter:

    """ Wrapper for streams which are used in both read and write
        mode.

        An instance combines one reader and one writer object which
        share the same stream.  The design allows using the factory
        functions returned by the codec.lookup() function to
        construct the instance.

    """
    # Optional attribute, overwritten by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Create a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively.  Each of them is called with (stream,
            errors).

            Error handling works as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Return the result of the reader's .read(size)."""
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's .readline(size)."""
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's .readlines(sizehint)."""
        reader = self.reader
        return reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return reader.next()

    def __iter__(self):
        # Iteration is driven by .next(), which asks the reader
        return self

    def write(self, data):

        """ Return the result of the writer's .write(data)."""
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Return the result of the writer's .writelines(list)."""
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first and then the writer."""
        for part in (self.reader, self.writer):
            part.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the underlying stream for every attribute
            which is not defined on the wrapper.
        """
        target = self.stream
        return getattr(target, name)
407
408###
409
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data handed to .write()/.writelines() is first passed through
        the frontend decode function, producing an intermediate
        format (which depends on the given codec combination), and is
        then written to the stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the stream using a
        Reader instance, passed through the frontend encode function
        and the encoded result is returned to the caller.  This holds
        for .read(), .readline(), .readlines() and for iteration via
        .next().

    """
    # Optional attributes, overwritten by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data seen by callers of .read() and .write()) while
            Reader and Writer work on the backend (reading from and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, i.e.
            take (input, errors) and return a tuple (output, length
            consumed).  Reader, Writer must be factory functions or
            classes providing the StreamReader, StreamWriter interface
            resp.; each is called with (stream, errors).

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader's .read(size) and return the
            data after frontend encoding.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it after
            frontend encoding.

            size, if given, is passed to the reader's .readline().
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read via the backend reader's .read(), encode the data in
            one piece and return it split into lines (line ends are
            kept).

            sizehint, if given, is passed as size to the reader's
            .read().
        """
        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream after frontend
            encoding.

            A StopIteration raised by the reader's .next() is passed
            on unchanged.
        """
        data = self.reader.next()
        # The line has to go through the frontend encoder just like
        # the results of .read()/.readline()/.readlines(); otherwise
        # iteration would hand out the intermediate format instead of
        # the recoded data.
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        # The recoder is its own iterator; see .next()
        return self

    def write(self, data):

        """ Frontend-decode data and write the result using the
            backend writer.  Returns the writer's .write() result.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the given strings with '', frontend-decode the
            result in one piece and write it using the backend
            writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader first and then the backend
            writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
518
519### Shortcuts
520
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None (the default), the plain file object
        returned by the builtin open() is handed back without any
        wrapping.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only set by this function if an encoding was
        specified as parameter.

        Raises whatever the builtin open() raises if the file cannot
        be opened, and whatever lookup() raises for an unknown
        encoding; in the latter case the file is closed again before
        the exception is passed on.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e, d, sr, sw) = lookup(encoding)
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # The caller never gets to see the file object in this case,
        # so close it here instead of leaking the open file, then
        # let the original exception continue.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
564
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates between two
        encodings and return the wrapper.

        Strings written to the wrapper are interpreted according to
        data_encoding and are then written to the original file as
        strings using file_encoding.  The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Strings read from the wrapper are read from the file using
        file_encoding and are handed to the caller as strings using
        data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The wrapper carries the two extra attributes .data_encoding
        and .file_encoding which reflect the given parameters of the
        same name and can be used for introspection by Python
        programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: stateless functions of the data encoding
    frontend_encode, frontend_decode = lookup(data_encoding)[:2]
    # Backend: stream factories of the file encoding
    backend_reader, backend_writer = lookup(file_encoding)[2:]
    recoder = StreamRecoder(file, frontend_encode, frontend_decode,
                            backend_reader, backend_writer, errors)
    # Record the encodings on the wrapper to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
601
602### Helpers for codec lookup
603
def getencoder(encoding):

    """ Look up the codec for the given encoding and return its
        encoder function, i.e. the first entry of the lookup()
        result.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[0]
613
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return its
        decoder function, i.e. the second entry of the lookup()
        result.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[1]
623
def getreader(encoding):

    """ Look up the codec for the given encoding and return its
        StreamReader class or factory function, i.e. the third
        entry of the lookup() result.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[2]
633
def getwriter(encoding):

    """ Look up the codec for the given encoding and return its
        StreamWriter class or factory function, i.e. the fourth
        entry of the lookup() result.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[3]
643
644### Helpers for charmap-based codecs
645
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a new dictionary in which every element of the rng
        sequence is used as key and as the value stored under that
        key.

    """
    return dict([(item, item) for item in rng])
658
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        Every value of decoding_map becomes a key which maps back to
        the original key.  If a value occurs more than once in the
        decoding map, it is mapped to None (undefined mapping)
        instead, causing an exception when encountered by the
        charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
679
680### error handlers
681
# Bind the callback of each predefined error handling scheme to a
# module-level name.  lookup_error is not defined in this file; it is
# expected to be provided by the "from _codecs import *" at the top.
strict_errors = lookup_error("strict")
ignore_errors = lookup_error("ignore")
replace_errors = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")

# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below is never executed because _false is 0;
# NOTE(review): a variable is presumably used instead of a literal 0 so
# that the import statement stays visible to tools -- confirm.
_false = 0
if _false:
    import encodings
693
694### Tests
695
if __name__ == '__main__':

    import sys

    # Strings written to stdout are taken to be Latin-1 and end up as
    # UTF-8 on the real stdout
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data on the real stdin is taken to be Latin-1 and is handed to
    # readers of sys.stdin as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')
705