codecs.py revision e99d5ea25ba994491c773d9b5872332334ccd1c5
1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import struct,types,__builtin__
11
12### Registry and builtin stateless codec functions
13
14try:
15    from _codecs import *
16except ImportError,why:
17    raise SystemError,\
18          'Failed to load the builtin codecs: %s' % why
19
__all__ = [
    "register", "lookup", "open", "EncodedFile",
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
]

### Constants

#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#

# U+FEFF packed as an unsigned 16-bit value in the byte order of the
# machine running this module; the result equals either BOM_BE or
# BOM_LE.
BOM = struct.pack('=H', 0xFEFF)

# U+FEFF (ZERO WIDTH NO-BREAK SPACE) as two bytes, most significant
# byte first (big endian).
BOM_BE = '\376\377'
BOM32_BE = BOM_BE

# U+FEFF as two bytes, least significant byte first (little endian).
# A reader assuming big endian order would see these bytes as U+FFFE,
# which is not a legal Unicode character -- that is what makes the
# byte order detectable.
BOM_LE = '\377\376'
BOM32_LE = BOM_LE

#
# Four byte wide Byte Order Marks
#
# NOTE(review): the "32" and "64" in the constant names do not match
# the width of the values (two and four bytes respectively); the names
# are part of the public API and are kept unchanged.
#

# U+0000FEFF as four bytes, big endian (UCS-4).
BOM64_BE = '\000\000\376\377'

# U+0000FEFF as four bytes, little endian (UCS-4).
BOM64_LE = '\377\376\000\000'
44
45
46### Codec base classes (defining the API)
47
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. The following string
        values are defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and continue with
                     the next one
         'replace' - substitute a suitable replacement character;
                     Python uses the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

        This base class only defines the API: both methods raise
        NotImplementedError and have to be overridden by subclasses.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input.

            Returns a tuple (output object, length consumed).

            errors selects the error handling scheme; the default is
            'strict'.

            Implementations must not keep state in the Codec
            instance. Codecs which need state to work efficiently
            should be built on the stream classes (StreamWriter,
            StreamReader) instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input.

            Returns a tuple (output object, length consumed).

            input must provide the bf_getreadbuf buffer slot; Python
            strings, buffer objects and memory mapped files are
            examples of such objects.

            errors selects the error handling scheme; the default is
            'strict'.

            Implementations must not keep state in the Codec
            instance. Codecs which need state to work efficiently
            should be built on the stream classes (StreamWriter,
            StreamReader) instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError
104
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which make it easy to implement new encoding submodules.
# See encodings/utf_8.py for an example of how this is done.
#
111
class StreamWriter(Codec):

    """ Encodes objects and writes the encoded data to a wrapped
        stream.

        The class itself does not implement .encode(); subclasses are
        expected to provide it (see Codec). Attributes which are not
        defined here are looked up on the wrapped stream.

    """
    def __init__(self,stream,errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is passed through self.encode() using the
            error handling given at construction time; only the
            encoded data is written, the consumed length is ignored.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute (e.g. because __init__ was never run).
        """
        # __getattr__ is only called when normal lookup failed. If the
        # missing attribute is 'stream' itself, evaluating self.stream
        # below would call __getattr__ again and recurse without bound.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
166
167###
168
class StreamReader(Codec):

    """ Reads data from a wrapped stream and returns it decoded.

        The class itself does not implement .decode(); subclasses are
        expected to provide it (see Codec). Attributes which are not
        defined here are looked up on the wrapped stream.

    """
    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            If decoding the data read so far raises a ValueError, up
            to 10 additional bytes are read one at a time and decoding
            is retried after each of them. The ValueError is re-raised
            when the stream is exhausted or the retries are used up.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        tries = 0
        while 1:
            try:
                object, decodedbytes = decode(data, self.errors)
            except ValueError:
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made
                tries = tries + 1
                if tries > 10:
                    # Give up before reading: a byte fetched now would
                    # be removed from the stream and then thrown away.
                    raise
                newdata = read(1)
                if not newdata:
                    raise
                data = data + newdata
            else:
                return object

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line,self.errors)[0]


    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given and positive, is passed as size
            argument to the stream's .read() method. If it is omitted,
            None, zero or negative, the stream is read until EOF.

        """
        # The default used to be 0, which is not None and therefore
        # ended up as stream.read(0): a plain .readlines() call always
        # returned an empty list.
        if sizehint is None or sizehint <= 0:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data,self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base implementation does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute (e.g. because __init__ was never run).
        """
        # __getattr__ is only called when normal lookup failed. If the
        # missing attribute is 'stream' itself, evaluating self.stream
        # below would call __getattr__ again and recurse without bound.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
292
293###
294
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

        Reading is delegated to a reader object, writing to a writer
        object; both are created on top of the same stream. All other
        attributes are looked up on the stream itself.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self,stream,Reader,Writer,errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Returns the result of the reader's .read(size).
        """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Returns the result of the reader's .readline(size).
        """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Returns the result of the reader's .readlines(sizehint).
        """
        return self.reader.readlines(sizehint)

    def write(self,data):

        """ Passes data to the writer's .write() and returns its
            result.
        """
        return self.writer.write(data)

    def writelines(self,list):

        """ Passes list to the writer's .writelines() and returns its
            result.
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Resets the reader first, then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute (e.g. because __init__ was never run).
        """
        # __getattr__ is only called when normal lookup failed. If the
        # missing attribute is 'stream' itself, evaluating self.stream
        # below would call __getattr__ again and recurse without bound.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
358
359###
360
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. The
            intermediate format is whatever the given codecs agree
            on (usually Unicode).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Reads via the backend reader and returns the data passed
            through the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self,size=None):

        """ Reads one line via the backend reader and returns it
            passed through the frontend encoder.

            size, if given, is passed on to the reader's .readline().
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self,sizehint=None):

        """ Reads via the backend reader, passes the data through
            the frontend encoder and returns it as list of lines
            (line breaks included).

            sizehint, if given, is passed as size to the reader's
            .read().
        """
        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def write(self,data):

        """ Passes data through the frontend decoder and writes the
            result using the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self,list):

        """ Joins the strings in list, passes the result through the
            frontend decoder and writes it using the backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Resets the backend reader first, then the backend writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute (e.g. because __init__ was never run).
        """
        # __getattr__ is only called when normal lookup failed. If the
        # missing attribute is 'stream' itself, evaluating self.stream
        # below would call __getattr__ again and recurse without bound.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
461
462### Shortcuts
463
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is passed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        The codec is looked up before the file is opened: an unknown
        encoding raises the lookup error without creating or
        truncating the file.

    """
    if encoding is None:
        return __builtin__.open(filename, mode, buffering)

    # Resolve the codec first. Opening the file in a write mode
    # truncates it, which must not happen if the call is going to
    # fail because of a bad encoding name.
    codec = lookup(encoding)
    Reader, Writer = codec[2], codec[3]

    if 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)

    srw = None
    try:
        srw = StreamReaderWriter(stream, Reader, Writer, errors)
    finally:
        # Do not leak the open file if the wrapper could not be built
        if srw is None:
            stream.close()

    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
507
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in an object which transparently translates
        between two encodings.

        Strings written to the wrapper are taken to be encoded in
        data_encoding; they are decoded and then written to file
        using file_encoding. The intermediate format depends on the
        codecs involved and will usually be Unicode.

        Strings read through the wrapper are read from file using
        file_encoding and handed back to the caller encoded in
        data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors selects the error handling. The default 'strict'
        causes ValueErrors to be raised in case an encoding error
        occurs.

        For introspection, the wrapper carries the two attributes
        .data_encoding and .file_encoding holding the values of the
        parameters of the same name.

    """
    if file_encoding is None:
        file_encoding = data_encoding

    # The frontend uses the stateless encode/decode functions, the
    # backend the stream reader/writer factories of the lookup result.
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            frontend[0], frontend[1],
                            backend[2], backend[3],
                            errors)

    # Record the encodings on the wrapper
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
544
545### Helpers for charmap-based codecs
546
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary which has every element of the rng
        sequence as key, with the element itself as value.

    """
    return dict([(item, item) for item in rng])
559
560### Tests
561
if __name__ == '__main__':

    import sys

    # Data written to stdout is taken to be Latin-1 and reaches the
    # real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data on the real stdin is taken to be Latin-1 and is handed to
    # the program as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')
571