1"""Functions that read and write gzipped files.
2
3The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8import struct, sys, time, os
9import zlib
10import io
11import __builtin__
12
__all__ = ["GzipFile","open"]

# Gzip member-header flag bits (RFC 1952, FLG field): file is probably
# ASCII text; a 16-bit header CRC follows; an extra field follows; the
# original filename follows; a comment follows.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Internal markers stored in GzipFile.mode to distinguish read/write streams.
READ, WRITE = 1, 2
18
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned int.

    The "<L" format emits the correct bit pattern regardless of whether
    the Python value is signed or unsigned.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
23
def read32(input):
    """Read 4 bytes from *input* and return them as a little-endian
    unsigned integer."""
    (value,) = struct.unpack("<I", input.read(4))
    return value
26
def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
35
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object this instance opened itself (when constructed from a
    # filename rather than a fileobj); closed again in close().  Stays
    # None when the caller supplied its own fileobj.
    myfileobj = None
    # Upper bound on the exponentially growing chunk size used by read().
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may includes the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        # Make sure we don't inadvertently enable universal newlines on the
        # underlying file object - in read mode, this causes data corruption.
        if mode:
            mode = mode.replace('U', '')
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            # Issue #13781: os.fdopen() creates a fileobj with a bogus name
            # attribute. Avoid saving this in the gzip header's filename field.
            if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            # No explicit mode: inherit the underlying file object's mode,
            # falling back to read-binary.
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits selects a raw deflate stream: the gzip header
            # and trailer are produced by this class, not by zlib.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError, "Mode " + mode + " not supported"

        self.fileobj = fileobj
        self.offset = 0     # current position in the uncompressed stream
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated; use the name attribute instead."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream write state: name, running CRC, byte count
        and (unused) write buffers."""
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffffL
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write a gzip member header (RFC 1952) to the underlying file."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        # Store only the basename, without any '.gz' suffix, in the header.
        fname = os.path.basename(self.name)
        if fname.endswith(".gz"):
            fname = fname[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        self.fileobj.write('\002')                 # extra flags (XFL)
        self.fileobj.write('\377')                 # OS byte (255 = unknown)
        if fname:
            self.fileobj.write(fname + '\000')     # null-terminated filename

    def _init_read(self):
        """Reset the per-member read state: running CRC and size counter."""
        self.crc = zlib.crc32("") & 0xffffffffL
        self.size = 0

    def _read_gzip_header(self):
        """Read and validate one gzip member header, discarding its
        optional fields.  Raises IOError on a bad magic number or an
        unknown compression method."""
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        """Compress and write data to the stream.  Returns the number of
        uncompressed bytes consumed.  Raises IOError if the file is
        read-only, ValueError if it is closed."""
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Read up to size uncompressed bytes; if size is negative, read
        until EOF.  Returns '' at end of stream."""
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        # Serve the request from the internal buffer and advance the
        # logical stream position.
        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back so the next read() returns it again.  The data is
        still present in extrabuf; only the bookkeeping counters move."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read one chunk of compressed data from the file and append the
        decompressed result to the internal buffer.  Raises EOFError when
        there is no more data."""
        if self.fileobj is None:
            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError, "Reached EOF"
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            # Raw deflate decompressor to match the raw stream produced by
            # the compressor (negative wbits).
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError, 'Reached EOF'

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to the internal buffer, updating the
        running CRC, buffer bookkeeping and uncompressed size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
        # Drop the already-consumed prefix of the buffer before appending.
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Validate the 8-byte gzip trailer (CRC32 and ISIZE) of the member
        just decompressed, then skip any zero padding that follows."""
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffffL):
            raise IOError, "Incorrect length of data produced"

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        # The file counts as closed once the underlying object is dropped.
        return self.fileobj is None

    def close(self):
        """Flush and close the stream.  In write mode this appends the
        gzip trailer (CRC32 and size mod 2**32) first.  Also closes the
        underlying file if this instance opened it.  Idempotent."""
        if self.fileobj is None:
            return
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            write32u(self.fileobj, self.crc)
            # self.size may exceed 2GB, or even 4GB
            write32u(self.fileobj, self.size & 0xffffffffL)
            self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data to the underlying file.  No-op in
        read mode (beyond the closed check)."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        # True only for streams opened for reading.
        return self.mode == READ

    def writable(self):
        # True only for streams opened for writing/appending.
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to offset in the uncompressed stream.  Only whence values
        0 (absolute) and 1 (relative) are supported; write mode allows
        forward seeks only (the gap is filled with zero bytes), and read
        mode emulates backward seeks by rewinding and re-reading."""
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read one line, up to size bytes if size is non-negative.  The
        trailing newline, if any, is included in the result."""
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        # Remember a larger starting chunk size for future calls (capped).
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line
477
478
479def _test():
480    # Act like gzip; with -d, act like gunzip.
481    # The input file is not deleted, however, nor are any other gzip
482    # options or features supported.
483    args = sys.argv[1:]
484    decompress = args and args[0] == "-d"
485    if decompress:
486        args = args[1:]
487    if not args:
488        args = ["-"]
489    for arg in args:
490        if decompress:
491            if arg == "-":
492                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
493                g = sys.stdout
494            else:
495                if arg[-3:] != ".gz":
496                    print "filename doesn't end in .gz:", repr(arg)
497                    continue
498                f = open(arg, "rb")
499                g = __builtin__.open(arg[:-3], "wb")
500        else:
501            if arg == "-":
502                f = sys.stdin
503                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
504            else:
505                f = __builtin__.open(arg, "rb")
506                g = open(arg + ".gz", "wb")
507        while True:
508            chunk = f.read(1024)
509            if not chunk:
510                break
511            g.write(chunk)
512        if g is not sys.stdout:
513            g.close()
514        if f is not sys.stdin:
515            f.close()
516
# Run the gzip/gunzip command-line emulation when executed as a script.
if __name__ == '__main__':
    _test()
519