1# -*- coding: iso-8859-1 -*-
2#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
5# Copyright (C) 2002 Lars Gust�bel <lars@gustaebel.de>
6# All rights reserved.
7#
8# Permission  is  hereby granted,  free  of charge,  to  any person
9# obtaining a  copy of  this software  and associated documentation
10# files  (the  "Software"),  to   deal  in  the  Software   without
11# restriction,  including  without limitation  the  rights to  use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies  of  the  Software,  and to  permit  persons  to  whom the
14# Software  is  furnished  to  do  so,  subject  to  the  following
15# conditions:
16#
17# The above copyright  notice and this  permission notice shall  be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS  IS", WITHOUT WARRANTY OF ANY  KIND,
21# EXPRESS OR IMPLIED, INCLUDING  BUT NOT LIMITED TO  THE WARRANTIES
22# OF  MERCHANTABILITY,  FITNESS   FOR  A  PARTICULAR   PURPOSE  AND
23# NONINFRINGEMENT.  IN  NO  EVENT SHALL  THE  AUTHORS  OR COPYRIGHT
24# HOLDERS  BE LIABLE  FOR ANY  CLAIM, DAMAGES  OR OTHER  LIABILITY,
25# WHETHER  IN AN  ACTION OF  CONTRACT, TORT  OR OTHERWISE,  ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
32__version__ = "$Revision: 85213 $"
33# $Source$
34
35version     = "0.9.0"
36__author__  = "Lars Gust�bel (lars@gustaebel.de)"
37__date__    = "$Date$"
38__cvsid__   = "$Id$"
39__credits__ = "Gustavo Niemeyer, Niels Gust�bel, Richard Townsend."
40
41#---------
42# Imports
43#---------
44from __builtin__ import open as bltn_open
45import sys
46import os
47import shutil
48import stat
49import errno
50import time
51import struct
52import copy
53import re
54import operator
55
56try:
57    import grp, pwd
58except ImportError:
59    grp = pwd = None
60
61# from tarfile import *
62__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]
63
64#---------------------------------------------------------
65# tar constants
66#---------------------------------------------------------
67NUL = "\0"                      # the null character
68BLOCKSIZE = 512                 # length of processing blocks
69RECORDSIZE = BLOCKSIZE * 20     # length of records
70GNU_MAGIC = "ustar  \0"         # magic gnu tar string
71POSIX_MAGIC = "ustar\x0000"     # magic posix tar string
72
73LENGTH_NAME = 100               # maximum length of a filename
74LENGTH_LINK = 100               # maximum length of a linkname
75LENGTH_PREFIX = 155             # maximum length of the prefix field
76
77REGTYPE = "0"                   # regular file
78AREGTYPE = "\0"                 # regular file
79LNKTYPE = "1"                   # link (inside tarfile)
80SYMTYPE = "2"                   # symbolic link
81CHRTYPE = "3"                   # character special device
82BLKTYPE = "4"                   # block special device
83DIRTYPE = "5"                   # directory
84FIFOTYPE = "6"                  # fifo special device
85CONTTYPE = "7"                  # contiguous file
86
87GNUTYPE_LONGNAME = "L"          # GNU tar longname
88GNUTYPE_LONGLINK = "K"          # GNU tar longlink
89GNUTYPE_SPARSE = "S"            # GNU tar sparse file
90
91XHDTYPE = "x"                   # POSIX.1-2001 extended header
92XGLTYPE = "g"                   # POSIX.1-2001 global header
93SOLARIS_XHDTYPE = "X"           # Solaris extended header
94
95USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
96GNU_FORMAT = 1                  # GNU tar format
97PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
98DEFAULT_FORMAT = GNU_FORMAT
99
100#---------------------------------------------------------
101# tarfile constants
102#---------------------------------------------------------
103# File types that tarfile supports:
104SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
105                   SYMTYPE, DIRTYPE, FIFOTYPE,
106                   CONTTYPE, CHRTYPE, BLKTYPE,
107                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
108                   GNUTYPE_SPARSE)
109
110# File types that will be treated as a regular file.
111REGULAR_TYPES = (REGTYPE, AREGTYPE,
112                 CONTTYPE, GNUTYPE_SPARSE)
113
114# File types that are part of the GNU tar format.
115GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
116             GNUTYPE_SPARSE)
117
118# Fields from a pax header that override a TarInfo attribute.
119PAX_FIELDS = ("path", "linkpath", "size", "mtime",
120              "uid", "gid", "uname", "gname")
121
122# Fields in a pax header that are numbers, all other fields
123# are treated as strings.
124PAX_NUMBER_FIELDS = {
125    "atime": float,
126    "ctime": float,
127    "mtime": float,
128    "uid": int,
129    "gid": int,
130    "size": int
131}
132
133#---------------------------------------------------------
134# Bits used in the mode field, values in octal.
135#---------------------------------------------------------
136S_IFLNK = 0120000        # symbolic link
137S_IFREG = 0100000        # regular file
138S_IFBLK = 0060000        # block device
139S_IFDIR = 0040000        # directory
140S_IFCHR = 0020000        # character device
141S_IFIFO = 0010000        # fifo
142
143TSUID   = 04000          # set UID on execution
144TSGID   = 02000          # set GID on execution
145TSVTX   = 01000          # reserved
146
147TUREAD  = 0400           # read by owner
148TUWRITE = 0200           # write by owner
149TUEXEC  = 0100           # execute/search by owner
150TGREAD  = 0040           # read by group
151TGWRITE = 0020           # write by group
152TGEXEC  = 0010           # execute/search by group
153TOREAD  = 0004           # read by other
154TOWRITE = 0002           # write by other
155TOEXEC  = 0001           # execute/search by other
156
157#---------------------------------------------------------
158# initialization
159#---------------------------------------------------------
160ENCODING = sys.getfilesystemencoding()
161if ENCODING is None:
162    ENCODING = sys.getdefaultencoding()
163
164#---------------------------------------------------------
165# Some useful functions
166#---------------------------------------------------------
167
168def stn(s, length):
169    """Convert a python string to a null-terminated string buffer.
170    """
171    return s[:length] + (length - len(s)) * NUL
172
173def nts(s):
174    """Convert a null-terminated string field to a python string.
175    """
176    # Use the string up to the first null char.
177    p = s.find("\0")
178    if p == -1:
179        return s
180    return s[:p]
181
182def nti(s):
183    """Convert a number field to a python number.
184    """
185    # There are two possible encodings for a number field, see
186    # itn() below.
187    if s[0] != chr(0200):
188        try:
189            n = int(nts(s).strip() or "0", 8)
190        except ValueError:
191            raise InvalidHeaderError("invalid header")
192    else:
193        n = 0L
194        for i in xrange(len(s) - 1):
195            n <<= 8
196            n += ord(s[i + 1])
197    return n
198
199def itn(n, digits=8, format=DEFAULT_FORMAT):
200    """Convert a python number to a number field.
201    """
202    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
203    # octal digits followed by a null-byte, this allows values up to
204    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
205    # that if necessary. A leading 0200 byte indicates this particular
206    # encoding, the following digits-1 bytes are a big-endian
207    # representation. This allows values up to (256**(digits-1))-1.
208    if 0 <= n < 8 ** (digits - 1):
209        s = "%0*o" % (digits - 1, n) + NUL
210    else:
211        if format != GNU_FORMAT or n >= 256 ** (digits - 1):
212            raise ValueError("overflow in number field")
213
214        if n < 0:
215            # XXX We mimic GNU tar's behaviour with negative numbers,
216            # this could raise OverflowError.
217            n = struct.unpack("L", struct.pack("l", n))[0]
218
219        s = ""
220        for i in xrange(digits - 1):
221            s = chr(n & 0377) + s
222            n >>= 8
223        s = chr(0200) + s
224    return s
225
226def uts(s, encoding, errors):
227    """Convert a unicode object to a string.
228    """
229    if errors == "utf-8":
230        # An extra error handler similar to the -o invalid=UTF-8 option
231        # in POSIX.1-2001. Replace untranslatable characters with their
232        # UTF-8 representation.
233        try:
234            return s.encode(encoding, "strict")
235        except UnicodeEncodeError:
236            x = []
237            for c in s:
238                try:
239                    x.append(c.encode(encoding, "strict"))
240                except UnicodeEncodeError:
241                    x.append(c.encode("utf8"))
242            return "".join(x)
243    else:
244        return s.encode(encoding, errors)
245
246def calc_chksums(buf):
247    """Calculate the checksum for a member's header by summing up all
248       characters except for the chksum field which is treated as if
249       it was filled with spaces. According to the GNU tar sources,
250       some tars (Sun and NeXT) calculate chksum with signed char,
251       which will be different if there are chars in the buffer with
252       the high bit set. So we calculate two checksums, unsigned and
253       signed.
254    """
255    unsigned_chksum = 256 + sum(struct.unpack("148B", buf[:148]) + struct.unpack("356B", buf[156:512]))
256    signed_chksum = 256 + sum(struct.unpack("148b", buf[:148]) + struct.unpack("356b", buf[156:512]))
257    return unsigned_chksum, signed_chksum
258
259def copyfileobj(src, dst, length=None):
260    """Copy length bytes from fileobj src to fileobj dst.
261       If length is None, copy the entire content.
262    """
263    if length == 0:
264        return
265    if length is None:
266        shutil.copyfileobj(src, dst)
267        return
268
269    BUFSIZE = 16 * 1024
270    blocks, remainder = divmod(length, BUFSIZE)
271    for b in xrange(blocks):
272        buf = src.read(BUFSIZE)
273        if len(buf) < BUFSIZE:
274            raise IOError("end of file reached")
275        dst.write(buf)
276
277    if remainder != 0:
278        buf = src.read(remainder)
279        if len(buf) < remainder:
280            raise IOError("end of file reached")
281        dst.write(buf)
282    return
283
284filemode_table = (
285    ((S_IFLNK,      "l"),
286     (S_IFREG,      "-"),
287     (S_IFBLK,      "b"),
288     (S_IFDIR,      "d"),
289     (S_IFCHR,      "c"),
290     (S_IFIFO,      "p")),
291
292    ((TUREAD,       "r"),),
293    ((TUWRITE,      "w"),),
294    ((TUEXEC|TSUID, "s"),
295     (TSUID,        "S"),
296     (TUEXEC,       "x")),
297
298    ((TGREAD,       "r"),),
299    ((TGWRITE,      "w"),),
300    ((TGEXEC|TSGID, "s"),
301     (TSGID,        "S"),
302     (TGEXEC,       "x")),
303
304    ((TOREAD,       "r"),),
305    ((TOWRITE,      "w"),),
306    ((TOEXEC|TSVTX, "t"),
307     (TSVTX,        "T"),
308     (TOEXEC,       "x"))
309)
310
311def filemode(mode):
312    """Convert a file's mode to a string of the form
313       -rwxrwxrwx.
314       Used by TarFile.list()
315    """
316    perm = []
317    for table in filemode_table:
318        for bit, char in table:
319            if mode & bit == bit:
320                perm.append(char)
321                break
322        else:
323            perm.append("-")
324    return "".join(perm)
325
326class TarError(Exception):
327    """Base exception."""
328    pass
329class ExtractError(TarError):
330    """General exception for extract errors."""
331    pass
332class ReadError(TarError):
333    """Exception for unreadable tar archives."""
334    pass
335class CompressionError(TarError):
336    """Exception for unavailable compression methods."""
337    pass
338class StreamError(TarError):
339    """Exception for unsupported operations on stream-like TarFiles."""
340    pass
341class HeaderError(TarError):
342    """Base exception for header errors."""
343    pass
344class EmptyHeaderError(HeaderError):
345    """Exception for empty headers."""
346    pass
347class TruncatedHeaderError(HeaderError):
348    """Exception for truncated headers."""
349    pass
350class EOFHeaderError(HeaderError):
351    """Exception for end of file headers."""
352    pass
353class InvalidHeaderError(HeaderError):
354    """Exception for invalid headers."""
355    pass
356class SubsequentHeaderError(HeaderError):
357    """Exception for missing and invalid extended headers."""
358    pass
359
360#---------------------------
361# internal stream interface
362#---------------------------
363class _LowLevelFile:
364    """Low-level file object. Supports reading and writing.
365       It is used instead of a regular file object for streaming
366       access.
367    """
368
369    def __init__(self, name, mode):
370        mode = {
371            "r": os.O_RDONLY,
372            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
373        }[mode]
374        if hasattr(os, "O_BINARY"):
375            mode |= os.O_BINARY
376        self.fd = os.open(name, mode, 0666)
377
378    def close(self):
379        os.close(self.fd)
380
381    def read(self, size):
382        return os.read(self.fd, size)
383
384    def write(self, s):
385        os.write(self.fd, s)
386
387class _Stream:
388    """Class that serves as an adapter between TarFile and
389       a stream-like object.  The stream-like object only
390       needs to have a read() or write() method and is accessed
391       blockwise.  Use of gzip or bzip2 compression is possible.
392       A stream-like object could be for example: sys.stdin,
393       sys.stdout, a socket, a tape device etc.
394
395       _Stream is intended to be used only internally.
396    """
397
398    def __init__(self, name, mode, comptype, fileobj, bufsize):
399        """Construct a _Stream object.
400        """
401        self._extfileobj = True
402        if fileobj is None:
403            fileobj = _LowLevelFile(name, mode)
404            self._extfileobj = False
405
406        if comptype == '*':
407            # Enable transparent compression detection for the
408            # stream interface
409            fileobj = _StreamProxy(fileobj)
410            comptype = fileobj.getcomptype()
411
412        self.name     = name or ""
413        self.mode     = mode
414        self.comptype = comptype
415        self.fileobj  = fileobj
416        self.bufsize  = bufsize
417        self.buf      = ""
418        self.pos      = 0L
419        self.closed   = False
420
421        try:
422            if comptype == "gz":
423                try:
424                    import zlib
425                except ImportError:
426                    raise CompressionError("zlib module is not available")
427                self.zlib = zlib
428                self.crc = zlib.crc32("") & 0xffffffffL
429                if mode == "r":
430                    self._init_read_gz()
431                else:
432                    self._init_write_gz()
433
434            elif comptype == "bz2":
435                try:
436                    import bz2
437                except ImportError:
438                    raise CompressionError("bz2 module is not available")
439                if mode == "r":
440                    self.dbuf = ""
441                    self.cmp = bz2.BZ2Decompressor()
442                else:
443                    self.cmp = bz2.BZ2Compressor()
444        except:
445            if not self._extfileobj:
446                self.fileobj.close()
447            self.closed = True
448            raise
449
450    def __del__(self):
451        if hasattr(self, "closed") and not self.closed:
452            self.close()
453
454    def _init_write_gz(self):
455        """Initialize for writing with gzip compression.
456        """
457        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
458                                            -self.zlib.MAX_WBITS,
459                                            self.zlib.DEF_MEM_LEVEL,
460                                            0)
461        timestamp = struct.pack("<L", long(time.time()))
462        self.__write("\037\213\010\010%s\002\377" % timestamp)
463        if type(self.name) is unicode:
464            self.name = self.name.encode("iso-8859-1", "replace")
465        if self.name.endswith(".gz"):
466            self.name = self.name[:-3]
467        self.__write(self.name + NUL)
468
469    def write(self, s):
470        """Write string s to the stream.
471        """
472        if self.comptype == "gz":
473            self.crc = self.zlib.crc32(s, self.crc) & 0xffffffffL
474        self.pos += len(s)
475        if self.comptype != "tar":
476            s = self.cmp.compress(s)
477        self.__write(s)
478
479    def __write(self, s):
480        """Write string s to the stream if a whole new block
481           is ready to be written.
482        """
483        self.buf += s
484        while len(self.buf) > self.bufsize:
485            self.fileobj.write(self.buf[:self.bufsize])
486            self.buf = self.buf[self.bufsize:]
487
488    def close(self):
489        """Close the _Stream object. No operation should be
490           done on it afterwards.
491        """
492        if self.closed:
493            return
494
495        self.closed = True
496        try:
497            if self.mode == "w" and self.comptype != "tar":
498                self.buf += self.cmp.flush()
499
500            if self.mode == "w" and self.buf:
501                self.fileobj.write(self.buf)
502                self.buf = ""
503                if self.comptype == "gz":
504                    # The native zlib crc is an unsigned 32-bit integer, but
505                    # the Python wrapper implicitly casts that to a signed C
506                    # long.  So, on a 32-bit box self.crc may "look negative",
507                    # while the same crc on a 64-bit box may "look positive".
508                    # To avoid irksome warnings from the `struct` module, force
509                    # it to look positive on all boxes.
510                    self.fileobj.write(struct.pack("<L", self.crc & 0xffffffffL))
511                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFFL))
512        finally:
513            if not self._extfileobj:
514                self.fileobj.close()
515
516    def _init_read_gz(self):
517        """Initialize for reading a gzip compressed fileobj.
518        """
519        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
520        self.dbuf = ""
521
522        # taken from gzip.GzipFile with some alterations
523        if self.__read(2) != "\037\213":
524            raise ReadError("not a gzip file")
525        if self.__read(1) != "\010":
526            raise CompressionError("unsupported compression method")
527
528        flag = ord(self.__read(1))
529        self.__read(6)
530
531        if flag & 4:
532            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
533            self.read(xlen)
534        if flag & 8:
535            while True:
536                s = self.__read(1)
537                if not s or s == NUL:
538                    break
539        if flag & 16:
540            while True:
541                s = self.__read(1)
542                if not s or s == NUL:
543                    break
544        if flag & 2:
545            self.__read(2)
546
547    def tell(self):
548        """Return the stream's file pointer position.
549        """
550        return self.pos
551
552    def seek(self, pos=0):
553        """Set the stream's file pointer to pos. Negative seeking
554           is forbidden.
555        """
556        if pos - self.pos >= 0:
557            blocks, remainder = divmod(pos - self.pos, self.bufsize)
558            for i in xrange(blocks):
559                self.read(self.bufsize)
560            self.read(remainder)
561        else:
562            raise StreamError("seeking backwards is not allowed")
563        return self.pos
564
565    def read(self, size=None):
566        """Return the next size number of bytes from the stream.
567           If size is not defined, return all bytes of the stream
568           up to EOF.
569        """
570        if size is None:
571            t = []
572            while True:
573                buf = self._read(self.bufsize)
574                if not buf:
575                    break
576                t.append(buf)
577            buf = "".join(t)
578        else:
579            buf = self._read(size)
580        self.pos += len(buf)
581        return buf
582
583    def _read(self, size):
584        """Return size bytes from the stream.
585        """
586        if self.comptype == "tar":
587            return self.__read(size)
588
589        c = len(self.dbuf)
590        t = [self.dbuf]
591        while c < size:
592            buf = self.__read(self.bufsize)
593            if not buf:
594                break
595            try:
596                buf = self.cmp.decompress(buf)
597            except IOError:
598                raise ReadError("invalid compressed data")
599            t.append(buf)
600            c += len(buf)
601        t = "".join(t)
602        self.dbuf = t[size:]
603        return t[:size]
604
605    def __read(self, size):
606        """Return size bytes from stream. If internal buffer is empty,
607           read another block from the stream.
608        """
609        c = len(self.buf)
610        t = [self.buf]
611        while c < size:
612            buf = self.fileobj.read(self.bufsize)
613            if not buf:
614                break
615            t.append(buf)
616            c += len(buf)
617        t = "".join(t)
618        self.buf = t[size:]
619        return t[:size]
620# class _Stream
621
622class _StreamProxy(object):
623    """Small proxy class that enables transparent compression
624       detection for the Stream interface (mode 'r|*').
625    """
626
627    def __init__(self, fileobj):
628        self.fileobj = fileobj
629        self.buf = self.fileobj.read(BLOCKSIZE)
630
631    def read(self, size):
632        self.read = self.fileobj.read
633        return self.buf
634
635    def getcomptype(self):
636        if self.buf.startswith("\037\213\010"):
637            return "gz"
638        if self.buf[0:3] == "BZh" and self.buf[4:10] == "1AY&SY":
639            return "bz2"
640        return "tar"
641
642    def close(self):
643        self.fileobj.close()
644# class StreamProxy
645
646class _BZ2Proxy(object):
647    """Small proxy class that enables external file object
648       support for "r:bz2" and "w:bz2" modes. This is actually
649       a workaround for a limitation in bz2 module's BZ2File
650       class which (unlike gzip.GzipFile) has no support for
651       a file object argument.
652    """
653
654    blocksize = 16 * 1024
655
656    def __init__(self, fileobj, mode):
657        self.fileobj = fileobj
658        self.mode = mode
659        self.name = getattr(self.fileobj, "name", None)
660        self.init()
661
662    def init(self):
663        import bz2
664        self.pos = 0
665        if self.mode == "r":
666            self.bz2obj = bz2.BZ2Decompressor()
667            self.fileobj.seek(0)
668            self.buf = ""
669        else:
670            self.bz2obj = bz2.BZ2Compressor()
671
672    def read(self, size):
673        b = [self.buf]
674        x = len(self.buf)
675        while x < size:
676            raw = self.fileobj.read(self.blocksize)
677            if not raw:
678                break
679            data = self.bz2obj.decompress(raw)
680            b.append(data)
681            x += len(data)
682        self.buf = "".join(b)
683
684        buf = self.buf[:size]
685        self.buf = self.buf[size:]
686        self.pos += len(buf)
687        return buf
688
689    def seek(self, pos):
690        if pos < self.pos:
691            self.init()
692        self.read(pos - self.pos)
693
694    def tell(self):
695        return self.pos
696
697    def write(self, data):
698        self.pos += len(data)
699        raw = self.bz2obj.compress(data)
700        self.fileobj.write(raw)
701
702    def close(self):
703        if self.mode == "w":
704            raw = self.bz2obj.flush()
705            self.fileobj.write(raw)
706# class _BZ2Proxy
707
708#------------------------
709# Extraction file object
710#------------------------
711class _FileInFile(object):
712    """A thin wrapper around an existing file object that
713       provides a part of its data as an individual file
714       object.
715    """
716
717    def __init__(self, fileobj, offset, size, sparse=None):
718        self.fileobj = fileobj
719        self.offset = offset
720        self.size = size
721        self.sparse = sparse
722        self.position = 0
723
724    def tell(self):
725        """Return the current file position.
726        """
727        return self.position
728
729    def seek(self, position):
730        """Seek to a position in the file.
731        """
732        self.position = position
733
734    def read(self, size=None):
735        """Read data from the file.
736        """
737        if size is None:
738            size = self.size - self.position
739        else:
740            size = min(size, self.size - self.position)
741
742        if self.sparse is None:
743            return self.readnormal(size)
744        else:
745            return self.readsparse(size)
746
747    def __read(self, size):
748        buf = self.fileobj.read(size)
749        if len(buf) != size:
750            raise ReadError("unexpected end of data")
751        return buf
752
753    def readnormal(self, size):
754        """Read operation for regular files.
755        """
756        self.fileobj.seek(self.offset + self.position)
757        self.position += size
758        return self.__read(size)
759
760    def readsparse(self, size):
761        """Read operation for sparse files.
762        """
763        data = []
764        while size > 0:
765            buf = self.readsparsesection(size)
766            if not buf:
767                break
768            size -= len(buf)
769            data.append(buf)
770        return "".join(data)
771
772    def readsparsesection(self, size):
773        """Read a single section of a sparse file.
774        """
775        section = self.sparse.find(self.position)
776
777        if section is None:
778            return ""
779
780        size = min(size, section.offset + section.size - self.position)
781
782        if isinstance(section, _data):
783            realpos = section.realpos + self.position - section.offset
784            self.fileobj.seek(self.offset + realpos)
785            self.position += size
786            return self.__read(size)
787        else:
788            self.position += size
789            return NUL * size
790#class _FileInFile
791
792
793class ExFileObject(object):
794    """File-like object for reading an archive member.
795       Is returned by TarFile.extractfile().
796    """
797    blocksize = 1024
798
799    def __init__(self, tarfile, tarinfo):
800        self.fileobj = _FileInFile(tarfile.fileobj,
801                                   tarinfo.offset_data,
802                                   tarinfo.size,
803                                   getattr(tarinfo, "sparse", None))
804        self.name = tarinfo.name
805        self.mode = "r"
806        self.closed = False
807        self.size = tarinfo.size
808
809        self.position = 0
810        self.buffer = ""
811
812    def read(self, size=None):
813        """Read at most size bytes from the file. If size is not
814           present or None, read all data until EOF is reached.
815        """
816        if self.closed:
817            raise ValueError("I/O operation on closed file")
818
819        buf = ""
820        if self.buffer:
821            if size is None:
822                buf = self.buffer
823                self.buffer = ""
824            else:
825                buf = self.buffer[:size]
826                self.buffer = self.buffer[size:]
827
828        if size is None:
829            buf += self.fileobj.read()
830        else:
831            buf += self.fileobj.read(size - len(buf))
832
833        self.position += len(buf)
834        return buf
835
836    def readline(self, size=-1):
837        """Read one entire line from the file. If size is present
838           and non-negative, return a string with at most that
839           size, which may be an incomplete line.
840        """
841        if self.closed:
842            raise ValueError("I/O operation on closed file")
843
844        if "\n" in self.buffer:
845            pos = self.buffer.find("\n") + 1
846        else:
847            buffers = [self.buffer]
848            while True:
849                buf = self.fileobj.read(self.blocksize)
850                buffers.append(buf)
851                if not buf or "\n" in buf:
852                    self.buffer = "".join(buffers)
853                    pos = self.buffer.find("\n") + 1
854                    if pos == 0:
855                        # no newline found.
856                        pos = len(self.buffer)
857                    break
858
859        if size != -1:
860            pos = min(size, pos)
861
862        buf = self.buffer[:pos]
863        self.buffer = self.buffer[pos:]
864        self.position += len(buf)
865        return buf
866
867    def readlines(self):
868        """Return a list with all remaining lines.
869        """
870        result = []
871        while True:
872            line = self.readline()
873            if not line: break
874            result.append(line)
875        return result
876
877    def tell(self):
878        """Return the current file position.
879        """
880        if self.closed:
881            raise ValueError("I/O operation on closed file")
882
883        return self.position
884
885    def seek(self, pos, whence=os.SEEK_SET):
886        """Seek to a position in the file.
887        """
888        if self.closed:
889            raise ValueError("I/O operation on closed file")
890
891        if whence == os.SEEK_SET:
892            self.position = min(max(pos, 0), self.size)
893        elif whence == os.SEEK_CUR:
894            if pos < 0:
895                self.position = max(self.position + pos, 0)
896            else:
897                self.position = min(self.position + pos, self.size)
898        elif whence == os.SEEK_END:
899            self.position = max(min(self.size + pos, self.size), 0)
900        else:
901            raise ValueError("Invalid argument")
902
903        self.buffer = ""
904        self.fileobj.seek(self.position)
905
906    def close(self):
907        """Close the file object.
908        """
909        self.closed = True
910
911    def __iter__(self):
912        """Get an iterator over the file's lines.
913        """
914        while True:
915            line = self.readline()
916            if not line:
917                break
918            yield line
919#class ExFileObject
920
921#------------------
922# Exported Classes
923#------------------
924class TarInfo(object):
925    """Informational class which holds the details about an
926       archive member given by a tar header block.
927       TarInfo objects are returned by TarFile.getmember(),
928       TarFile.getmembers() and TarFile.gettarinfo() and are
929       usually created internally.
930    """
931
932    def __init__(self, name=""):
933        """Construct a TarInfo object. name is the optional name
934           of the member.
935        """
936        self.name = name        # member name
937        self.mode = 0644        # file permissions
938        self.uid = 0            # user id
939        self.gid = 0            # group id
940        self.size = 0           # file size
941        self.mtime = 0          # modification time
942        self.chksum = 0         # header checksum
943        self.type = REGTYPE     # member type
944        self.linkname = ""      # link name
945        self.uname = ""         # user name
946        self.gname = ""         # group name
947        self.devmajor = 0       # device major number
948        self.devminor = 0       # device minor number
949
950        self.offset = 0         # the tar header starts here
951        self.offset_data = 0    # the file's data starts here
952
953        self.pax_headers = {}   # pax header information
954
955    # In pax headers the "name" and "linkname" field are called
956    # "path" and "linkpath".
957    def _getpath(self):
958        return self.name
959    def _setpath(self, name):
960        self.name = name
961    path = property(_getpath, _setpath)
962
963    def _getlinkpath(self):
964        return self.linkname
965    def _setlinkpath(self, linkname):
966        self.linkname = linkname
967    linkpath = property(_getlinkpath, _setlinkpath)
968
969    def __repr__(self):
970        return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
971
972    def get_info(self, encoding, errors):
973        """Return the TarInfo's attributes as a dictionary.
974        """
975        info = {
976            "name":     self.name,
977            "mode":     self.mode & 07777,
978            "uid":      self.uid,
979            "gid":      self.gid,
980            "size":     self.size,
981            "mtime":    self.mtime,
982            "chksum":   self.chksum,
983            "type":     self.type,
984            "linkname": self.linkname,
985            "uname":    self.uname,
986            "gname":    self.gname,
987            "devmajor": self.devmajor,
988            "devminor": self.devminor
989        }
990
991        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
992            info["name"] += "/"
993
994        for key in ("name", "linkname", "uname", "gname"):
995            if type(info[key]) is unicode:
996                info[key] = info[key].encode(encoding, errors)
997
998        return info
999
1000    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="strict"):
1001        """Return a tar header as a string of 512 byte blocks.
1002        """
1003        info = self.get_info(encoding, errors)
1004
1005        if format == USTAR_FORMAT:
1006            return self.create_ustar_header(info)
1007        elif format == GNU_FORMAT:
1008            return self.create_gnu_header(info)
1009        elif format == PAX_FORMAT:
1010            return self.create_pax_header(info, encoding, errors)
1011        else:
1012            raise ValueError("invalid format")
1013
1014    def create_ustar_header(self, info):
1015        """Return the object as a ustar header block.
1016        """
1017        info["magic"] = POSIX_MAGIC
1018
1019        if len(info["linkname"]) > LENGTH_LINK:
1020            raise ValueError("linkname is too long")
1021
1022        if len(info["name"]) > LENGTH_NAME:
1023            info["prefix"], info["name"] = self._posix_split_name(info["name"])
1024
1025        return self._create_header(info, USTAR_FORMAT)
1026
1027    def create_gnu_header(self, info):
1028        """Return the object as a GNU header block sequence.
1029        """
1030        info["magic"] = GNU_MAGIC
1031
1032        buf = ""
1033        if len(info["linkname"]) > LENGTH_LINK:
1034            buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK)
1035
1036        if len(info["name"]) > LENGTH_NAME:
1037            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME)
1038
1039        return buf + self._create_header(info, GNU_FORMAT)
1040
1041    def create_pax_header(self, info, encoding, errors):
1042        """Return the object as a ustar header block. If it cannot be
1043           represented this way, prepend a pax extended header sequence
1044           with supplement information.
1045        """
1046        info["magic"] = POSIX_MAGIC
1047        pax_headers = self.pax_headers.copy()
1048
1049        # Test string fields for values that exceed the field length or cannot
1050        # be represented in ASCII encoding.
1051        for name, hname, length in (
1052                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
1053                ("uname", "uname", 32), ("gname", "gname", 32)):
1054
1055            if hname in pax_headers:
1056                # The pax header has priority.
1057                continue
1058
1059            val = info[name].decode(encoding, errors)
1060
1061            # Try to encode the string as ASCII.
1062            try:
1063                val.encode("ascii")
1064            except UnicodeEncodeError:
1065                pax_headers[hname] = val
1066                continue
1067
1068            if len(info[name]) > length:
1069                pax_headers[hname] = val
1070
1071        # Test number fields for values that exceed the field limit or values
1072        # that like to be stored as float.
1073        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
1074            if name in pax_headers:
1075                # The pax header has priority. Avoid overflow.
1076                info[name] = 0
1077                continue
1078
1079            val = info[name]
1080            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
1081                pax_headers[name] = unicode(val)
1082                info[name] = 0
1083
1084        # Create a pax extended header if necessary.
1085        if pax_headers:
1086            buf = self._create_pax_generic_header(pax_headers)
1087        else:
1088            buf = ""
1089
1090        return buf + self._create_header(info, USTAR_FORMAT)
1091
1092    @classmethod
1093    def create_pax_global_header(cls, pax_headers):
1094        """Return the object as a pax global header block sequence.
1095        """
1096        return cls._create_pax_generic_header(pax_headers, type=XGLTYPE)
1097
1098    def _posix_split_name(self, name):
1099        """Split a name longer than 100 chars into a prefix
1100           and a name part.
1101        """
1102        prefix = name[:LENGTH_PREFIX + 1]
1103        while prefix and prefix[-1] != "/":
1104            prefix = prefix[:-1]
1105
1106        name = name[len(prefix):]
1107        prefix = prefix[:-1]
1108
1109        if not prefix or len(name) > LENGTH_NAME:
1110            raise ValueError("name is too long")
1111        return prefix, name
1112
1113    @staticmethod
1114    def _create_header(info, format):
1115        """Return a header block. info is a dictionary with file
1116           information, format must be one of the *_FORMAT constants.
1117        """
1118        parts = [
1119            stn(info.get("name", ""), 100),
1120            itn(info.get("mode", 0) & 07777, 8, format),
1121            itn(info.get("uid", 0), 8, format),
1122            itn(info.get("gid", 0), 8, format),
1123            itn(info.get("size", 0), 12, format),
1124            itn(info.get("mtime", 0), 12, format),
1125            "        ", # checksum field
1126            info.get("type", REGTYPE),
1127            stn(info.get("linkname", ""), 100),
1128            stn(info.get("magic", POSIX_MAGIC), 8),
1129            stn(info.get("uname", ""), 32),
1130            stn(info.get("gname", ""), 32),
1131            itn(info.get("devmajor", 0), 8, format),
1132            itn(info.get("devminor", 0), 8, format),
1133            stn(info.get("prefix", ""), 155)
1134        ]
1135
1136        buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts))
1137        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
1138        buf = buf[:-364] + "%06o\0" % chksum + buf[-357:]
1139        return buf
1140
1141    @staticmethod
1142    def _create_payload(payload):
1143        """Return the string payload filled with zero bytes
1144           up to the next 512 byte border.
1145        """
1146        blocks, remainder = divmod(len(payload), BLOCKSIZE)
1147        if remainder > 0:
1148            payload += (BLOCKSIZE - remainder) * NUL
1149        return payload
1150
1151    @classmethod
1152    def _create_gnu_long_header(cls, name, type):
1153        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
1154           for name.
1155        """
1156        name += NUL
1157
1158        info = {}
1159        info["name"] = "././@LongLink"
1160        info["type"] = type
1161        info["size"] = len(name)
1162        info["magic"] = GNU_MAGIC
1163
1164        # create extended header + name blocks.
1165        return cls._create_header(info, USTAR_FORMAT) + \
1166                cls._create_payload(name)
1167
1168    @classmethod
1169    def _create_pax_generic_header(cls, pax_headers, type=XHDTYPE):
1170        """Return a POSIX.1-2001 extended or global header sequence
1171           that contains a list of keyword, value pairs. The values
1172           must be unicode objects.
1173        """
1174        records = []
1175        for keyword, value in pax_headers.iteritems():
1176            keyword = keyword.encode("utf8")
1177            value = value.encode("utf8")
1178            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
1179            n = p = 0
1180            while True:
1181                n = l + len(str(p))
1182                if n == p:
1183                    break
1184                p = n
1185            records.append("%d %s=%s\n" % (p, keyword, value))
1186        records = "".join(records)
1187
1188        # We use a hardcoded "././@PaxHeader" name like star does
1189        # instead of the one that POSIX recommends.
1190        info = {}
1191        info["name"] = "././@PaxHeader"
1192        info["type"] = type
1193        info["size"] = len(records)
1194        info["magic"] = POSIX_MAGIC
1195
1196        # Create pax header + record blocks.
1197        return cls._create_header(info, USTAR_FORMAT) + \
1198                cls._create_payload(records)
1199
1200    @classmethod
1201    def frombuf(cls, buf):
1202        """Construct a TarInfo object from a 512 byte string buffer.
1203        """
1204        if len(buf) == 0:
1205            raise EmptyHeaderError("empty header")
1206        if len(buf) != BLOCKSIZE:
1207            raise TruncatedHeaderError("truncated header")
1208        if buf.count(NUL) == BLOCKSIZE:
1209            raise EOFHeaderError("end of file header")
1210
1211        chksum = nti(buf[148:156])
1212        if chksum not in calc_chksums(buf):
1213            raise InvalidHeaderError("bad checksum")
1214
1215        obj = cls()
1216        obj.buf = buf
1217        obj.name = nts(buf[0:100])
1218        obj.mode = nti(buf[100:108])
1219        obj.uid = nti(buf[108:116])
1220        obj.gid = nti(buf[116:124])
1221        obj.size = nti(buf[124:136])
1222        obj.mtime = nti(buf[136:148])
1223        obj.chksum = chksum
1224        obj.type = buf[156:157]
1225        obj.linkname = nts(buf[157:257])
1226        obj.uname = nts(buf[265:297])
1227        obj.gname = nts(buf[297:329])
1228        obj.devmajor = nti(buf[329:337])
1229        obj.devminor = nti(buf[337:345])
1230        prefix = nts(buf[345:500])
1231
1232        # Old V7 tar format represents a directory as a regular
1233        # file with a trailing slash.
1234        if obj.type == AREGTYPE and obj.name.endswith("/"):
1235            obj.type = DIRTYPE
1236
1237        # Remove redundant slashes from directories.
1238        if obj.isdir():
1239            obj.name = obj.name.rstrip("/")
1240
1241        # Reconstruct a ustar longname.
1242        if prefix and obj.type not in GNU_TYPES:
1243            obj.name = prefix + "/" + obj.name
1244        return obj
1245
1246    @classmethod
1247    def fromtarfile(cls, tarfile):
1248        """Return the next TarInfo object from TarFile object
1249           tarfile.
1250        """
1251        buf = tarfile.fileobj.read(BLOCKSIZE)
1252        obj = cls.frombuf(buf)
1253        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1254        return obj._proc_member(tarfile)
1255
1256    #--------------------------------------------------------------------------
1257    # The following are methods that are called depending on the type of a
1258    # member. The entry point is _proc_member() which can be overridden in a
1259    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1260    # implement the following
1261    # operations:
1262    # 1. Set self.offset_data to the position where the data blocks begin,
1263    #    if there is data that follows.
1264    # 2. Set tarfile.offset to the position where the next member's header will
1265    #    begin.
1266    # 3. Return self or another valid TarInfo object.
1267    def _proc_member(self, tarfile):
1268        """Choose the right processing method depending on
1269           the type and call it.
1270        """
1271        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1272            return self._proc_gnulong(tarfile)
1273        elif self.type == GNUTYPE_SPARSE:
1274            return self._proc_sparse(tarfile)
1275        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1276            return self._proc_pax(tarfile)
1277        else:
1278            return self._proc_builtin(tarfile)
1279
1280    def _proc_builtin(self, tarfile):
1281        """Process a builtin type or an unknown type which
1282           will be treated as a regular file.
1283        """
1284        self.offset_data = tarfile.fileobj.tell()
1285        offset = self.offset_data
1286        if self.isreg() or self.type not in SUPPORTED_TYPES:
1287            # Skip the following data blocks.
1288            offset += self._block(self.size)
1289        tarfile.offset = offset
1290
1291        # Patch the TarInfo object with saved global
1292        # header information.
1293        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1294
1295        return self
1296
1297    def _proc_gnulong(self, tarfile):
1298        """Process the blocks that hold a GNU longname
1299           or longlink member.
1300        """
1301        buf = tarfile.fileobj.read(self._block(self.size))
1302
1303        # Fetch the next header and process it.
1304        try:
1305            next = self.fromtarfile(tarfile)
1306        except HeaderError:
1307            raise SubsequentHeaderError("missing or bad subsequent header")
1308
1309        # Patch the TarInfo object from the next header with
1310        # the longname information.
1311        next.offset = self.offset
1312        if self.type == GNUTYPE_LONGNAME:
1313            next.name = nts(buf)
1314        elif self.type == GNUTYPE_LONGLINK:
1315            next.linkname = nts(buf)
1316
1317        return next
1318
1319    def _proc_sparse(self, tarfile):
1320        """Process a GNU sparse header plus extra headers.
1321        """
1322        buf = self.buf
1323        sp = _ringbuffer()
1324        pos = 386
1325        lastpos = 0L
1326        realpos = 0L
1327        # There are 4 possible sparse structs in the
1328        # first header.
1329        for i in xrange(4):
1330            try:
1331                offset = nti(buf[pos:pos + 12])
1332                numbytes = nti(buf[pos + 12:pos + 24])
1333            except ValueError:
1334                break
1335            if offset > lastpos:
1336                sp.append(_hole(lastpos, offset - lastpos))
1337            sp.append(_data(offset, numbytes, realpos))
1338            realpos += numbytes
1339            lastpos = offset + numbytes
1340            pos += 24
1341
1342        isextended = ord(buf[482])
1343        origsize = nti(buf[483:495])
1344
1345        # If the isextended flag is given,
1346        # there are extra headers to process.
1347        while isextended == 1:
1348            buf = tarfile.fileobj.read(BLOCKSIZE)
1349            pos = 0
1350            for i in xrange(21):
1351                try:
1352                    offset = nti(buf[pos:pos + 12])
1353                    numbytes = nti(buf[pos + 12:pos + 24])
1354                except ValueError:
1355                    break
1356                if offset > lastpos:
1357                    sp.append(_hole(lastpos, offset - lastpos))
1358                sp.append(_data(offset, numbytes, realpos))
1359                realpos += numbytes
1360                lastpos = offset + numbytes
1361                pos += 24
1362            isextended = ord(buf[504])
1363
1364        if lastpos < origsize:
1365            sp.append(_hole(lastpos, origsize - lastpos))
1366
1367        self.sparse = sp
1368
1369        self.offset_data = tarfile.fileobj.tell()
1370        tarfile.offset = self.offset_data + self._block(self.size)
1371        self.size = origsize
1372
1373        return self
1374
1375    def _proc_pax(self, tarfile):
1376        """Process an extended or global header as described in
1377           POSIX.1-2001.
1378        """
1379        # Read the header information.
1380        buf = tarfile.fileobj.read(self._block(self.size))
1381
1382        # A pax header stores supplemental information for either
1383        # the following file (extended) or all following files
1384        # (global).
1385        if self.type == XGLTYPE:
1386            pax_headers = tarfile.pax_headers
1387        else:
1388            pax_headers = tarfile.pax_headers.copy()
1389
1390        # Parse pax header information. A record looks like that:
1391        # "%d %s=%s\n" % (length, keyword, value). length is the size
1392        # of the complete record including the length field itself and
1393        # the newline. keyword and value are both UTF-8 encoded strings.
1394        regex = re.compile(r"(\d+) ([^=]+)=", re.U)
1395        pos = 0
1396        while True:
1397            match = regex.match(buf, pos)
1398            if not match:
1399                break
1400
1401            length, keyword = match.groups()
1402            length = int(length)
1403            value = buf[match.end(2) + 1:match.start(1) + length - 1]
1404
1405            keyword = keyword.decode("utf8")
1406            value = value.decode("utf8")
1407
1408            pax_headers[keyword] = value
1409            pos += length
1410
1411        # Fetch the next header.
1412        try:
1413            next = self.fromtarfile(tarfile)
1414        except HeaderError:
1415            raise SubsequentHeaderError("missing or bad subsequent header")
1416
1417        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1418            # Patch the TarInfo object with the extended header info.
1419            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1420            next.offset = self.offset
1421
1422            if "size" in pax_headers:
1423                # If the extended header replaces the size field,
1424                # we need to recalculate the offset where the next
1425                # header starts.
1426                offset = next.offset_data
1427                if next.isreg() or next.type not in SUPPORTED_TYPES:
1428                    offset += next._block(next.size)
1429                tarfile.offset = offset
1430
1431        return next
1432
1433    def _apply_pax_info(self, pax_headers, encoding, errors):
1434        """Replace fields with supplemental information from a previous
1435           pax extended or global header.
1436        """
1437        for keyword, value in pax_headers.iteritems():
1438            if keyword not in PAX_FIELDS:
1439                continue
1440
1441            if keyword == "path":
1442                value = value.rstrip("/")
1443
1444            if keyword in PAX_NUMBER_FIELDS:
1445                try:
1446                    value = PAX_NUMBER_FIELDS[keyword](value)
1447                except ValueError:
1448                    value = 0
1449            else:
1450                value = uts(value, encoding, errors)
1451
1452            setattr(self, keyword, value)
1453
1454        self.pax_headers = pax_headers.copy()
1455
1456    def _block(self, count):
1457        """Round up a byte count by BLOCKSIZE and return it,
1458           e.g. _block(834) => 1024.
1459        """
1460        blocks, remainder = divmod(count, BLOCKSIZE)
1461        if remainder:
1462            blocks += 1
1463        return blocks * BLOCKSIZE
1464
1465    def isreg(self):
1466        return self.type in REGULAR_TYPES
1467    def isfile(self):
1468        return self.isreg()
1469    def isdir(self):
1470        return self.type == DIRTYPE
1471    def issym(self):
1472        return self.type == SYMTYPE
1473    def islnk(self):
1474        return self.type == LNKTYPE
1475    def ischr(self):
1476        return self.type == CHRTYPE
1477    def isblk(self):
1478        return self.type == BLKTYPE
1479    def isfifo(self):
1480        return self.type == FIFOTYPE
1481    def issparse(self):
1482        return self.type == GNUTYPE_SPARSE
1483    def isdev(self):
1484        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1485# class TarInfo
1486
1487class TarFile(object):
1488    """The TarFile Class provides an interface to tar archives.
1489    """
1490
1491    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)
1492
1493    dereference = False         # If true, add content of linked file to the
1494                                # tar file, else the link.
1495
1496    ignore_zeros = False        # If true, skips empty or invalid blocks and
1497                                # continues processing.
1498
1499    errorlevel = 1              # If 0, fatal errors only appear in debug
1500                                # messages (if debug >= 0). If > 0, errors
1501                                # are passed to the caller as exceptions.
1502
1503    format = DEFAULT_FORMAT     # The format to use when creating an archive.
1504
1505    encoding = ENCODING         # Encoding for 8-bit character strings.
1506
1507    errors = None               # Error handler for unicode conversion.
1508
1509    tarinfo = TarInfo           # The default TarInfo class to use.
1510
1511    fileobject = ExFileObject   # The default ExFileObject class to use.
1512
1513    def __init__(self, name=None, mode="r", fileobj=None, format=None,
1514            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
1515            errors=None, pax_headers=None, debug=None, errorlevel=None):
1516        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
1517           read from an existing archive, 'a' to append data to an existing
1518           file or 'w' to create a new file overwriting an existing one. `mode'
1519           defaults to 'r'.
1520           If `fileobj' is given, it is used for reading or writing data. If it
1521           can be determined, `mode' is overridden by `fileobj's mode.
1522           `fileobj' is not closed, when TarFile is closed.
1523        """
1524        modes = {"r": "rb", "a": "r+b", "w": "wb"}
1525        if mode not in modes:
1526            raise ValueError("mode must be 'r', 'a' or 'w'")
1527        self.mode = mode
1528        self._mode = modes[mode]
1529
1530        if not fileobj:
1531            if self.mode == "a" and not os.path.exists(name):
1532                # Create nonexistent files in append mode.
1533                self.mode = "w"
1534                self._mode = "wb"
1535            fileobj = bltn_open(name, self._mode)
1536            self._extfileobj = False
1537        else:
1538            if name is None and hasattr(fileobj, "name"):
1539                name = fileobj.name
1540            if hasattr(fileobj, "mode"):
1541                self._mode = fileobj.mode
1542            self._extfileobj = True
1543        self.name = os.path.abspath(name) if name else None
1544        self.fileobj = fileobj
1545
1546        # Init attributes.
1547        if format is not None:
1548            self.format = format
1549        if tarinfo is not None:
1550            self.tarinfo = tarinfo
1551        if dereference is not None:
1552            self.dereference = dereference
1553        if ignore_zeros is not None:
1554            self.ignore_zeros = ignore_zeros
1555        if encoding is not None:
1556            self.encoding = encoding
1557
1558        if errors is not None:
1559            self.errors = errors
1560        elif mode == "r":
1561            self.errors = "utf-8"
1562        else:
1563            self.errors = "strict"
1564
1565        if pax_headers is not None and self.format == PAX_FORMAT:
1566            self.pax_headers = pax_headers
1567        else:
1568            self.pax_headers = {}
1569
1570        if debug is not None:
1571            self.debug = debug
1572        if errorlevel is not None:
1573            self.errorlevel = errorlevel
1574
1575        # Init datastructures.
1576        self.closed = False
1577        self.members = []       # list of members as TarInfo objects
1578        self._loaded = False    # flag if all members have been read
1579        self.offset = self.fileobj.tell()
1580                                # current position in the archive file
1581        self.inodes = {}        # dictionary caching the inodes of
1582                                # archive members already added
1583
1584        try:
1585            if self.mode == "r":
1586                self.firstmember = None
1587                self.firstmember = self.next()
1588
1589            if self.mode == "a":
1590                # Move to the end of the archive,
1591                # before the first empty block.
1592                while True:
1593                    self.fileobj.seek(self.offset)
1594                    try:
1595                        tarinfo = self.tarinfo.fromtarfile(self)
1596                        self.members.append(tarinfo)
1597                    except EOFHeaderError:
1598                        self.fileobj.seek(self.offset)
1599                        break
1600                    except HeaderError, e:
1601                        raise ReadError(str(e))
1602
1603            if self.mode in "aw":
1604                self._loaded = True
1605
1606                if self.pax_headers:
1607                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
1608                    self.fileobj.write(buf)
1609                    self.offset += len(buf)
1610        except:
1611            if not self._extfileobj:
1612                self.fileobj.close()
1613            self.closed = True
1614            raise
1615
1616    def _getposix(self):
1617        return self.format == USTAR_FORMAT
1618    def _setposix(self, value):
1619        import warnings
1620        warnings.warn("use the format attribute instead", DeprecationWarning,
1621                      2)
1622        if value:
1623            self.format = USTAR_FORMAT
1624        else:
1625            self.format = GNU_FORMAT
1626    posix = property(_getposix, _setposix)
1627
1628    #--------------------------------------------------------------------------
1629    # Below are the classmethods which act as alternate constructors to the
1630    # TarFile class. The open() method is the only one that is needed for
1631    # public use; it is the "super"-constructor and is able to select an
1632    # adequate "sub"-constructor for a particular compression using the mapping
1633    # from OPEN_METH.
1634    #
1635    # This concept allows one to subclass TarFile without losing the comfort of
1636    # the super-constructor. A sub-constructor is registered and made available
1637    # by adding it to the mapping in OPEN_METH.
1638
1639    @classmethod
1640    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
1641        """Open a tar archive for reading, writing or appending. Return
1642           an appropriate TarFile class.
1643
1644           mode:
1645           'r' or 'r:*' open for reading with transparent compression
1646           'r:'         open for reading exclusively uncompressed
1647           'r:gz'       open for reading with gzip compression
1648           'r:bz2'      open for reading with bzip2 compression
1649           'a' or 'a:'  open for appending, creating the file if necessary
1650           'w' or 'w:'  open for writing without compression
1651           'w:gz'       open for writing with gzip compression
1652           'w:bz2'      open for writing with bzip2 compression
1653
1654           'r|*'        open a stream of tar blocks with transparent compression
1655           'r|'         open an uncompressed stream of tar blocks for reading
1656           'r|gz'       open a gzip compressed stream of tar blocks
1657           'r|bz2'      open a bzip2 compressed stream of tar blocks
1658           'w|'         open an uncompressed stream for writing
1659           'w|gz'       open a gzip compressed stream for writing
1660           'w|bz2'      open a bzip2 compressed stream for writing
1661        """
1662
1663        if not name and not fileobj:
1664            raise ValueError("nothing to open")
1665
1666        if mode in ("r", "r:*"):
1667            # Find out which *open() is appropriate for opening the file.
1668            def not_compressed(comptype):
1669                return cls.OPEN_METH[comptype] == 'taropen'
1670            for comptype in sorted(cls.OPEN_METH, key=not_compressed):
1671                func = getattr(cls, cls.OPEN_METH[comptype])
1672                if fileobj is not None:
1673                    saved_pos = fileobj.tell()
1674                try:
1675                    return func(name, "r", fileobj, **kwargs)
1676                except (ReadError, CompressionError), e:
1677                    if fileobj is not None:
1678                        fileobj.seek(saved_pos)
1679                    continue
1680            raise ReadError("file could not be opened successfully")
1681
1682        elif ":" in mode:
1683            filemode, comptype = mode.split(":", 1)
1684            filemode = filemode or "r"
1685            comptype = comptype or "tar"
1686
1687            # Select the *open() function according to
1688            # given compression.
1689            if comptype in cls.OPEN_METH:
1690                func = getattr(cls, cls.OPEN_METH[comptype])
1691            else:
1692                raise CompressionError("unknown compression type %r" % comptype)
1693            return func(name, filemode, fileobj, **kwargs)
1694
1695        elif "|" in mode:
1696            filemode, comptype = mode.split("|", 1)
1697            filemode = filemode or "r"
1698            comptype = comptype or "tar"
1699
1700            if filemode not in ("r", "w"):
1701                raise ValueError("mode must be 'r' or 'w'")
1702
1703            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
1704            try:
1705                t = cls(name, filemode, stream, **kwargs)
1706            except:
1707                stream.close()
1708                raise
1709            t._extfileobj = False
1710            return t
1711
1712        elif mode in ("a", "w"):
1713            return cls.taropen(name, mode, fileobj, **kwargs)
1714
1715        raise ValueError("undiscernible mode")
1716
1717    @classmethod
1718    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
1719        """Open uncompressed tar archive name for reading or writing.
1720        """
1721        if mode not in ("r", "a", "w"):
1722            raise ValueError("mode must be 'r', 'a' or 'w'")
1723        return cls(name, mode, fileobj, **kwargs)
1724
1725    @classmethod
1726    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1727        """Open gzip compressed tar archive name for reading or writing.
1728           Appending is not allowed.
1729        """
1730        if mode not in ("r", "w"):
1731            raise ValueError("mode must be 'r' or 'w'")
1732
1733        try:
1734            import gzip
1735            gzip.GzipFile
1736        except (ImportError, AttributeError):
1737            raise CompressionError("gzip module is not available")
1738
1739        try:
1740            fileobj = gzip.GzipFile(name, mode, compresslevel, fileobj)
1741        except OSError:
1742            if fileobj is not None and mode == 'r':
1743                raise ReadError("not a gzip file")
1744            raise
1745
1746        try:
1747            t = cls.taropen(name, mode, fileobj, **kwargs)
1748        except IOError:
1749            fileobj.close()
1750            if mode == 'r':
1751                raise ReadError("not a gzip file")
1752            raise
1753        except:
1754            fileobj.close()
1755            raise
1756        t._extfileobj = False
1757        return t
1758
1759    @classmethod
1760    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1761        """Open bzip2 compressed tar archive name for reading or writing.
1762           Appending is not allowed.
1763        """
1764        if mode not in ("r", "w"):
1765            raise ValueError("mode must be 'r' or 'w'.")
1766
1767        try:
1768            import bz2
1769        except ImportError:
1770            raise CompressionError("bz2 module is not available")
1771
1772        if fileobj is not None:
1773            fileobj = _BZ2Proxy(fileobj, mode)
1774        else:
1775            fileobj = bz2.BZ2File(name, mode, compresslevel=compresslevel)
1776
1777        try:
1778            t = cls.taropen(name, mode, fileobj, **kwargs)
1779        except (IOError, EOFError):
1780            fileobj.close()
1781            if mode == 'r':
1782                raise ReadError("not a bzip2 file")
1783            raise
1784        except:
1785            fileobj.close()
1786            raise
1787        t._extfileobj = False
1788        return t
1789
1790    # All *open() methods are registered here.
1791    OPEN_METH = {
1792        "tar": "taropen",   # uncompressed tar
1793        "gz":  "gzopen",    # gzip compressed tar
1794        "bz2": "bz2open"    # bzip2 compressed tar
1795    }
1796
1797    #--------------------------------------------------------------------------
1798    # The public methods which TarFile provides:
1799
1800    def close(self):
1801        """Close the TarFile. In write-mode, two finishing zero blocks are
1802           appended to the archive.
1803        """
1804        if self.closed:
1805            return
1806
1807        self.closed = True
1808        try:
1809            if self.mode in "aw":
1810                self.fileobj.write(NUL * (BLOCKSIZE * 2))
1811                self.offset += (BLOCKSIZE * 2)
1812                # fill up the end with zero-blocks
1813                # (like option -b20 for tar does)
1814                blocks, remainder = divmod(self.offset, RECORDSIZE)
1815                if remainder > 0:
1816                    self.fileobj.write(NUL * (RECORDSIZE - remainder))
1817        finally:
1818            if not self._extfileobj:
1819                self.fileobj.close()
1820
1821    def getmember(self, name):
1822        """Return a TarInfo object for member `name'. If `name' can not be
1823           found in the archive, KeyError is raised. If a member occurs more
1824           than once in the archive, its last occurrence is assumed to be the
1825           most up-to-date version.
1826        """
1827        tarinfo = self._getmember(name)
1828        if tarinfo is None:
1829            raise KeyError("filename %r not found" % name)
1830        return tarinfo
1831
1832    def getmembers(self):
1833        """Return the members of the archive as a list of TarInfo objects. The
1834           list has the same order as the members in the archive.
1835        """
1836        self._check()
1837        if not self._loaded:    # if we want to obtain a list of
1838            self._load()        # all members, we first have to
1839                                # scan the whole archive.
1840        return self.members
1841
1842    def getnames(self):
1843        """Return the members of the archive as a list of their names. It has
1844           the same order as the list returned by getmembers().
1845        """
1846        return [tarinfo.name for tarinfo in self.getmembers()]
1847
1848    def gettarinfo(self, name=None, arcname=None, fileobj=None):
1849        """Create a TarInfo object from the result of os.stat or equivalent
1850           on an existing file. The file is either named by `name', or
1851           specified as a file object `fileobj' with a file descriptor. If
1852           given, `arcname' specifies an alternative name for the file in the
1853           archive, otherwise, the name is taken from the 'name' attribute of
1854           'fileobj', or the 'name' argument.
1855        """
1856        self._check("aw")
1857
1858        # When fileobj is given, replace name by
1859        # fileobj's real name.
1860        if fileobj is not None:
1861            name = fileobj.name
1862
1863        # Building the name of the member in the archive.
1864        # Backward slashes are converted to forward slashes,
1865        # Absolute paths are turned to relative paths.
1866        if arcname is None:
1867            arcname = name
1868        drv, arcname = os.path.splitdrive(arcname)
1869        arcname = arcname.replace(os.sep, "/")
1870        arcname = arcname.lstrip("/")
1871
1872        # Now, fill the TarInfo object with
1873        # information specific for the file.
1874        tarinfo = self.tarinfo()
1875        tarinfo.tarfile = self  # Not needed
1876
1877        # Use os.stat or os.lstat, depending on platform
1878        # and if symlinks shall be resolved.
1879        if fileobj is None:
1880            if hasattr(os, "lstat") and not self.dereference:
1881                statres = os.lstat(name)
1882            else:
1883                statres = os.stat(name)
1884        else:
1885            statres = os.fstat(fileobj.fileno())
1886        linkname = ""
1887
1888        stmd = statres.st_mode
1889        if stat.S_ISREG(stmd):
1890            inode = (statres.st_ino, statres.st_dev)
1891            if not self.dereference and statres.st_nlink > 1 and \
1892                    inode in self.inodes and arcname != self.inodes[inode]:
1893                # Is it a hardlink to an already
1894                # archived file?
1895                type = LNKTYPE
1896                linkname = self.inodes[inode]
1897            else:
1898                # The inode is added only if its valid.
1899                # For win32 it is always 0.
1900                type = REGTYPE
1901                if inode[0]:
1902                    self.inodes[inode] = arcname
1903        elif stat.S_ISDIR(stmd):
1904            type = DIRTYPE
1905        elif stat.S_ISFIFO(stmd):
1906            type = FIFOTYPE
1907        elif stat.S_ISLNK(stmd):
1908            type = SYMTYPE
1909            linkname = os.readlink(name)
1910        elif stat.S_ISCHR(stmd):
1911            type = CHRTYPE
1912        elif stat.S_ISBLK(stmd):
1913            type = BLKTYPE
1914        else:
1915            return None
1916
1917        # Fill the TarInfo object with all
1918        # information we can get.
1919        tarinfo.name = arcname
1920        tarinfo.mode = stmd
1921        tarinfo.uid = statres.st_uid
1922        tarinfo.gid = statres.st_gid
1923        if type == REGTYPE:
1924            tarinfo.size = statres.st_size
1925        else:
1926            tarinfo.size = 0L
1927        tarinfo.mtime = statres.st_mtime
1928        tarinfo.type = type
1929        tarinfo.linkname = linkname
1930        if pwd:
1931            try:
1932                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
1933            except KeyError:
1934                pass
1935        if grp:
1936            try:
1937                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
1938            except KeyError:
1939                pass
1940
1941        if type in (CHRTYPE, BLKTYPE):
1942            if hasattr(os, "major") and hasattr(os, "minor"):
1943                tarinfo.devmajor = os.major(statres.st_rdev)
1944                tarinfo.devminor = os.minor(statres.st_rdev)
1945        return tarinfo
1946
1947    def list(self, verbose=True):
1948        """Print a table of contents to sys.stdout. If `verbose' is False, only
1949           the names of the members are printed. If it is True, an `ls -l'-like
1950           output is produced.
1951        """
1952        self._check()
1953
1954        for tarinfo in self:
1955            if verbose:
1956                print filemode(tarinfo.mode),
1957                print "%s/%s" % (tarinfo.uname or tarinfo.uid,
1958                                 tarinfo.gname or tarinfo.gid),
1959                if tarinfo.ischr() or tarinfo.isblk():
1960                    print "%10s" % ("%d,%d" \
1961                                    % (tarinfo.devmajor, tarinfo.devminor)),
1962                else:
1963                    print "%10d" % tarinfo.size,
1964                print "%d-%02d-%02d %02d:%02d:%02d" \
1965                      % time.localtime(tarinfo.mtime)[:6],
1966
1967            print tarinfo.name + ("/" if tarinfo.isdir() else ""),
1968
1969            if verbose:
1970                if tarinfo.issym():
1971                    print "->", tarinfo.linkname,
1972                if tarinfo.islnk():
1973                    print "link to", tarinfo.linkname,
1974            print
1975
1976    def add(self, name, arcname=None, recursive=True, exclude=None, filter=None):
1977        """Add the file `name' to the archive. `name' may be any type of file
1978           (directory, fifo, symbolic link, etc.). If given, `arcname'
1979           specifies an alternative name for the file in the archive.
1980           Directories are added recursively by default. This can be avoided by
1981           setting `recursive' to False. `exclude' is a function that should
1982           return True for each filename to be excluded. `filter' is a function
1983           that expects a TarInfo object argument and returns the changed
1984           TarInfo object, if it returns None the TarInfo object will be
1985           excluded from the archive.
1986        """
1987        self._check("aw")
1988
1989        if arcname is None:
1990            arcname = name
1991
1992        # Exclude pathnames.
1993        if exclude is not None:
1994            import warnings
1995            warnings.warn("use the filter argument instead",
1996                    DeprecationWarning, 2)
1997            if exclude(name):
1998                self._dbg(2, "tarfile: Excluded %r" % name)
1999                return
2000
2001        # Skip if somebody tries to archive the archive...
2002        if self.name is not None and os.path.abspath(name) == self.name:
2003            self._dbg(2, "tarfile: Skipped %r" % name)
2004            return
2005
2006        self._dbg(1, name)
2007
2008        # Create a TarInfo object from the file.
2009        tarinfo = self.gettarinfo(name, arcname)
2010
2011        if tarinfo is None:
2012            self._dbg(1, "tarfile: Unsupported type %r" % name)
2013            return
2014
2015        # Change or exclude the TarInfo object.
2016        if filter is not None:
2017            tarinfo = filter(tarinfo)
2018            if tarinfo is None:
2019                self._dbg(2, "tarfile: Excluded %r" % name)
2020                return
2021
2022        # Append the tar header and data to the archive.
2023        if tarinfo.isreg():
2024            with bltn_open(name, "rb") as f:
2025                self.addfile(tarinfo, f)
2026
2027        elif tarinfo.isdir():
2028            self.addfile(tarinfo)
2029            if recursive:
2030                for f in os.listdir(name):
2031                    self.add(os.path.join(name, f), os.path.join(arcname, f),
2032                            recursive, exclude, filter)
2033
2034        else:
2035            self.addfile(tarinfo)
2036
2037    def addfile(self, tarinfo, fileobj=None):
2038        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
2039           given, tarinfo.size bytes are read from it and added to the archive.
2040           You can create TarInfo objects directly, or by using gettarinfo().
2041           On Windows platforms, `fileobj' should always be opened with mode
2042           'rb' to avoid irritation about the file size.
2043        """
2044        self._check("aw")
2045
2046        tarinfo = copy.copy(tarinfo)
2047
2048        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2049        self.fileobj.write(buf)
2050        self.offset += len(buf)
2051
2052        # If there's data to follow, append it.
2053        if fileobj is not None:
2054            copyfileobj(fileobj, self.fileobj, tarinfo.size)
2055            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
2056            if remainder > 0:
2057                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2058                blocks += 1
2059            self.offset += blocks * BLOCKSIZE
2060
2061        self.members.append(tarinfo)
2062
2063    def extractall(self, path=".", members=None):
2064        """Extract all members from the archive to the current working
2065           directory and set owner, modification time and permissions on
2066           directories afterwards. `path' specifies a different directory
2067           to extract to. `members' is optional and must be a subset of the
2068           list returned by getmembers().
2069        """
2070        directories = []
2071
2072        if members is None:
2073            members = self
2074
2075        for tarinfo in members:
2076            if tarinfo.isdir():
2077                # Extract directories with a safe mode.
2078                directories.append(tarinfo)
2079                tarinfo = copy.copy(tarinfo)
2080                tarinfo.mode = 0700
2081            self.extract(tarinfo, path)
2082
2083        # Reverse sort directories.
2084        directories.sort(key=operator.attrgetter('name'))
2085        directories.reverse()
2086
2087        # Set correct owner, mtime and filemode on directories.
2088        for tarinfo in directories:
2089            dirpath = os.path.join(path, tarinfo.name)
2090            try:
2091                self.chown(tarinfo, dirpath)
2092                self.utime(tarinfo, dirpath)
2093                self.chmod(tarinfo, dirpath)
2094            except ExtractError, e:
2095                if self.errorlevel > 1:
2096                    raise
2097                else:
2098                    self._dbg(1, "tarfile: %s" % e)
2099
2100    def extract(self, member, path=""):
2101        """Extract a member from the archive to the current working directory,
2102           using its full name. Its file information is extracted as accurately
2103           as possible. `member' may be a filename or a TarInfo object. You can
2104           specify a different directory using `path'.
2105        """
2106        self._check("r")
2107
2108        if isinstance(member, basestring):
2109            tarinfo = self.getmember(member)
2110        else:
2111            tarinfo = member
2112
2113        # Prepare the link target for makelink().
2114        if tarinfo.islnk():
2115            tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2116
2117        try:
2118            self._extract_member(tarinfo, os.path.join(path, tarinfo.name))
2119        except EnvironmentError, e:
2120            if self.errorlevel > 0:
2121                raise
2122            else:
2123                if e.filename is None:
2124                    self._dbg(1, "tarfile: %s" % e.strerror)
2125                else:
2126                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2127        except ExtractError, e:
2128            if self.errorlevel > 1:
2129                raise
2130            else:
2131                self._dbg(1, "tarfile: %s" % e)
2132
2133    def extractfile(self, member):
2134        """Extract a member from the archive as a file object. `member' may be
2135           a filename or a TarInfo object. If `member' is a regular file, a
2136           file-like object is returned. If `member' is a link, a file-like
2137           object is constructed from the link's target. If `member' is none of
2138           the above, None is returned.
2139           The file-like object is read-only and provides the following
2140           methods: read(), readline(), readlines(), seek() and tell()
2141        """
2142        self._check("r")
2143
2144        if isinstance(member, basestring):
2145            tarinfo = self.getmember(member)
2146        else:
2147            tarinfo = member
2148
2149        if tarinfo.isreg():
2150            return self.fileobject(self, tarinfo)
2151
2152        elif tarinfo.type not in SUPPORTED_TYPES:
2153            # If a member's type is unknown, it is treated as a
2154            # regular file.
2155            return self.fileobject(self, tarinfo)
2156
2157        elif tarinfo.islnk() or tarinfo.issym():
2158            if isinstance(self.fileobj, _Stream):
2159                # A small but ugly workaround for the case that someone tries
2160                # to extract a (sym)link as a file-object from a non-seekable
2161                # stream of tar blocks.
2162                raise StreamError("cannot extract (sym)link as file object")
2163            else:
2164                # A (sym)link's file object is its target's file object.
2165                return self.extractfile(self._find_link_target(tarinfo))
2166        else:
2167            # If there's no data associated with the member (directory, chrdev,
2168            # blkdev, etc.), return None instead of a file object.
2169            return None
2170
2171    def _extract_member(self, tarinfo, targetpath):
2172        """Extract the TarInfo object tarinfo to a physical
2173           file called targetpath.
2174        """
2175        # Fetch the TarInfo object for the given name
2176        # and build the destination pathname, replacing
2177        # forward slashes to platform specific separators.
2178        targetpath = targetpath.rstrip("/")
2179        targetpath = targetpath.replace("/", os.sep)
2180
2181        # Create all upper directories.
2182        upperdirs = os.path.dirname(targetpath)
2183        if upperdirs and not os.path.exists(upperdirs):
2184            # Create directories that are not part of the archive with
2185            # default permissions.
2186            os.makedirs(upperdirs)
2187
2188        if tarinfo.islnk() or tarinfo.issym():
2189            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2190        else:
2191            self._dbg(1, tarinfo.name)
2192
2193        if tarinfo.isreg():
2194            self.makefile(tarinfo, targetpath)
2195        elif tarinfo.isdir():
2196            self.makedir(tarinfo, targetpath)
2197        elif tarinfo.isfifo():
2198            self.makefifo(tarinfo, targetpath)
2199        elif tarinfo.ischr() or tarinfo.isblk():
2200            self.makedev(tarinfo, targetpath)
2201        elif tarinfo.islnk() or tarinfo.issym():
2202            self.makelink(tarinfo, targetpath)
2203        elif tarinfo.type not in SUPPORTED_TYPES:
2204            self.makeunknown(tarinfo, targetpath)
2205        else:
2206            self.makefile(tarinfo, targetpath)
2207
2208        self.chown(tarinfo, targetpath)
2209        if not tarinfo.issym():
2210            self.chmod(tarinfo, targetpath)
2211            self.utime(tarinfo, targetpath)
2212
2213    #--------------------------------------------------------------------------
2214    # Below are the different file methods. They are called via
2215    # _extract_member() when extract() is called. They can be replaced in a
2216    # subclass to implement other functionality.
2217
2218    def makedir(self, tarinfo, targetpath):
2219        """Make a directory called targetpath.
2220        """
2221        try:
2222            # Use a safe mode for the directory, the real mode is set
2223            # later in _extract_member().
2224            os.mkdir(targetpath, 0700)
2225        except EnvironmentError, e:
2226            if e.errno != errno.EEXIST:
2227                raise
2228
2229    def makefile(self, tarinfo, targetpath):
2230        """Make a file called targetpath.
2231        """
2232        source = self.extractfile(tarinfo)
2233        try:
2234            with bltn_open(targetpath, "wb") as target:
2235                copyfileobj(source, target)
2236        finally:
2237            source.close()
2238
2239    def makeunknown(self, tarinfo, targetpath):
2240        """Make a file from a TarInfo object with an unknown type
2241           at targetpath.
2242        """
2243        self.makefile(tarinfo, targetpath)
2244        self._dbg(1, "tarfile: Unknown file type %r, " \
2245                     "extracted as regular file." % tarinfo.type)
2246
2247    def makefifo(self, tarinfo, targetpath):
2248        """Make a fifo called targetpath.
2249        """
2250        if hasattr(os, "mkfifo"):
2251            os.mkfifo(targetpath)
2252        else:
2253            raise ExtractError("fifo not supported by system")
2254
2255    def makedev(self, tarinfo, targetpath):
2256        """Make a character or block device called targetpath.
2257        """
2258        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
2259            raise ExtractError("special devices not supported by system")
2260
2261        mode = tarinfo.mode
2262        if tarinfo.isblk():
2263            mode |= stat.S_IFBLK
2264        else:
2265            mode |= stat.S_IFCHR
2266
2267        os.mknod(targetpath, mode,
2268                 os.makedev(tarinfo.devmajor, tarinfo.devminor))
2269
2270    def makelink(self, tarinfo, targetpath):
2271        """Make a (symbolic) link called targetpath. If it cannot be created
2272          (platform limitation), we try to make a copy of the referenced file
2273          instead of a link.
2274        """
2275        if hasattr(os, "symlink") and hasattr(os, "link"):
2276            # For systems that support symbolic and hard links.
2277            if tarinfo.issym():
2278                if os.path.lexists(targetpath):
2279                    os.unlink(targetpath)
2280                os.symlink(tarinfo.linkname, targetpath)
2281            else:
2282                # See extract().
2283                if os.path.exists(tarinfo._link_target):
2284                    if os.path.lexists(targetpath):
2285                        os.unlink(targetpath)
2286                    os.link(tarinfo._link_target, targetpath)
2287                else:
2288                    self._extract_member(self._find_link_target(tarinfo), targetpath)
2289        else:
2290            try:
2291                self._extract_member(self._find_link_target(tarinfo), targetpath)
2292            except KeyError:
2293                raise ExtractError("unable to resolve link inside archive")
2294
2295    def chown(self, tarinfo, targetpath):
2296        """Set owner of targetpath according to tarinfo.
2297        """
2298        if pwd and hasattr(os, "geteuid") and os.geteuid() == 0:
2299            # We have to be root to do so.
2300            try:
2301                g = grp.getgrnam(tarinfo.gname)[2]
2302            except KeyError:
2303                g = tarinfo.gid
2304            try:
2305                u = pwd.getpwnam(tarinfo.uname)[2]
2306            except KeyError:
2307                u = tarinfo.uid
2308            try:
2309                if tarinfo.issym() and hasattr(os, "lchown"):
2310                    os.lchown(targetpath, u, g)
2311                else:
2312                    if sys.platform != "os2emx":
2313                        os.chown(targetpath, u, g)
2314            except EnvironmentError, e:
2315                raise ExtractError("could not change owner")
2316
2317    def chmod(self, tarinfo, targetpath):
2318        """Set file permissions of targetpath according to tarinfo.
2319        """
2320        if hasattr(os, 'chmod'):
2321            try:
2322                os.chmod(targetpath, tarinfo.mode)
2323            except EnvironmentError, e:
2324                raise ExtractError("could not change mode")
2325
2326    def utime(self, tarinfo, targetpath):
2327        """Set modification time of targetpath according to tarinfo.
2328        """
2329        if not hasattr(os, 'utime'):
2330            return
2331        try:
2332            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
2333        except EnvironmentError, e:
2334            raise ExtractError("could not change modification time")
2335
2336    #--------------------------------------------------------------------------
2337    def next(self):
2338        """Return the next member of the archive as a TarInfo object, when
2339           TarFile is opened for reading. Return None if there is no more
2340           available.
2341        """
2342        self._check("ra")
2343        if self.firstmember is not None:
2344            m = self.firstmember
2345            self.firstmember = None
2346            return m
2347
2348        # Advance the file pointer.
2349        if self.offset != self.fileobj.tell():
2350            self.fileobj.seek(self.offset - 1)
2351            if not self.fileobj.read(1):
2352                raise ReadError("unexpected end of data")
2353
2354        # Read the next block.
2355        tarinfo = None
2356        while True:
2357            try:
2358                tarinfo = self.tarinfo.fromtarfile(self)
2359            except EOFHeaderError, e:
2360                if self.ignore_zeros:
2361                    self._dbg(2, "0x%X: %s" % (self.offset, e))
2362                    self.offset += BLOCKSIZE
2363                    continue
2364            except InvalidHeaderError, e:
2365                if self.ignore_zeros:
2366                    self._dbg(2, "0x%X: %s" % (self.offset, e))
2367                    self.offset += BLOCKSIZE
2368                    continue
2369                elif self.offset == 0:
2370                    raise ReadError(str(e))
2371            except EmptyHeaderError:
2372                if self.offset == 0:
2373                    raise ReadError("empty file")
2374            except TruncatedHeaderError, e:
2375                if self.offset == 0:
2376                    raise ReadError(str(e))
2377            except SubsequentHeaderError, e:
2378                raise ReadError(str(e))
2379            break
2380
2381        if tarinfo is not None:
2382            self.members.append(tarinfo)
2383        else:
2384            self._loaded = True
2385
2386        return tarinfo
2387
2388    #--------------------------------------------------------------------------
2389    # Little helper methods:
2390
2391    def _getmember(self, name, tarinfo=None, normalize=False):
2392        """Find an archive member by name from bottom to top.
2393           If tarinfo is given, it is used as the starting point.
2394        """
2395        # Ensure that all members have been loaded.
2396        members = self.getmembers()
2397
2398        # Limit the member search list up to tarinfo.
2399        if tarinfo is not None:
2400            members = members[:members.index(tarinfo)]
2401
2402        if normalize:
2403            name = os.path.normpath(name)
2404
2405        for member in reversed(members):
2406            if normalize:
2407                member_name = os.path.normpath(member.name)
2408            else:
2409                member_name = member.name
2410
2411            if name == member_name:
2412                return member
2413
2414    def _load(self):
2415        """Read through the entire archive file and look for readable
2416           members.
2417        """
2418        while True:
2419            tarinfo = self.next()
2420            if tarinfo is None:
2421                break
2422        self._loaded = True
2423
2424    def _check(self, mode=None):
2425        """Check if TarFile is still open, and if the operation's mode
2426           corresponds to TarFile's mode.
2427        """
2428        if self.closed:
2429            raise IOError("%s is closed" % self.__class__.__name__)
2430        if mode is not None and self.mode not in mode:
2431            raise IOError("bad operation for mode %r" % self.mode)
2432
2433    def _find_link_target(self, tarinfo):
2434        """Find the target member of a symlink or hardlink member in the
2435           archive.
2436        """
2437        if tarinfo.issym():
2438            # Always search the entire archive.
2439            linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
2440            limit = None
2441        else:
2442            # Search the archive before the link, because a hard link is
2443            # just a reference to an already archived file.
2444            linkname = tarinfo.linkname
2445            limit = tarinfo
2446
2447        member = self._getmember(linkname, tarinfo=limit, normalize=True)
2448        if member is None:
2449            raise KeyError("linkname %r not found" % linkname)
2450        return member
2451
2452    def __iter__(self):
2453        """Provide an iterator object.
2454        """
2455        if self._loaded:
2456            return iter(self.members)
2457        else:
2458            return TarIter(self)
2459
2460    def _dbg(self, level, msg):
2461        """Write debugging output to sys.stderr.
2462        """
2463        if level <= self.debug:
2464            print >> sys.stderr, msg
2465
2466    def __enter__(self):
2467        self._check()
2468        return self
2469
2470    def __exit__(self, type, value, traceback):
2471        if type is None:
2472            self.close()
2473        else:
2474            # An exception occurred. We must not call close() because
2475            # it would try to write end-of-archive blocks and padding.
2476            if not self._extfileobj:
2477                self.fileobj.close()
2478            self.closed = True
2479# class TarFile
2480
2481class TarIter:
2482    """Iterator Class.
2483
2484       for tarinfo in TarFile(...):
2485           suite...
2486    """
2487
2488    def __init__(self, tarfile):
2489        """Construct a TarIter object.
2490        """
2491        self.tarfile = tarfile
2492        self.index = 0
2493    def __iter__(self):
2494        """Return iterator object.
2495        """
2496        return self
2497    def next(self):
2498        """Return the next item using TarFile's next() method.
2499           When all members have been read, set TarFile as _loaded.
2500        """
2501        # Fix for SF #1100429: Under rare circumstances it can
2502        # happen that getmembers() is called during iteration,
2503        # which will cause TarIter to stop prematurely.
2504
2505        if self.index == 0 and self.tarfile.firstmember is not None:
2506            tarinfo = self.tarfile.next()
2507        elif self.index < len(self.tarfile.members):
2508            tarinfo = self.tarfile.members[self.index]
2509        elif not self.tarfile._loaded:
2510            tarinfo = self.tarfile.next()
2511            if not tarinfo:
2512                self.tarfile._loaded = True
2513                raise StopIteration
2514        else:
2515            raise StopIteration
2516        self.index += 1
2517        return tarinfo
2518
2519# Helper classes for sparse file support
2520class _section:
2521    """Base class for _data and _hole.
2522    """
2523    def __init__(self, offset, size):
2524        self.offset = offset
2525        self.size = size
2526    def __contains__(self, offset):
2527        return self.offset <= offset < self.offset + self.size
2528
2529class _data(_section):
2530    """Represent a data section in a sparse file.
2531    """
2532    def __init__(self, offset, size, realpos):
2533        _section.__init__(self, offset, size)
2534        self.realpos = realpos
2535
2536class _hole(_section):
2537    """Represent a hole section in a sparse file.
2538    """
2539    pass
2540
2541class _ringbuffer(list):
2542    """Ringbuffer class which increases performance
2543       over a regular list.
2544    """
2545    def __init__(self):
2546        self.idx = 0
2547    def find(self, offset):
2548        idx = self.idx
2549        while True:
2550            item = self[idx]
2551            if offset in item:
2552                break
2553            idx += 1
2554            if idx == len(self):
2555                idx = 0
2556            if idx == self.idx:
2557                # End of File
2558                return None
2559        self.idx = idx
2560        return item
2561
2562#---------------------------------------------
2563# zipfile compatible TarFile class
2564#---------------------------------------------
2565TAR_PLAIN = 0           # zipfile.ZIP_STORED
2566TAR_GZIPPED = 8         # zipfile.ZIP_DEFLATED
2567class TarFileCompat:
2568    """TarFile class compatible with standard module zipfile's
2569       ZipFile class.
2570    """
2571    def __init__(self, file, mode="r", compression=TAR_PLAIN):
2572        from warnings import warnpy3k
2573        warnpy3k("the TarFileCompat class has been removed in Python 3.0",
2574                stacklevel=2)
2575        if compression == TAR_PLAIN:
2576            self.tarfile = TarFile.taropen(file, mode)
2577        elif compression == TAR_GZIPPED:
2578            self.tarfile = TarFile.gzopen(file, mode)
2579        else:
2580            raise ValueError("unknown compression constant")
2581        if mode[0:1] == "r":
2582            members = self.tarfile.getmembers()
2583            for m in members:
2584                m.filename = m.name
2585                m.file_size = m.size
2586                m.date_time = time.gmtime(m.mtime)[:6]
2587    def namelist(self):
2588        return map(lambda m: m.name, self.infolist())
2589    def infolist(self):
2590        return filter(lambda m: m.type in REGULAR_TYPES,
2591                      self.tarfile.getmembers())
2592    def printdir(self):
2593        self.tarfile.list()
2594    def testzip(self):
2595        return
2596    def getinfo(self, name):
2597        return self.tarfile.getmember(name)
2598    def read(self, name):
2599        return self.tarfile.extractfile(self.tarfile.getmember(name)).read()
2600    def write(self, filename, arcname=None, compress_type=None):
2601        self.tarfile.add(filename, arcname)
2602    def writestr(self, zinfo, bytes):
2603        try:
2604            from cStringIO import StringIO
2605        except ImportError:
2606            from StringIO import StringIO
2607        import calendar
2608        tinfo = TarInfo(zinfo.filename)
2609        tinfo.size = len(bytes)
2610        tinfo.mtime = calendar.timegm(zinfo.date_time)
2611        self.tarfile.addfile(tinfo, StringIO(bytes))
2612    def close(self):
2613        self.tarfile.close()
2614#class TarFileCompat
2615
2616#--------------------
2617# exported functions
2618#--------------------
2619def is_tarfile(name):
2620    """Return True if name points to a tar archive that we
2621       are able to handle, else return False.
2622    """
2623    try:
2624        t = open(name)
2625        t.close()
2626        return True
2627    except TarError:
2628        return False
2629
2630open = TarFile.open
2631