# Copyright (c) 2015, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

"""Extracts archives."""


import optparse
import os
import os.path
import shutil
import sys
import tarfile
import zipfile


def CheckedJoin(output, path):
  """
  CheckedJoin returns os.path.join(output, path). It does sanity checks to
  ensure the resulting path is under output, but shouldn't be used on untrusted
  input.

  Raises ValueError (carrying the normalized path) if the normalized path is
  absolute or starts with '.'. Note that this also rejects names that merely
  begin with a dot, such as '.gitignore'.
  """
  path = os.path.normpath(path)
  if os.path.isabs(path) or path.startswith('.'):
    raise ValueError(path)
  return os.path.join(output, path)


def IterateZip(path):
  """
  IterateZip opens the zip file at path and returns a generator of
  (filename, mode, fileobj) tuples for each file in it.

  Directory entries (names ending in '/') are skipped. mode is always None for
  zip entries. Each fileobj is closed once the consumer advances to the next
  entry or closes the generator, so it must be consumed before moving on.
  """
  with zipfile.ZipFile(path, 'r') as zip_file:
    for info in zip_file.infolist():
      if info.filename.endswith('/'):
        continue
      fileobj = zip_file.open(info)
      try:
        yield (info.filename, None, fileobj)
      finally:
        # Do not leak the per-entry handle.
        fileobj.close()


def IterateTar(path):
  """
  IterateTar opens the tar.gz file at path and returns a generator of
  (filename, mode, fileobj) tuples for each file in it.

  Directory entries are skipped. Any entry that is neither a directory nor a
  regular file raises ValueError. Each fileobj is closed once the consumer
  advances to the next entry or closes the generator, so it must be consumed
  before moving on.
  """
  with tarfile.open(path, 'r:gz') as tar_file:
    for info in tar_file:
      if info.isdir():
        continue
      if not info.isfile():
        raise ValueError('Unknown entry type "%s"' % (info.name, ))
      fileobj = tar_file.extractfile(info)
      try:
        yield (info.name, info.mode, fileobj)
      finally:
        # Do not leak the per-entry handle.
        fileobj.close()


def main(args):
  """
  Extracts the archive named by args[0] into the directory named by args[1].

  args is the command line without the program name: optional flags followed
  by ARCHIVE and OUTPUT. Unless --no-prefix is given, every entry must live
  under one common top-level directory, which is stripped on extraction. An
  existing OUTPUT is removed before extraction.

  Returns 0 on success (including when ARCHIVE does not exist, in which case
  nothing is done) and 1 on a usage error. Raises ValueError for an
  unsupported archive extension or a malformed entry path.
  """
  parser = optparse.OptionParser(usage='Usage: %prog ARCHIVE OUTPUT')
  parser.add_option('--no-prefix', dest='no_prefix', action='store_true',
                    help='Do not remove a prefix from paths in the archive.')
  options, args = parser.parse_args(args)

  if len(args) != 2:
    parser.print_help()
    return 1

  archive, output = args

  if not os.path.exists(archive):
    # Skip archives that weren't downloaded.
    return 0

  if archive.endswith('.zip'):
    entries = IterateZip(archive)
  elif archive.endswith('.tar.gz'):
    entries = IterateTar(archive)
  else:
    raise ValueError(archive)

  try:
    if os.path.exists(output):
      print("Removing %s" % (output, ))
      shutil.rmtree(output)

    print("Extracting %s to %s" % (archive, output))
    prefix = None
    num_extracted = 0
    for path, mode, inp in entries:
      # Even on Windows, zip files must always use forward slashes.
      if '\\' in path or path.startswith('/'):
        raise ValueError(path)

      if not options.no_prefix:
        # A top-level file has no prefix directory to strip; fail with a
        # clear message instead of an opaque tuple-unpacking error.
        if '/' not in path:
          raise ValueError(
              'Entry "%s" is not inside a prefix directory' % (path, ))
        new_prefix, rest = path.split('/', 1)

        # Ensure the archive is consistent.
        if prefix is None:
          prefix = new_prefix
        if prefix != new_prefix:
          raise ValueError((prefix, new_prefix))
      else:
        rest = path

      # Extract the file into the output directory.
      fixed_path = CheckedJoin(output, rest)
      if not os.path.isdir(os.path.dirname(fixed_path)):
        os.makedirs(os.path.dirname(fixed_path))
      with open(fixed_path, 'wb') as out:
        shutil.copyfileobj(inp, out)

      # Fix up permissions if need be.
      # TODO(davidben): To be extra tidy, this should only track the execute bit
      # as in git.
      if mode is not None:
        os.chmod(fixed_path, mode)

      # Print every 100 files, so bots do not time out on large archives.
      num_extracted += 1
      if num_extracted % 100 == 0:
        print("Extracted %d files..." % (num_extracted,))
  finally:
    # Closing the generator also closes the underlying archive.
    entries.close()

  # Always report the final count, not only when it is a multiple of 100.
  print("Done. Extracted %d files." % (num_extracted,))

  return 0


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))