# Copyright (c) 2015, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
15e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley"""Extracts archives."""
16e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
17e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
18e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langleyimport optparse
19e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langleyimport os
20e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langleyimport os.path
21e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langleyimport tarfile
22e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langleyimport shutil
23e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langleyimport sys
24e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langleyimport zipfile
25e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
26e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
def CheckedJoin(output, path):
  """
  CheckedJoin normalizes path and returns it joined onto output.

  A ValueError carrying the normalized path is raised if that path is absolute
  or begins with '.', which covers '.', '..' and anything climbing out of
  output (and, as a side effect, any name that merely starts with a dot).
  These are sanity checks only; do not rely on them for untrusted input.
  """
  normalized = os.path.normpath(path)
  leaves_output = os.path.isabs(normalized) or normalized.startswith('.')
  if leaves_output:
    raise ValueError(normalized)
  return os.path.join(output, normalized)
37e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
38e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
def IterateZip(path):
  """
  IterateZip is a generator over the zip file at path. For every entry whose
  name does not end in '/', it yields a (filename, mode, fileobj) tuple. The
  mode is always None and fileobj is the entry opened for reading.

  The zip file stays open while the generator is being consumed and is closed
  when the generator finishes or is closed.
  """
  with zipfile.ZipFile(path, 'r') as archive:
    for entry in archive.infolist():
      name = entry.filename
      # Names ending in a slash are directory entries; there is nothing to
      # extract for them.
      if not name.endswith('/'):
        yield (name, None, archive.open(entry))
49e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
50e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
def IterateTar(path):
  """
  IterateTar is a generator over the gzip-compressed tar file at path. For
  every regular file it yields a (filename, mode, fileobj) tuple, where mode is
  the permission mode recorded in the archive and fileobj reads the member's
  contents.

  Directory members are skipped. Any other kind of member raises a ValueError.
  """
  with tarfile.open(path, 'r:gz') as archive:
    for member in archive:
      if member.isdir():
        continue
      if member.isfile():
        yield (member.name, member.mode, archive.extractfile(member))
      else:
        # Neither a directory nor a regular file: refuse to guess how it
        # should be extracted.
        raise ValueError('Unknown entry type "%s"' % (member.name, ))
63e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
64e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
def main(args):
  """
  main extracts ARCHIVE (a .zip or .tar.gz file) into the directory OUTPUT,
  first deleting OUTPUT if it already exists.

  args is the list of command-line arguments, excluding the program name.
  Unless --no-prefix is given, every entry in the archive must live under one
  common top-level directory, and that directory is stripped from the
  extracted paths.

  Returns the process exit code: 1 if the wrong number of positional arguments
  was given, and 0 on success or when ARCHIVE does not exist.

  Raises ValueError if ARCHIVE has an unsupported extension, or if an entry
  has a path containing a backslash, starting with '/', lacking the top-level
  directory to strip, disagreeing with the other entries about that directory,
  or rejected by CheckedJoin.
  """
  parser = optparse.OptionParser(usage='Usage: %prog ARCHIVE OUTPUT')
  parser.add_option('--no-prefix', dest='no_prefix', action='store_true',
                    help='Do not remove a prefix from paths in the archive.')
  options, args = parser.parse_args(args)

  if len(args) != 2:
    parser.print_help()
    return 1

  archive, output = args

  if not os.path.exists(archive):
    # Skip archives that weren't downloaded.
    return 0

  if archive.endswith('.zip'):
    entries = IterateZip(archive)
  elif archive.endswith('.tar.gz'):
    entries = IterateTar(archive)
  else:
    raise ValueError(archive)

  # Each print below passes a single pre-formatted string in parentheses, which
  # behaves identically as a Python 2 print statement and a Python 3 call.
  try:
    if os.path.exists(output):
      print("Removing %s" % (output, ))
      shutil.rmtree(output)

    print("Extracting %s to %s" % (archive, output))
    prefix = None
    num_extracted = 0
    for path, mode, inp in entries:
      try:
        # Even on Windows, zip files must always use forward slashes.
        if '\\' in path or path.startswith('/'):
          raise ValueError(path)

        if not options.no_prefix:
          new_prefix, separator, rest = path.partition('/')
          if not separator:
            # A file at the top level of the archive has no prefix to remove.
            raise ValueError(
                'No top-level directory to strip from "%s"' % (path, ))

          # Ensure the archive is consistent.
          if prefix is None:
            prefix = new_prefix
          if prefix != new_prefix:
            raise ValueError((prefix, new_prefix))
        else:
          rest = path

        # Extract the file into the output directory.
        fixed_path = CheckedJoin(output, rest)
        parent = os.path.dirname(fixed_path)
        if not os.path.isdir(parent):
          os.makedirs(parent)
        with open(fixed_path, 'wb') as out:
          shutil.copyfileobj(inp, out)
      finally:
        # Release the per-entry reader whether or not extraction succeeded.
        inp.close()

      # Fix up permissions if need be.
      # TODO(davidben): To be extra tidy, this should only track the execute bit
      # as in git.
      if mode is not None:
        os.chmod(fixed_path, mode)

      # Print every 100 files, so bots do not time out on large archives.
      num_extracted += 1
      if num_extracted % 100 == 0:
        print("Extracted %d files..." % (num_extracted,))
  finally:
    # Closing the generator also closes the underlying archive.
    entries.close()

  # Always report the final total, not only when it is a multiple of 100.
  print("Done. Extracted %d files." % (num_extracted,))

  return 0
136e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
137e9ada863a7b3e81f5d2b1e3bdd2305da902a87f5Adam Langley
if __name__ == '__main__':
  # Run as a script: hand main() the arguments after the program name and use
  # its return value as the process exit status.
  exit_code = main(sys.argv[1:])
  sys.exit(exit_code)
140