1# Copyright 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import json
6import logging
7from cStringIO import StringIO
8import posixpath
9import traceback
10from zipfile import ZipFile
11
12import appengine_blobstore as blobstore
13from appengine_url_fetcher import AppEngineUrlFetcher
14from appengine_wrappers import urlfetch
15from docs_server_utils import StringIdentity
16from file_system import FileNotFoundError, FileSystem, FileSystemError, StatInfo
17from future import Future
18from object_store_creator import ObjectStoreCreator
19from path_util import AssertIsDirectory, IsDirectory
20import url_constants
21
22
# Namespace passed to the blobstore whenever GithubFileSystem stores or
# retrieves a repository's zipball.
_GITHUB_REPOS_NAMESPACE = 'GithubRepos'
24
25
def _LoadCredentials(object_store_creator):
  '''Returns the GitHub credentials as a (username, password) tuple.

  The values are looked up under the 'username' and 'password' keys of the
  'password' category object store that |object_store_creator| creates for
  GithubFileSystem. That store is created with start_empty=False and
  app_version=None.
  '''
  credential_keys = ('username', 'password')
  credential_store = object_store_creator.Create(GithubFileSystem,
                                                 app_version=None,
                                                 category='password',
                                                 start_empty=False)
  # GetMulti returns a Future; resolve it here since both values are needed
  # before anything can be fetched.
  credentials = credential_store.GetMulti(credential_keys).Get()
  username = credentials.get('username')
  password = credentials.get('password')
  return username, password
36
37
class _GithubZipFile(object):
  '''A view of a ZipFile with a more convenient interface which ignores the
  'zipball' prefix that all paths have. The zip files that come straight from
  GitHub have paths like ['zipball/foo.txt', 'zipball/bar.txt'] but we only
  care about ['foo.txt', 'bar.txt'].
  '''

  @classmethod
  def Create(cls, repo_name, blob):
    '''Parses |blob|, the raw contents of a zip archive, and returns a
    _GithubZipFile wrapping it.

    Returns None, after logging a warning that mentions |repo_name|, if:
      - |blob| cannot be opened as a zip archive,
      - the archive contains no entries, or
      - the entries do not all live under one common top-level directory
        (including entries that have no directory component at all).
    '''
    try:
      zipball = ZipFile(StringIO(blob))
    except Exception:
      # Deliberately broad: whatever goes wrong while parsing, the blob is
      # unusable as a zip. Unlike a bare except this still lets
      # KeyboardInterrupt and SystemExit propagate.
      logging.warning('zipball "%s" is not a valid zip' % repo_name)
      return None

    names = zipball.namelist()
    if not names:
      logging.warning('zipball "%s" is empty' % repo_name)
      return None

    name_prefix = None  # probably 'zipball'
    paths = []
    for name in names:
      # partition() rather than split() so that an entry without any '/'
      # doesn't blow up with a ValueError; such an entry has no prefix
      # directory and makes the archive unusable for our purposes.
      prefix, separator, path = name.partition('/')
      if not separator or (name_prefix is not None and prefix != name_prefix):
        logging.warning('zipball "%s" has names with inconsistent prefix: %s' %
                        (repo_name, names))
        return None
      name_prefix = prefix
      paths.append(path)
    return cls(zipball, name_prefix, paths)

  def __init__(self, zipball, name_prefix, paths):
    '''|zipball| is the underlying ZipFile, |name_prefix| the top-level
    directory shared by every entry in it, and |paths| the entry names with
    that directory (and the '/' following it) stripped.
    '''
    self._zipball = zipball
    self._name_prefix = name_prefix
    self._paths = paths

  def Paths(self):
    '''Return all file paths in this zip file, relative to the common prefix
    directory. An entry for the prefix directory itself shows up as ''.
    '''
    return self._paths

  def List(self, path):
    '''Returns all files within a directory at |path|. Not recursive. Paths
    are returned relative to |path|; entries that end in a '/' keep it.
    '''
    AssertIsDirectory(path)
    children = []
    for candidate in self._paths:
      if candidate == path or not candidate.startswith(path):
        continue
      relative = candidate[len(path):]
      # Only direct children: anything with a '/' before the (optional)
      # trailing one lives in a deeper directory.
      if '/' not in relative.rstrip('/'):
        children.append(relative)
    return children

  def Read(self, path):
    '''Returns the contents of |path|. Raises a KeyError if it doesn't exist.
    '''
    return self._zipball.read(posixpath.join(self._name_prefix, path))
93
94
class GithubFileSystem(FileSystem):
  '''Allows reading from a github.com repository.

  The repository is obtained as a single zipball, either from the blobstore
  or, when that copy is missing, unreadable or stale, from GitHub itself. All
  reads and stats are then answered from that zipball.
  '''
  @staticmethod
  def Create(owner, repo, object_store_creator):
    '''Creates a GithubFileSystem that corresponds to a single github repository
    specified by |owner| and |repo|.
    '''
    return GithubFileSystem(
        url_constants.GITHUB_REPOS,
        owner,
        repo,
        object_store_creator,
        AppEngineUrlFetcher)

  @staticmethod
  def ForTest(repo, fake_fetcher, path=None, object_store_creator=None):
    '''Creates a GithubFileSystem that can be used for testing. It reads zip
    files and commit data from server2/test_data/github_file_system/test_owner
    instead of github.com. It reads from files specified by |repo|.

    |fake_fetcher| is used in place of AppEngineUrlFetcher, |path| overrides
    the default base location, and |object_store_creator| defaults to
    ObjectStoreCreator.ForTest().
    '''
    return GithubFileSystem(
        path if path is not None else 'test_data/github_file_system',
        'test_owner',
        repo,
        object_store_creator or ObjectStoreCreator.ForTest(),
        fake_fetcher)

  def __init__(self, base_url, owner, repo, object_store_creator, Fetcher):
    '''|base_url|, |owner| and |repo| are joined to form the repository URL;
    |owner| and |repo| alone form the key used for caches and log messages.
    |object_store_creator| supplies the credentials and the caches, and
    |Fetcher| is a callable that is given the repository URL and returns the
    fetcher used for all requests.
    '''
    self._repo_key = posixpath.join(owner, repo)
    self._repo_url = posixpath.join(base_url, owner, repo)
    self._username, self._password = _LoadCredentials(object_store_creator)
    self._blobstore = blobstore.AppEngineBlobstore()
    self._fetcher = Fetcher(self._repo_url)
    # Stores whether the github is up-to-date. This will either be True or
    # empty, the emptiness most likely due to this being a cron run.
    self._up_to_date_cache = object_store_creator.Create(
        GithubFileSystem, category='up-to-date')
    # Caches the zip file's stat. Overrides start_empty=False and use
    # |self._up_to_date_cache| to determine whether we need to refresh.
    self._stat_cache = object_store_creator.Create(
        GithubFileSystem, category='stat-cache', start_empty=False)

    # Created lazily in |_EnsureRepoZip|.
    self._repo_zip = None

  def _EnsureRepoZip(self):
    '''Initializes |self._repo_zip| if it hasn't already been (i.e. if
    _EnsureRepoZip has never been called before). In that case |self._repo_zip|
    will be set to a Future of _GithubZipFile and the fetch process started,
    whether that be from a blobstore or if necessary all the way from GitHub.
    '''
    if self._repo_zip is not None:
      return

    repo_key, repo_url, username, password = (
        self._repo_key, self._repo_url, self._username, self._password)

    def fetch_from_blobstore(version):
      '''Returns a Future which resolves to the _GithubZipFile for this repo
      fetched from blobstore. Falls back to fetching from GitHub, recording
      |version|, if the blob is missing or cannot be parsed.
      '''
      blob = None
      try:
        blob = self._blobstore.Get(repo_url, _GITHUB_REPOS_NAMESPACE)
      except blobstore.BlobNotFoundError:
        # Treated the same as Get() returning None, below.
        pass

      if blob is None:
        logging.warning('No blob for %s found in datastore' % repo_key)
        return fetch_from_github(version)

      repo_zip = _GithubZipFile.Create(repo_key, blob)
      if repo_zip is None:
        logging.warning('Blob for %s was corrupted in blobstore!?' % repo_key)
        return fetch_from_github(version)

      return Future(value=repo_zip)

    def fetch_from_github(version):
      '''Returns a Future which resolves to the _GithubZipFile for this repo
      fetched new from GitHub, then writes it to blobstore and |version| to the
      stat caches.
      '''
      def get_zip(github_zip):
        # Raises FileSystemError if the download failed or the downloaded
        # data isn't a usable zipball; nothing is cached in either case.
        try:
          blob = github_zip.content
        except urlfetch.DownloadError:
          raise FileSystemError('Failed to download repo %s file from %s' %
                                (repo_key, repo_url))

        repo_zip = _GithubZipFile.Create(repo_key, blob)
        if repo_zip is None:
          raise FileSystemError('Blob for %s was fetched corrupted from %s' %
                                (repo_key, repo_url))

        # Only persist once the zipball is known to be good.
        self._blobstore.Set(repo_url, blob, _GITHUB_REPOS_NAMESPACE)
        self._up_to_date_cache.Set(repo_key, True)
        self._stat_cache.Set(repo_key, version)
        return repo_zip
      return self._fetcher.FetchAsync(
          'zipball', username=username, password=password).Then(get_zip)

    # To decide whether we need to re-stat, and from there whether to re-fetch,
    # make use of ObjectStore's start-empty configuration. If
    # |object_store_creator| is configured to start empty then our creator
    # wants to refresh (e.g. running a cron), so fetch the live stat from
    # GitHub. If the stat hasn't changed since last time then no reason to
    # re-fetch from GitHub, just take from blobstore.

    cached_version = self._stat_cache.Get(repo_key).Get()
    if self._up_to_date_cache.Get(repo_key).Get() is None:
      # This is either a cron or an instance where a cron has never been run.
      live_version = self._FetchLiveVersion(username, password)
      if cached_version != live_version:
        # Note: branch intentionally triggered if |cached_version| is None.
        logging.info('%s has changed, fetching from GitHub.' % repo_url)
        self._repo_zip = fetch_from_github(live_version)
      else:
        # Already up to date. Fetch from blobstore. No need to set up-to-date
        # to True here since it'll already be set for instances, and it'll
        # never be set for crons.
        logging.info('%s is up to date.' % repo_url)
        self._repo_zip = fetch_from_blobstore(cached_version)
    else:
      # Instance where cron has been run. It should be in blobstore.
      self._repo_zip = fetch_from_blobstore(cached_version)

    assert self._repo_zip is not None

  def _FetchLiveVersion(self, username, password):
    '''Fetches the current repository version from github.com and returns it.
    The version is a 'sha' hash value.

    Raises a FileSystemError if the response is not JSON or is not a JSON
    object with a 'sha' key.
    '''
    # TODO(kalman): Do this asynchronously (use FetchAsync).
    result = self._fetcher.Fetch(
        'commits/HEAD', username=username, password=password)

    try:
      return json.loads(result.content)['sha']
    except (KeyError, TypeError, ValueError):
      # ValueError: not JSON. KeyError: an object without 'sha'. TypeError:
      # valid JSON that cannot be indexed by 'sha' at all (e.g. a list).
      raise FileSystemError('Error parsing JSON from repo %s: %s' %
                            (self._repo_url, traceback.format_exc()))

  def Refresh(self):
    '''Returns the result of reading the root path ''.

    NOTE(review): ReadSingle is not defined in this class; presumably it is
    inherited from FileSystem and goes through Read(), which would kick off
    the zipball fetch - confirm against file_system.py.
    '''
    return self.ReadSingle('')

  def Read(self, paths, skip_not_found=False):
    '''Returns a Future of a dictionary mapping |paths| to the contents of
    the file at each path. If path ends with a '/', it is treated as a
    directory and is mapped to a list of filenames in that directory.

    If a path does not exist in the repository it is left out of the result
    when |skip_not_found| is True; otherwise a FileNotFoundError is raised
    while the zipball is being read, not when Read() is called.
    '''
    self._EnsureRepoZip()
    def read(repo_zip):
      # Build the lookup set once rather than scanning the list per path.
      known_paths = set(repo_zip.Paths())
      reads = {}
      for path in paths:
        if path not in known_paths:
          if skip_not_found:
            continue
          raise FileNotFoundError('"%s": %s not found' % (self._repo_key, path))
        if IsDirectory(path):
          reads[path] = repo_zip.List(path)
        else:
          reads[path] = repo_zip.Read(path)
      return reads
    return self._repo_zip.Then(read)

  def Stat(self, path):
    '''Stats |path| returning its version as as StatInfo object. If |path| ends
    with a '/', it is assumed to be a directory and the StatInfo object returned
    includes child_versions for all paths in the directory.

    File paths do not include the name of the zip file, which is arbitrary and
    useless to consumers.

    Every path shares one version: the value held in the stat cache for this
    repository, which is written whenever the zipball is fetched from GitHub.

    Resolves the zipball Future synchronously. Raises a FileNotFoundError if
    |path| is not in the repository.
    '''
    self._EnsureRepoZip()
    repo_zip = self._repo_zip.Get()

    if path not in repo_zip.Paths():
      raise FileNotFoundError('"%s" does not contain file "%s"' %
                              (self._repo_key, path))

    version = self._stat_cache.Get(self._repo_key).Get()
    assert version is not None, ('There was a zipball in datastore; there '
                                 'should be a version cached for it')

    stat_info = StatInfo(version)
    if IsDirectory(path):
      stat_info.child_versions = dict((p, StatInfo(version))
                                      for p in repo_zip.List(path))
    return stat_info

  def GetIdentity(self):
    '''Returns a string identifying this file system, derived from the class
    name and the owner/repo key.
    '''
    return '%s' % StringIdentity(self.__class__.__name__ + self._repo_key)

  def __repr__(self):
    return '%s(key=%s, url=%s)' % (type(self).__name__,
                                   self._repo_key,
                                   self._repo_url)
295