# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import json
import logging
from cStringIO import StringIO
import posixpath
import traceback
from zipfile import ZipFile

import appengine_blobstore as blobstore
from appengine_url_fetcher import AppEngineUrlFetcher
from appengine_wrappers import urlfetch
from docs_server_utils import StringIdentity
from file_system import FileNotFoundError, FileSystem, FileSystemError, StatInfo
from future import Future
from object_store_creator import ObjectStoreCreator
from path_util import AssertIsDirectory, IsDirectory
import url_constants


# Blobstore namespace under which downloaded repository zipballs are stored.
_GITHUB_REPOS_NAMESPACE = 'GithubRepos'


def _LoadCredentials(object_store_creator):
  '''Returns a (username, password) tuple read from the 'password' category
  object store that |object_store_creator| creates for GithubFileSystem.
  Each element is whatever the store holds under the 'username' / 'password'
  keys; a key that is absent from the fetched data yields None.
  '''
  password_store = object_store_creator.Create(
      GithubFileSystem,
      app_version=None,
      category='password',
      start_empty=False)
  password_data = password_store.GetMulti(('username', 'password')).Get()
  return password_data.get('username'), password_data.get('password')


class _GithubZipFile(object):
  '''A view of a ZipFile with a more convenient interface which ignores the
  'zipball' prefix that all paths have. The zip files that come straight from
  GitHub have paths like ['zipball/foo.txt', 'zipball/bar.txt'] but we only
  care about ['foo.txt', 'bar.txt'].
  '''

  @classmethod
  def Create(cls, repo_name, blob):
    '''Parses |blob| (the raw bytes of a zip archive) and returns a
    _GithubZipFile wrapping it, or None (after logging a warning) if |blob|
    is not a usable zipball: not a valid zip, empty, containing an entry that
    is not inside a top-level directory, or containing entries under more
    than one top-level directory. |repo_name| is only used in log messages.
    '''
    try:
      zipball = ZipFile(StringIO(blob))
    except Exception:
      # Deliberately not a bare "except:", which would also swallow
      # BaseExceptions such as KeyboardInterrupt or runtime deadline errors.
      logging.warning('zipball "%s" is not a valid zip' % repo_name)
      return None

    names = zipball.namelist()
    if not names:
      logging.warning('zipball "%s" is empty' % repo_name)
      return None

    name_prefix = None  # probably 'zipball'
    paths = []
    for name in names:
      if '/' not in name:
        # Without this check the split below would raise ValueError rather
        # than reporting a malformed zipball to the caller.
        logging.warning('zipball "%s" has a name with no prefix: %s' %
                        (repo_name, name))
        return None
      prefix, path = name.split('/', 1)
      # Compare against None (not truthiness) so that an empty first prefix
      # is still checked for consistency.
      if name_prefix is not None and prefix != name_prefix:
        logging.warning('zipball "%s" has names with inconsistent prefix: %s' %
                        (repo_name, names))
        return None
      name_prefix = prefix
      paths.append(path)
    return cls(zipball, name_prefix, paths)

  def __init__(self, zipball, name_prefix, paths):
    '''|zipball| is the underlying ZipFile, |name_prefix| the top-level
    directory shared by all of its entries, and |paths| the entry names with
    that prefix (and the following '/') removed.
    '''
    self._zipball = zipball
    self._name_prefix = name_prefix
    self._paths = paths

  def Paths(self):
    '''Return all file paths in this zip file.
    '''
    return self._paths

  def List(self, path):
    '''Returns all files within a directory at |path|. Not recursive. Paths
    are returned relative to |path|.
    '''
    AssertIsDirectory(path)
    # A direct child has no '/' in its remainder other than the trailing one
    # that marks a subdirectory.
    return [p[len(path):] for p in self._paths
            if p != path and
               p.startswith(path) and
               '/' not in p[len(path):].rstrip('/')]

  def Read(self, path):
    '''Returns the contents of |path|. Raises a KeyError if it doesn't exist.
    '''
    return self._zipball.read(posixpath.join(self._name_prefix, path))


class GithubFileSystem(FileSystem):
  '''Allows reading from a github.com repository.
  '''
  @staticmethod
  def Create(owner, repo, object_store_creator):
    '''Creates a GithubFileSystem that corresponds to a single github repository
    specified by |owner| and |repo|.
    '''
    return GithubFileSystem(
        url_constants.GITHUB_REPOS,
        owner,
        repo,
        object_store_creator,
        AppEngineUrlFetcher)

  @staticmethod
  def ForTest(repo, fake_fetcher, path=None, object_store_creator=None):
    '''Creates a GithubFileSystem that can be used for testing. It reads zip
    files and commit data from server2/test_data/github_file_system/test_owner
    instead of github.com. It reads from files specified by |repo|.
    '''
    return GithubFileSystem(
        path if path is not None else 'test_data/github_file_system',
        'test_owner',
        repo,
        object_store_creator or ObjectStoreCreator.ForTest(),
        fake_fetcher)

  def __init__(self, base_url, owner, repo, object_store_creator, Fetcher):
    '''|base_url|, |owner| and |repo| are joined to form the repository URL.
    |object_store_creator| supplies the credential, up-to-date and stat
    stores. |Fetcher| is called with the repository URL and must return an
    object providing Fetch() and FetchAsync().
    '''
    self._repo_key = posixpath.join(owner, repo)
    self._repo_url = posixpath.join(base_url, owner, repo)
    self._username, self._password = _LoadCredentials(object_store_creator)
    self._blobstore = blobstore.AppEngineBlobstore()
    self._fetcher = Fetcher(self._repo_url)
    # Stores whether the github is up-to-date. This will either be True or
    # empty, the emptiness most likely due to this being a cron run.
    self._up_to_date_cache = object_store_creator.Create(
        GithubFileSystem, category='up-to-date')
    # Caches the zip file's stat. Overrides start_empty to False and uses
    # |self._up_to_date_cache| to determine whether we need to refresh.
    self._stat_cache = object_store_creator.Create(
        GithubFileSystem, category='stat-cache', start_empty=False)

    # Created lazily in |_EnsureRepoZip|.
    self._repo_zip = None

  def _EnsureRepoZip(self):
    '''Initializes |self._repo_zip| if it hasn't already been (i.e. if
    _EnsureRepoZip has never been called before). In that case |self._repo_zip|
    will be set to a Future of _GithubZipFile and the fetch process started,
    whether that be from a blobstore or if necessary all the way from GitHub.
    '''
    if self._repo_zip is not None:
      return

    repo_key, repo_url, username, password = (
        self._repo_key, self._repo_url, self._username, self._password)

    def fetch_from_blobstore(version):
      '''Returns a Future which resolves to the _GithubZipFile for this repo
      fetched from blobstore. Falls back to fetching from GitHub if the blob
      is missing or cannot be parsed.
      '''
      blob = None
      try:
        blob = self._blobstore.Get(repo_url, _GITHUB_REPOS_NAMESPACE)
      except blobstore.BlobNotFoundError:
        # Handled below together with a None result.
        pass

      if blob is None:
        logging.warning('No blob for %s found in datastore' % repo_key)
        return fetch_from_github(version)

      repo_zip = _GithubZipFile.Create(repo_key, blob)
      if repo_zip is None:
        logging.warning('Blob for %s was corrupted in blobstore!?' % repo_key)
        return fetch_from_github(version)

      return Future(value=repo_zip)

    def fetch_from_github(version):
      '''Returns a Future which resolves to the _GithubZipFile for this repo
      fetched new from GitHub, then writes it to blobstore and |version| to the
      stat caches.
      '''
      def get_zip(github_zip):
        try:
          blob = github_zip.content
        except urlfetch.DownloadError:
          raise FileSystemError('Failed to download repo %s file from %s' %
                                (repo_key, repo_url))

        repo_zip = _GithubZipFile.Create(repo_key, blob)
        if repo_zip is None:
          raise FileSystemError('Blob for %s was fetched corrupted from %s' %
                                (repo_key, repo_url))

        # Only persist the blob and mark the repo up to date once the
        # download is known to be a usable zipball.
        self._blobstore.Set(repo_url, blob, _GITHUB_REPOS_NAMESPACE)
        self._up_to_date_cache.Set(repo_key, True)
        self._stat_cache.Set(repo_key, version)
        return repo_zip
      return self._fetcher.FetchAsync(
          'zipball', username=username, password=password).Then(get_zip)

    # To decide whether we need to re-stat, and from there whether to re-fetch,
    # make use of ObjectStore's start-empty configuration. If
    # |object_store_creator| is configured to start empty then our creator
    # wants to refresh (e.g. running a cron), so fetch the live stat from
    # GitHub. If the stat hasn't changed since last time then no reason to
    # re-fetch from GitHub, just take from blobstore.

    cached_version = self._stat_cache.Get(repo_key).Get()
    if self._up_to_date_cache.Get(repo_key).Get() is None:
      # This is either a cron or an instance where a cron has never been run.
      live_version = self._FetchLiveVersion(username, password)
      if cached_version != live_version:
        # Note: branch intentionally triggered if |cached_version| is None.
        logging.info('%s has changed, fetching from GitHub.' % repo_url)
        self._repo_zip = fetch_from_github(live_version)
      else:
        # Already up to date. Fetch from blobstore. No need to set up-to-date
        # to True here since it'll already be set for instances, and it'll
        # never be set for crons.
        logging.info('%s is up to date.' % repo_url)
        self._repo_zip = fetch_from_blobstore(cached_version)
    else:
      # Instance where cron has been run. It should be in blobstore.
      self._repo_zip = fetch_from_blobstore(cached_version)

    assert self._repo_zip is not None

  def _FetchLiveVersion(self, username, password):
    '''Fetches the current repository version from github.com and returns it.
    The version is a 'sha' hash value. Raises a FileSystemError if the
    response is not JSON or has no 'sha' key.
    '''
    # TODO(kalman): Do this asynchronously (use FetchAsync).
    result = self._fetcher.Fetch(
        'commits/HEAD', username=username, password=password)

    try:
      return json.loads(result.content)['sha']
    except (KeyError, ValueError):
      raise FileSystemError('Error parsing JSON from repo %s: %s' %
                            (self._repo_url, traceback.format_exc()))

  def Refresh(self):
    '''Refreshes this file system by reading the root directory, which in
    turn ensures that the repository zip has been loaded.
    '''
    return self.ReadSingle('')

  def Read(self, paths, skip_not_found=False):
    '''Returns a Future of a dictionary mapping |paths| to the contents of the
    file at each path. If path ends with a '/', it is treated as a directory
    and is mapped to a list of filenames in that directory.

    If a path is not in the repository: when |skip_not_found| is True it is
    left out of the result, otherwise a FileNotFoundError is raised when the
    Future is resolved.
    '''
    self._EnsureRepoZip()
    def read(repo_zip):
      # Build the lookup set once rather than scanning the list per path.
      known_paths = frozenset(repo_zip.Paths())
      reads = {}
      for path in paths:
        if path not in known_paths:
          if skip_not_found:
            continue
          raise FileNotFoundError('"%s": %s not found' % (self._repo_key, path))
        if IsDirectory(path):
          reads[path] = repo_zip.List(path)
        else:
          reads[path] = repo_zip.Read(path)
      return reads
    return self._repo_zip.Then(read)

  def Stat(self, path):
    '''Stats |path| returning its version as as StatInfo object. If |path| ends
    with a '/', it is assumed to be a directory and the StatInfo object returned
    includes child_versions for all paths in the directory.

    File paths do not include the name of the zip file, which is arbitrary and
    useless to consumers.

    The repository is stored as a single zipball, so every path (and every
    child of a directory) reports the same version: the repository version
    held in the stat cache for this repo.

    Raises a FileNotFoundError if |path| is not in the repository.
    '''
    self._EnsureRepoZip()
    repo_zip = self._repo_zip.Get()

    if path not in repo_zip.Paths():
      raise FileNotFoundError('"%s" does not contain file "%s"' %
                              (self._repo_key, path))

    version = self._stat_cache.Get(self._repo_key).Get()
    assert version is not None, ('There was a zipball in datastore; there '
                                 'should be a version cached for it')

    stat_info = StatInfo(version)
    if IsDirectory(path):
      stat_info.child_versions = dict((p, StatInfo(version))
                                      for p in repo_zip.List(path))
    return stat_info

  def GetIdentity(self):
    '''Returns a string identifying this file system, derived from the class
    name and the owner/repo key.
    '''
    return '%s' % StringIdentity(self.__class__.__name__ + self._repo_key)

  def __repr__(self):
    return '%s(key=%s, url=%s)' % (type(self).__name__,
                                   self._repo_key,
                                   self._repo_url)