# subversion_file_system.py revision e5d81f57cb97b3b6b7fccc9c5610d21eb81db09d
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import posixpath
import traceback
import xml.dom.minidom as xml
from xml.parsers.expat import ExpatError

from appengine_url_fetcher import AppEngineUrlFetcher
from appengine_wrappers import IsDownloadError
from docs_server_utils import StringIdentity
from file_system import (
    FileNotFoundError, FileSystem, FileSystemError, StatInfo)
from future import Future
import url_constants


def _ParseHTML(html):
  '''Parses |html| into a minidom document, tolerating mismatched tags.

  Unfortunately, the viewvc page has a stray </div> tag, so this takes care
  of all mismatched tags: whenever the parser reports an error, the line it
  reports is dropped and parsing is retried.

  Raises ExpatError if dropping the reported line makes no progress (for
  example when |html| is empty), rather than retrying forever.
  '''
  while True:
    try:
      return xml.parseString(html)
    except ExpatError as e:
      # Drop the (1-based) line that the parser complained about.
      repaired = '\n'.join(
          line for (i, line) in enumerate(html.split('\n'))
          if e.lineno != i + 1)
      if repaired == html:
        # Nothing was removed, so retrying would fail in exactly the same way.
        raise
      html = repaired


def _InnerText(node):
  '''Like node.innerText in JS DOM, but strips surrounding whitespace.

  Concatenates |node|'s own nodeValue (if any) with the inner text of each of
  its child nodes, recursively, and strips the result.
  '''
  text = []
  if node.nodeValue:
    text.append(node.nodeValue)
  if hasattr(node, 'childNodes'):
    for child_node in node.childNodes:
      text.append(_InnerText(child_node))
  return ''.join(text).strip()


def _CreateStatInfo(html):
  '''Builds a StatInfo from the HTML of a viewvc directory listing.

  The StatInfo is constructed from the directory's own revision (None if no
  "Directory revision:" row was found) and a dict mapping each child name to
  its revision as a string.

  Raises FileSystemError if the "Directory revision:" row does not look as
  expected, or if more than one such row is found.
  '''
  parent_version = None
  child_versions = {}

  # Try all of the tables until we find the ones that contain the data (the
  # directory and file versions are in different tables).
  for table in _ParseHTML(html).getElementsByTagName('table'):
    # Within the table there is a list of files. However, there may be some
    # things beforehand; a header, "parent directory" list, etc. We will deal
    # with that below by being generous and just ignoring such rows.
    rows = table.getElementsByTagName('tr')

    for row in rows:
      cells = row.getElementsByTagName('td')

      # The version of the directory will eventually appear in the soup of
      # table rows, like this:
      #
      # <tr>
      #   <td>Directory revision:</td>
      #   <td><a href=... title="Revision 214692">214692</a> (of...)</td>
      # </tr>
      #
      # So look out for that.
      if len(cells) == 2 and _InnerText(cells[0]) == 'Directory revision:':
        links = cells[1].getElementsByTagName('a')
        if len(links) != 2:
          raise FileSystemError('ViewVC assumption invalid: directory '
                                'revision content did not have 2 <a> '
                                ' elements, instead %s' %
                                _InnerText(cells[1]))
        this_parent_version = _InnerText(links[0])
        int(this_parent_version)  # sanity check
        if parent_version is not None:
          # The literals are joined implicitly so that % formats the whole
          # message; with '+' it would bind to the last literal only and fail
          # with a TypeError.
          raise FileSystemError('There was already a parent version %s, and '
                                ' we just found a second at %s' %
                                (parent_version, this_parent_version))
        parent_version = this_parent_version

      # The version of each file is a list of rows with 5 cells: name, version,
      # age, author, and last log entry. Maybe the columns will change; we're
      # at the mercy viewvc, but this constant can be easily updated.
      if len(cells) != 5:
        continue
      name_element, version_element, _, __, ___ = cells

      name = _InnerText(name_element)  # note: will end in / for directories
      try:
        version = int(_InnerText(version_element))
      except ValueError:
        # Not a file row after all (e.g. a header); ignore it.
        continue
      child_versions[name] = str(version)

    if parent_version and child_versions:
      break

  return StatInfo(parent_version, child_versions)


def _GetAsyncFetchCallback(paths, fetcher, args=None, skip_not_found=False):
  '''Starts fetching every path in |paths| and returns a resolver callback.

  |fetcher.FetchAsync| is called for every path right away, with '?<args>'
  appended when |args| is not None. The returned zero-argument function waits
  on each fetch and returns a dict mapping path to content: a list of entry
  names for paths ending in '/', the raw response content otherwise.

  When resolving, download errors and 404 responses raise FileNotFoundError
  unless |skip_not_found| is set, in which case that path is left out of the
  result. Any other fetch failure or non-200 status raises FileSystemError.
  '''
  def apply_args(path):
    return path if args is None else '%s?%s' % (path, args)

  def list_dir(directory):
    # The listing is parsed as XML; each <a> element's first child holds the
    # entry name.
    dom = xml.parseString(directory)
    files = [elem.childNodes[0].data for elem in dom.getElementsByTagName('a')]
    if '..' in files:
      files.remove('..')
    return files

  # A list of tuples of the form (path, Future).
  fetches = [(path, fetcher.FetchAsync(apply_args(path))) for path in paths]

  def resolve():
    value = {}
    for path, future in fetches:
      try:
        result = future.Get()
      except Exception as e:
        if skip_not_found and IsDownloadError(e): continue
        exc_type = FileNotFoundError if IsDownloadError(e) else FileSystemError
        raise exc_type('%s fetching %s for Get: %s' %
                       (type(e).__name__, path, traceback.format_exc()))
      if result.status_code == 404:
        if skip_not_found: continue
        raise FileNotFoundError('Got 404 when fetching %s for Get, content %s' %
            (path, result.content))
      if result.status_code != 200:
        raise FileSystemError('Got %s when fetching %s for Get, content %s' %
            (result.status_code, path, result.content))
      if path.endswith('/'):
        value[path] = list_dir(result.content)
      else:
        value[path] = result.content
    return value

  return resolve


class SubversionFileSystem(FileSystem):
  '''Class to fetch resources from src.chromium.org.
  '''
  @staticmethod
  def Create(branch='trunk', revision=None):
    '''Creates a SubversionFileSystem for |branch|, optionally at |revision|.

    'trunk' maps to trunk/src; any other branch maps to branches/<branch>/src.
    File contents are fetched relative to url_constants.SVN_URL and stat
    information relative to url_constants.VIEWVC_URL.
    '''
    if branch == 'trunk':
      svn_path = 'trunk/src'
    else:
      svn_path = 'branches/%s/src' % branch
    return SubversionFileSystem(
        AppEngineUrlFetcher('%s/%s' % (url_constants.SVN_URL, svn_path)),
        AppEngineUrlFetcher('%s/%s' % (url_constants.VIEWVC_URL, svn_path)),
        svn_path,
        revision=revision)

  def __init__(self, file_fetcher, stat_fetcher, svn_path, revision=None):
    # |file_fetcher| is used by Read, |stat_fetcher| by StatAsync.
    self._file_fetcher = file_fetcher
    self._stat_fetcher = stat_fetcher
    self._svn_path = svn_path
    self._revision = revision

  def Read(self, paths, skip_not_found=False):
    '''Returns a Future of a dict mapping each of |paths| to its content.

    The fetches are started immediately; see _GetAsyncFetchCallback for the
    shape of the result and the errors raised when it is resolved.
    '''
    args = None
    if self._revision is not None:
      # |fetcher| gets from svn.chromium.org which uses p= for version.
      args = 'p=%s' % self._revision
    return Future(callback=_GetAsyncFetchCallback(
        paths,
        self._file_fetcher,
        args=args,
        skip_not_found=skip_not_found))

  def Refresh(self):
    '''Returns a Future constructed with an empty tuple as its value.'''
    return Future(value=())

  def Stat(self, path):
    '''Synchronous version of StatAsync.'''
    return self.StatAsync(path).Get()

  def StatAsync(self, path):
    '''Returns a Future of the StatInfo for |path|.

    The listing of the directory containing |path| is fetched from the stat
    fetcher. For '' or a path ending in '/', the directory's StatInfo
    (including child versions) is returned; otherwise a StatInfo holding just
    the version of the named file.

    When resolved, raises FileNotFoundError for download errors, non-200
    responses and files missing from the listing, and FileSystemError for
    other fetch failures or when the directory version cannot be found.
    '''
    directory, filename = posixpath.split(path)
    if self._revision is not None:
      # |stat_fetch| uses viewvc which uses pathrev= for version.
      directory += '?pathrev=%s' % self._revision

    result_future = self._stat_fetcher.FetchAsync(directory)
    def resolve():
      try:
        result = result_future.Get()
      except Exception as e:
        exc_type = FileNotFoundError if IsDownloadError(e) else FileSystemError
        raise exc_type('%s fetching %s for Stat: %s' %
                       (type(e).__name__, path, traceback.format_exc()))

      if result.status_code == 404:
        raise FileNotFoundError('Got 404 when fetching %s for Stat, '
                                'content %s' % (path, result.content))
      # NOTE(review): Read raises FileSystemError for a non-200, non-404
      # response, whereas this raises FileNotFoundError. Confirm that the
      # difference is intended before changing it; callers may rely on it.
      if result.status_code != 200:
        raise FileNotFoundError('Got %s when fetching %s for Stat, content %s' %
            (result.status_code, path, result.content))

      stat_info = _CreateStatInfo(result.content)
      if stat_info.version is None:
        raise FileSystemError('Failed to find version of dir %s' % directory)
      if path == '' or path.endswith('/'):
        return stat_info
      if filename not in stat_info.child_versions:
        raise FileNotFoundError(
            '%s from %s was not in child versions for Stat' % (filename, path))
      return StatInfo(stat_info.child_versions[filename])

    return Future(callback=resolve)

  def GetIdentity(self):
    '''Returns an identity string built from the class name and svn path.'''
    # NOTE: no revision here, since it would mess up the caching of reads. It
    # probably doesn't matter since all the caching classes will use the result
    # of Stat to decide whether to re-read - and Stat has a ceiling of the
    # revision - so when the revision changes, so might Stat. That is enough.
    return '@'.join((self.__class__.__name__, StringIdentity(self._svn_path)))