subversion_file_system.py revision c2e0dbddbe15c98d52c4786dac06cb8952a8ae6d
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from file_system import FileSystem, FileNotFoundError, StatInfo, ToUnicode
from future import Future
import logging
import re
import posixpath
import xml.dom.minidom as xml
from xml.parsers.expat import ExpatError

class _AsyncFetchFuture(object):
  """Future delegate that resolves a batch of asynchronous fetches.

  One fetch is started per path at construction time via
  |fetcher.FetchAsync|; Get() then resolves them in order and converts each
  response according to the kind of path that was requested.
  """
  def __init__(self, paths, fetcher, binary):
    # A list of tuples of the form (path, Future).
    self._fetches = [(path, fetcher.FetchAsync(path)) for path in paths]
    self._value = {}
    # Never assigned anywhere in this class; kept so that Get() still honours
    # an error stored on the instance from outside.
    self._error = None
    self._binary = binary

  def _ListDir(self, directory):
    """Returns the link texts found in the |directory| listing markup.

    The text of every <a> element is collected, and the first '..' entry (if
    any) is dropped. Each <a> element is expected to have a text node as its
    first child; an element without one raises.
    """
    dom = xml.parseString(directory)
    files = [elem.childNodes[0].data for elem in dom.getElementsByTagName('a')]
    if '..' in files:
      files.remove('..')
    return files

  def Get(self):
    """Resolves every pending fetch and returns a dict of path -> value.

    The value is a list of names for paths ending in '/', the raw content
    when |binary| was requested, and the content passed through ToUnicode
    otherwise.

    Raises FileNotFoundError if a fetch raises or returns a 404.
    """
    for path, future in self._fetches:
      try:
        result = future.Get()
      except Exception as e:
        # Any failure of the underlying fetch is reported as a missing file.
        raise FileNotFoundError(
            'Error when fetching %s for Get: %s' % (path, e))
      if result.status_code == 404:
        raise FileNotFoundError('Got 404 when fetching %s for Get' % path)
      elif path.endswith('/'):
        self._value[path] = self._ListDir(result.content)
      elif not self._binary:
        self._value[path] = ToUnicode(result.content)
      else:
        self._value[path] = result.content
    if self._error is not None:
      raise self._error
    return self._value

class SubversionFileSystem(FileSystem):
  """Class to fetch resources from src.chromium.org.

  |fetcher| is used to read file and directory contents, |stat_fetcher| to
  fetch the viewvc pages that version information is scraped from.
  """
  def __init__(self, fetcher, stat_fetcher):
    self._fetcher = fetcher
    self._stat_fetcher = stat_fetcher

  def Read(self, paths, binary=False):
    """Returns a Future resolving to a dict of path -> content for |paths|.

    The fetches are started immediately; see _AsyncFetchFuture.Get for the
    shape of the values and the errors raised on resolution.
    """
    return Future(delegate=_AsyncFetchFuture(paths, self._fetcher, binary))

  def _ParseHTML(self, html):
    """Unfortunately, the viewvc page has a stray </div> tag, so this takes care
    of all mismatched tags.

    Parses |html|, and each time the parser rejects it, removes the line the
    parser complained about and tries again. If there is no line left to
    remove the last ExpatError is re-raised, rather than retrying forever on
    input that can never parse.
    """
    while True:
      try:
        return xml.parseString(html)
      except ExpatError as e:
        lines = html.split('\n')
        # e.lineno is 1-based, enumerate() is 0-based.
        remaining = [line for (i, line) in enumerate(lines)
                     if e.lineno != i + 1]
        # Stop if nothing was removed (another attempt would fail identically)
        # or if nothing is left (an empty document never parses).
        if not remaining or len(remaining) == len(lines):
          raise
        html = '\n'.join(remaining)

  def _CreateStatInfo(self, html):
    """Builds a StatInfo for a directory from its viewvc listing page |html|.

    The first table that yields at least one (name, version) row is used. The
    directory's own version is the highest version among its children. If no
    table yields any row, a StatInfo with version '0' and no children is
    returned.
    """
    def inner_text(node):
      '''Like node.innerText in JS DOM, but strips surrounding whitespace.
      '''
      text = []
      if node.nodeValue:
        text.append(node.nodeValue)
      if hasattr(node, 'childNodes'):
        for child_node in node.childNodes:
          text.append(inner_text(child_node))
      return ''.join(text).strip()

    dom = self._ParseHTML(html)

    # Try all of the tables until we find the one that contains the data.
    for table in dom.getElementsByTagName('table'):
      # Within the table there is a list of files. However, there may be some
      # things beforehand; a header, "parent directory" list, etc. We will deal
      # with that below by being generous and just ignoring such rows.
      rows = table.getElementsByTagName('tr')
      child_versions = {}

      for row in rows:
        # Within each row there are probably 5 cells; name, version, age,
        # author, and last log entry. Maybe the columns will change; we're at
        # the mercy of viewvc, but this constant can be easily updated.
        elements = row.getElementsByTagName('td')
        if len(elements) != 5:
          continue
        name_element, version_element, _, __, ___ = elements

        name = inner_text(name_element)  # note: will end in / for directories
        try:
          version = int(inner_text(version_element))
        except ValueError:
          # Not a file row (e.g. a header); ignore it.
          continue
        child_versions[name] = version

      if not child_versions:
        continue

      # Parent version is max version of all children, since it's SVN.
      parent_version = max(child_versions.values())

      # All versions in StatInfo need to be strings.
      # items() rather than iteritems() so this runs on Python 2 and 3 alike.
      return StatInfo(str(parent_version),
                      dict((path, str(version))
                           for path, version in child_versions.items()))

    # Bleh, but, this data is so unreliable. There are actually some empty file
    # listings caused by git/svn/something not cleaning up empty dirs.
    return StatInfo('0', {})

  def Stat(self, path):
    """Returns the StatInfo for |path|.

    The listing of the directory containing |path| is fetched and parsed. For
    a |path| ending in '/' the StatInfo of that directory (including its
    children) is returned; otherwise a StatInfo holding only the version of
    the named file is returned.

    Raises FileNotFoundError if the listing fetch returns a 404, or if the
    file is not present in the listing.
    """
    # For 'a/b/' this yields ('a/b', ''), so the directory itself is listed.
    directory, filename = posixpath.split(path)
    result = self._stat_fetcher.Fetch(directory + '/')
    if result.status_code == 404:
      raise FileNotFoundError(
          'Got 404 when fetching %s from %s for Stat' % (path, directory))
    stat_info = self._CreateStatInfo(result.content)
    if path.endswith('/'):
      return stat_info
    if filename not in stat_info.child_versions:
      raise FileNotFoundError('%s was not in child versions' % filename)
    return StatInfo(stat_info.child_versions[filename])