subversion_file_system.py revision c2e0dbddbe15c98d52c4786dac06cb8952a8ae6d
1# Copyright (c) 2012 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5from file_system import FileSystem, FileNotFoundError, StatInfo, ToUnicode
6from future import Future
7import logging
8import re
9import posixpath
10import xml.dom.minidom as xml
11from xml.parsers.expat import ExpatError
12
class _AsyncFetchFuture(object):
  """Future delegate that resolves a batch of asynchronous fetches.

  One fetch per path is started as soon as the delegate is constructed; Get()
  then waits on each of them and maps every path to its processed content.
  """

  def __init__(self, paths, fetcher, binary):
    """Starts fetching every path in |paths|.

    paths: iterable of paths. Get() treats a path ending in '/' as a
        directory listing.
    fetcher: object whose FetchAsync(path) returns a future; that future's
        Get() must return a result exposing |status_code| and |content|.
    binary: if true, file content is returned exactly as fetched instead of
        being passed through ToUnicode.
    """
    # A list of tuples of the form (path, Future).
    self._fetches = [(path, fetcher.FetchAsync(path)) for path in paths]
    self._value = {}
    # Nothing in this class assigns an error; Get() re-raises it if one is set.
    self._error = None
    self._binary = binary

  def _ListDir(self, directory):
    """Returns the entry names linked from the listing markup |directory|.

    |directory| must parse as XML. The character data at the start of every
    <a> element is taken as an entry name, and the first '..' entry (if any)
    is dropped. Anchors that carry no such data, e.g. an empty <a/>, cannot
    name an entry and are skipped rather than raising.
    """
    dom = xml.parseString(directory)
    files = []
    for anchor in dom.getElementsByTagName('a'):
      # firstChild is None for an empty anchor, and an element child has no
      # |data| attribute; both cases yield None here.
      name = getattr(anchor.firstChild, 'data', None)
      if name is not None:
        files.append(name)
    if '..' in files:
      files.remove('..')
    return files

  def Get(self):
    """Waits for every fetch and returns a dict mapping path -> content.

    Paths ending in '/' map to the list of names from _ListDir(); other paths
    map to the fetched content, converted with ToUnicode unless this future
    was created with binary set.

    Raises FileNotFoundError if a fetch raises or answers with status 404.
    """
    for path, future in self._fetches:
      try:
        result = future.Get()
      except Exception as e:
        # Any failure of the underlying fetch is reported as a missing file.
        raise FileNotFoundError(
            'Error when fetching %s for Get: %s' % (path, e))
      if result.status_code == 404:
        raise FileNotFoundError('Got 404 when fetching %s for Get' % path)
      if path.endswith('/'):
        self._value[path] = self._ListDir(result.content)
      elif self._binary:
        self._value[path] = result.content
      else:
        self._value[path] = ToUnicode(result.content)
    if self._error is not None:
      raise self._error
    return self._value
46
class SubversionFileSystem(FileSystem):
  """Class to fetch resources from src.chromium.org.

  Content is read through |fetcher|; version information is scraped from the
  HTML directory pages returned by |stat_fetcher|.
  """
  def __init__(self, fetcher, stat_fetcher):
    """Stores the two fetchers.

    fetcher: used by Read(); must provide FetchAsync(path).
    stat_fetcher: used by Stat(); must provide Fetch(path) returning a result
        with |status_code| and |content|.
    """
    self._fetcher = fetcher
    self._stat_fetcher = stat_fetcher

  def Read(self, paths, binary=False):
    """Returns a Future resolving to a dict of path -> content for |paths|.

    The fetches are started immediately. If |binary| is false, file content is
    converted with ToUnicode.
    """
    return Future(delegate=_AsyncFetchFuture(paths, self._fetcher, binary))

  def _ParseHTML(self, html):
    """Unfortunately, the viewvc page has a stray </div> tag, so this takes care
    of all mismatched tags.

    Every time the parser rejects the markup, the line it reports is dropped
    and parsing is retried. Raises ExpatError once dropping lines can no
    longer help (e.g. for an empty or truncated page) instead of retrying
    without end.
    """
    lines = html.split('\n')
    while True:
      try:
        return xml.parseString('\n'.join(lines))
      except ExpatError as e:
        # ExpatError.lineno is 1-based.
        bad_line = e.lineno - 1
        # Give up when the reported line cannot be removed, or when removing
        # it would leave nothing to parse: another attempt could never
        # succeed and would only repeat the same failure.
        if len(lines) <= 1 or not 0 <= bad_line < len(lines):
          raise
        del lines[bad_line]

  def _CreateStatInfo(self, html):
    """Builds a StatInfo from the directory page |html|.

    The first table that yields at least one (name, version) row is used. The
    parent version is the largest child version, and all versions are
    converted to strings. If no table yields any row, StatInfo('0', {}) is
    returned.
    """
    def inner_text(node):
      '''Like node.innerText in JS DOM, but strips surrounding whitespace.
      '''
      text = []
      if node.nodeValue:
        text.append(node.nodeValue)
      if hasattr(node, 'childNodes'):
        for child_node in node.childNodes:
          text.append(inner_text(child_node))
      return ''.join(text).strip()

    dom = self._ParseHTML(html)

    # Try all of the tables until we find the one that contains the data.
    for table in dom.getElementsByTagName('table'):
      # Within the table there is a list of files. However, there may be some
      # things beforehand; a header, "parent directory" list, etc. We will deal
      # with that below by being generous and just ignoring such rows.
      rows = table.getElementsByTagName('tr')
      child_versions = {}

      for row in rows:
        # Within each row there are probably 5 cells; name, version, age,
        # author, and last log entry. Maybe the columns will change; we're at
        # the mercy of viewvc, but this constant can be easily updated.
        elements = row.getElementsByTagName('td')
        if len(elements) != 5:
          continue
        name_element, version_element, _, __, ___ = elements

        name = inner_text(name_element)  # note: will end in / for directories
        try:
          version = int(inner_text(version_element))
        except ValueError:
          # Not a file row (no numeric version cell); ignore it.
          continue
        child_versions[name] = version

      if not child_versions:
        continue

      # Parent version is max version of all children, since it's SVN.
      parent_version = max(child_versions.values())

      # All versions in StatInfo need to be strings.
      # items() rather than iteritems() so this runs on Python 2 and 3 alike.
      return StatInfo(str(parent_version),
                      dict((path, str(version))
                           for path, version in child_versions.items()))

    # Bleh, but, this data is so unreliable. There are actually some empty file
    # listings caused by git/svn/something not cleaning up empty dirs.
    return StatInfo('0', {})

  def Stat(self, path):
    """Returns version information for |path|.

    The page of the directory containing |path| is fetched and parsed. For a
    path ending in '/', the StatInfo of that whole directory is returned. For
    any other path, a StatInfo holding only that file's version is returned.

    Raises FileNotFoundError if the directory page is a 404 or the file is
    not listed on it.
    """
    directory, filename = posixpath.split(path)
    result = self._stat_fetcher.Fetch(directory + '/')
    if result.status_code == 404:
      raise FileNotFoundError(
          'Got 404 when fetching %s from %s for Stat' % (path, directory))
    stat_info = self._CreateStatInfo(result.content)
    if path.endswith('/'):
      return stat_info
    if filename not in stat_info.child_versions:
      raise FileNotFoundError('%s was not in child versions' % filename)
    return StatInfo(stat_info.child_versions[filename])
133