docs/server2/subversion_file_system.py

# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import posixpath
import traceback
import xml.dom.minidom as xml
from xml.parsers.expat import ExpatError

from appengine_url_fetcher import AppEngineUrlFetcher
from appengine_wrappers import IsDownloadError
from docs_server_utils import StringIdentity
from file_system import (
    FileNotFoundError, FileSystem, FileSystemError, StatInfo)
from future import Future
import url_constants


def _ParseHTML(html):
  '''Parses |html| into a minidom document, tolerating mismatched tags.

  Unfortunately, the viewvc page has a stray </div> tag. Whenever expat
  rejects the markup, the line it complained about is dropped and the parse
  is retried, which takes care of all mismatched tags.

  Raises ExpatError if dropping the reported line leaves the text unchanged
  (for example when |html| is empty), because retrying could then never
  succeed.
  '''
  while True:
    try:
      return xml.parseString(html)
    except ExpatError as e:
      lines = html.split('\n')
      # expat reports 1-based line numbers. Slice deletion is a no-op when the
      # reported line lies outside the text.
      del lines[e.lineno - 1:e.lineno]
      trimmed = '\n'.join(lines)
      if trimmed == html:
        # Nothing was removed, so another attempt would fail identically;
        # surface the parse error instead of retrying forever.
        raise
      html = trimmed

def _InnerText(node):
  '''Returns the text of |node| followed by the text of its descendants, in
  the spirit of node.innerText in the JS DOM.

  Surrounding whitespace is stripped from the result. Because the function
  recurses, the text gathered from each child is stripped before it is joined
  with its siblings.
  '''
  own_value = node.nodeValue
  pieces = [own_value] if own_value else []
  if hasattr(node, 'childNodes'):
    pieces.extend(_InnerText(child) for child in node.childNodes)
  return ''.join(pieces).strip()

def _CreateStatInfo(html):
  '''Builds a StatInfo from |html|, the markup of a viewvc directory page.

  The version of the directory itself is taken from the "Directory revision:"
  row, and the version of each child from the 5-cell rows of the listing.
  Child names are used exactly as they appear in the page (directories end in
  '/') and child versions are stored as strings. If no directory revision row
  is found the StatInfo is created with a version of None.

  Raises FileSystemError if the directory revision cell does not contain
  exactly 2 <a> elements, or if a second directory revision row is found.
  Raises ValueError if the directory revision is not an integer.
  '''
  parent_version = None
  child_versions = {}

  # Try all of the tables until we find the ones that contain the data (the
  # directory and file versions are in different tables).
  for table in _ParseHTML(html).getElementsByTagName('table'):
    # Within the table there is a list of files. However, there may be some
    # things beforehand; a header, "parent directory" list, etc. We will deal
    # with that below by being generous and just ignoring such rows.
    for row in table.getElementsByTagName('tr'):
      cells = row.getElementsByTagName('td')

      # The version of the directory will eventually appear in the soup of
      # table rows, like this:
      #
      # <tr>
      #   <td>Directory revision:</td>
      #   <td><a href=... title="Revision 214692">214692</a> (of...)</td>
      # </tr>
      #
      # So look out for that.
      if len(cells) == 2 and _InnerText(cells[0]) == 'Directory revision:':
        links = cells[1].getElementsByTagName('a')
        if len(links) != 2:
          raise FileSystemError('ViewVC assumption invalid: directory '
                                'revision content did not have 2 <a> '
                                ' elements, instead %s' %
                                _InnerText(cells[1]))
        this_parent_version = _InnerText(links[0])
        int(this_parent_version)  # sanity check
        if parent_version is not None:
          # The pieces of the message are joined by implicit concatenation so
          # that % formats the whole message. Joining them with + would apply
          # % to the last piece only and fail with a TypeError.
          raise FileSystemError('There was already a parent version %s, and '
                                'we just found a second at %s' %
                                (parent_version, this_parent_version))
        parent_version = this_parent_version

      # The version of each file is a list of rows with 5 cells: name, version,
      # age, author, and last log entry. Maybe the columns will change; we're
      # at the mercy of viewvc, but this constant can be easily updated.
      if len(cells) != 5:
        continue
      name_element, version_element = cells[:2]

      name = _InnerText(name_element)  # note: will end in / for directories
      try:
        version = int(_InnerText(version_element))
      except ValueError:
        # Not a file row after all (e.g. a header); ignore it.
        continue
      child_versions[name] = str(version)

    if parent_version and child_versions:
      break

  return StatInfo(parent_version, child_versions)

def _GetAsyncFetchCallback(paths, fetcher, args=None, skip_not_found=False):
  '''Starts fetching every path in |paths| with |fetcher| and returns a
  callback which waits for the fetches and collects their results.

  All of the fetches are started immediately, before the callback is invoked.
  If |args| is not None it is appended to each path as a query string.

  The callback returns a dict from path to content. For a path ending in '/'
  the content is parsed as a directory listing and the value is the list of
  entry names with one '..' entry removed; otherwise the value is the fetched
  content unchanged.

  The callback raises FileNotFoundError if a fetch fails with a download error
  or returns a 404, unless |skip_not_found| is set, in which case that path is
  left out of the result. It raises FileSystemError for any other fetch
  failure or status code that is not 200.
  '''
  def with_query(path):
    if args is None:
      return path
    return '%s?%s' % (path, args)

  def parse_listing(listing):
    anchors = xml.parseString(listing).getElementsByTagName('a')
    names = [anchor.childNodes[0].data for anchor in anchors]
    try:
      names.remove('..')
    except ValueError:
      # There was no parent directory entry in this listing.
      pass
    return names

  # Pairs of (path, Future), in the same order as |paths|.
  pending_fetches = [(path, fetcher.FetchAsync(with_query(path)))
                     for path in paths]

  def resolve():
    contents = {}
    for path, pending in pending_fetches:
      try:
        response = pending.Get()
      except Exception as e:
        is_download_error = IsDownloadError(e)
        if is_download_error and skip_not_found:
          continue
        if is_download_error:
          error_class = FileNotFoundError
        else:
          error_class = FileSystemError
        raise error_class('%s fetching %s for Get: %s' %
                          (type(e).__name__, path, traceback.format_exc()))

      status = response.status_code
      if status == 404:
        if skip_not_found:
          continue
        raise FileNotFoundError('Got 404 when fetching %s for Get, content %s' %
                                (path, response.content))
      if status != 200:
        raise FileSystemError('Got %s when fetching %s for Get, content %s' %
                              (status, path, response.content))

      body = response.content
      contents[path] = parse_listing(body) if path.endswith('/') else body
    return contents

  return resolve

class SubversionFileSystem(FileSystem):
  '''Class to fetch resources from src.chromium.org.

  File contents are read with |file_fetcher| and version information is
  scraped from the viewvc pages fetched with |stat_fetcher|. If |revision| is
  not None then both reads and stats are made against that revision.
  '''
  @staticmethod
  def Create(branch='trunk', revision=None):
    '''Creates a SubversionFileSystem for |branch|, which is either 'trunk' or
    the name of a directory under branches/, optionally at |revision|.
    '''
    if branch == 'trunk':
      svn_path = 'trunk/src'
    else:
      svn_path = 'branches/%s/src' % branch
    return SubversionFileSystem(
        AppEngineUrlFetcher('%s/%s' % (url_constants.SVN_URL, svn_path)),
        AppEngineUrlFetcher('%s/%s' % (url_constants.VIEWVC_URL, svn_path)),
        svn_path,
        revision=revision)

  def __init__(self, file_fetcher, stat_fetcher, svn_path, revision=None):
    self._file_fetcher = file_fetcher
    self._stat_fetcher = stat_fetcher
    self._svn_path = svn_path
    self._revision = revision

  def Read(self, paths, skip_not_found=False):
    '''Returns a Future which resolves to a dict from each of |paths| to its
    content; see _GetAsyncFetchCallback for the details of the result and of
    the errors raised when it is resolved.
    '''
    args = None
    if self._revision is not None:
      # |_file_fetcher| gets from svn.chromium.org which uses p= for version.
      args = 'p=%s' % self._revision
    return Future(callback=_GetAsyncFetchCallback(
        paths,
        self._file_fetcher,
        args=args,
        skip_not_found=skip_not_found))

  def Refresh(self):
    return Future(value=())

  def Stat(self, path):
    '''Synchronous version of StatAsync.
    '''
    return self.StatAsync(path).Get()

  def StatAsync(self, path):
    '''Starts fetching the viewvc page of the directory containing |path| and
    returns a Future which resolves to the StatInfo for |path|.

    If |path| is '' or ends in '/' the result is the StatInfo of the directory,
    including the versions of its children. Otherwise the result is a StatInfo
    holding only the version of the file.

    Resolving the Future raises FileNotFoundError if the fetch fails with a
    download error, if the server responds with a 404, or if the file is not
    listed in its directory. It raises FileSystemError if the fetch fails in
    any other way, if the server responds with any other status that is not
    200, or if the version of the directory cannot be found in the page.
    '''
    directory, filename = posixpath.split(path)
    if self._revision is not None:
      # |_stat_fetcher| uses viewvc which uses pathrev= for version.
      directory += '?pathrev=%s' % self._revision

    result_future = self._stat_fetcher.FetchAsync(directory)
    def resolve():
      try:
        result = result_future.Get()
      except Exception as e:
        exc_type = FileNotFoundError if IsDownloadError(e) else FileSystemError
        raise exc_type('%s fetching %s for Stat: %s' %
                       (type(e).__name__, path, traceback.format_exc()))

      if result.status_code == 404:
        raise FileNotFoundError('Got 404 when fetching %s for Stat, '
                                'content %s' % (path, result.content))
      if result.status_code != 200:
        # Only a 404 means that the file is missing. Any other failure is an
        # error of the file system, which is also how Read reports it.
        raise FileSystemError('Got %s when fetching %s for Stat, content %s' %
                              (result.status_code, path, result.content))

      stat_info = _CreateStatInfo(result.content)
      if stat_info.version is None:
        raise FileSystemError('Failed to find version of dir %s' % directory)
      if path == '' or path.endswith('/'):
        return stat_info
      if filename not in stat_info.child_versions:
        raise FileNotFoundError(
            '%s from %s was not in child versions for Stat' % (filename, path))
      return StatInfo(stat_info.child_versions[filename])

    return Future(callback=resolve)

  def GetIdentity(self):
    # NOTE: no revision here, since it would mess up the caching of reads. It
    # probably doesn't matter since all the caching classes will use the result
    # of Stat to decide whether to re-read - and Stat has a ceiling of the
    # revision - so when the revision changes, so might Stat. That is enough.
    return '@'.join((self.__class__.__name__, StringIdentity(self._svn_path)))