1# Copyright (c) 2014 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import xml.dom.minidom as minidom
6from xml.parsers.expat import ExpatError
7
8import crash_utils
9from repository_parser_interface import ParserInterface
10
11
# This number is 6 because each linediff page in src.chromium.org should
# contain the following tables: table with revision number, table with actual
# diff, table with dropdown menu, table with legend, a border table and a table
# containing page information.
NUM_TABLES_IN_LINEDIFF_PAGE = 6
# Each row of linediff info should contain 3 <td>s: one for the changed line
# number, and two for the line contents before/after the change.
NUM_TDS_IN_LINEDIFF_PAGE = 3
20
21
class SVNParser(ParserInterface):
  """Parser for SVN repository using chromium.org, for components in config.

  Attributes:
    component_to_urls_map: A map from component name to its url templates
        (changelog, revision, line diff and blame/annotation urls).
  """

  def __init__(self, url_map):
    self.component_to_urls_map = url_map

  def ParseChangelog(self, component, range_start, range_end):
    """Parses the changelog of a component for the given revision range.

    Args:
      component: Name of the component, as parsed from the config file.
      range_start: Start of the revision range, inclusive.
      range_end: End of the revision range, inclusive.

    Returns:
      A tuple (revision_map, file_to_revision_map). revision_map maps a
      revision number to a dict with 'author', 'message' and 'url' keys.
      file_to_revision_map maps a changed file path to a list of
      (revision_number, file_change_type) tuples. Both maps are empty when
      the component is unsupported, the fetch fails or the response is not
      well-formed XML.
    """
    file_to_revision_map = {}
    revision_map = {}

    # Check if the current component is supported by reading the components
    # parsed from config file. If it is not, fail.
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (revision_map, file_to_revision_map)

    # Retrieve data from the url, return empty map if fails.
    revision_range_str = '%s:%s' % (range_start, range_end)
    url = url_map['changelog_url'] % revision_range_str
    response = crash_utils.GetDataFromURL(url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, return empty map.
    try:
      xml_revisions = minidom.parseString(response)
    except ExpatError:
      return (revision_map, file_to_revision_map)

    # Iterate through the returned XML object.
    revisions = xml_revisions.getElementsByTagName('logentry')
    for revision in revisions:
      # Create new revision object for each of the revision.
      revision_object = {}

      # Set author of the CL.
      revision_object['author'] = revision.getElementsByTagName(
          'author')[0].firstChild.nodeValue

      # Get the revision number from xml.
      revision_number = int(revision.getAttribute('revision'))

      # Iterate through the changed paths in the CL.
      paths = revision.getElementsByTagName('paths')
      if paths:
        for changed_path in paths[0].getElementsByTagName('path'):
          # Get path and file change type from the xml.
          file_path = changed_path.firstChild.nodeValue
          file_change_type = changed_path.getAttribute('action')

          # Normalize the path by stripping the SVN '/trunk/' prefix.
          if file_path.startswith('/trunk/'):
            file_path = file_path[len('/trunk/'):]

          # Add file to the map.
          file_to_revision_map.setdefault(file_path, []).append(
              (revision_number, file_change_type))

      # Set commit message of the CL.
      revision_object['message'] = revision.getElementsByTagName('msg')[
          0].firstChild.nodeValue

      # Set url of this CL.
      revision_object['url'] = url_map['revision_url'] % revision_number

      # Add this CL to the revision map.
      revision_map[revision_number] = revision_object

    return (revision_map, file_to_revision_map)

  def ParseLineDiff(self, path, component, file_change_type, revision_number):
    """Parses the line diff of a file changed in the given revision.

    Args:
      path: Path of the changed file, relative to trunk.
      component: Name of the component the file belongs to.
      file_change_type: SVN change action, e.g. 'A' (added) or 'M' (modified).
      revision_number: Revision in which the file was changed.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents). The lists
      are empty when the file was added, the fetch fails or the diff page
      cannot be parsed, in which case the revision url is returned instead of
      the diff url. Returns (None, None, None) for unsupported components.
    """
    changed_line_numbers = []
    changed_line_contents = []

    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (None, None, None)

    # If the file is added (not modified), treat it as if it is not changed.
    backup_url = url_map['revision_url'] % revision_number
    if file_change_type == 'A':
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve data from the url. If no data is retrieved, return empty lists.
    url = url_map['diff_url'] % (path, revision_number - 1,
                                 revision_number, revision_number)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Parse the diff page. If it is not well-formed XML, fall back to the
    # revision url, consistent with how ParseChangelog handles parse failures.
    try:
      line_diff_html = minidom.parseString(data)
    except ExpatError:
      return (backup_url, changed_line_numbers, changed_line_contents)

    tables = line_diff_html.getElementsByTagName('table')
    # If there are not NUM_TABLES tables in the html page, there should be an
    # error in the html page.
    if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Diff content is in the second table. Each line of the diff content
    # is in <tr>.
    trs = tables[1].getElementsByTagName('tr')
    prefix_len = len('vc_diff_')

    # Filter trs so that it only contains diff chunk with contents.
    filtered_trs = []
    for tr in trs:
      tr_class = tr.getAttribute('class')

      # Check for the classes of the <tr>s.
      if tr_class:
        tr_class = tr_class[prefix_len:]

        # Do not have to add header.
        if tr_class == 'header' or tr_class == 'chunk_header':
          continue

        # If the class of tr is empty, this page does not have any change.
        if tr_class == 'empty':
          return (backup_url, changed_line_numbers, changed_line_contents)

      filtered_trs.append(tr)

    # Iterate through filtered trs, and grab line diff information.
    for tr in filtered_trs:
      tds = tr.getElementsByTagName('td')

      # If there aren't 3 tds, this line does not contain line diff.
      if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
        continue

      # If line number information is not in hyperlink, ignore this line.
      try:
        line_num = tds[0].getElementsByTagName('a')[0].firstChild.nodeValue
        left_diff_type = tds[1].getAttribute('class')[prefix_len:]
        right_diff_type = tds[2].getAttribute('class')[prefix_len:]
      except IndexError:
        continue

      # Treat the line as modified only if both left and right diff has type
      # changed or both have different change type, and if the change is not
      # deletion.
      if (left_diff_type != right_diff_type) or (
          left_diff_type == 'change' and right_diff_type == 'change'):

        # Check if the line content is not empty.
        try:
          new_line = tds[2].firstChild.nodeValue
        except AttributeError:
          new_line = ''

        if not (left_diff_type == 'remove' and right_diff_type == 'empty'):
          changed_line_numbers.append(int(line_num))
          changed_line_contents.append(new_line.strip())

    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    """Parses blame (annotation) information for a line of a file.

    Args:
      component: Name of the component the file belongs to.
      file_path: Path of the file to annotate.
      line: 1-based line number, used as an index into the blame table rows.
      revision: Revision at which to annotate the file.

    Returns:
      A tuple (line_content, revision, author, revision_url, message), or
      None on any failure (unsupported component, fetch failure, malformed
      or exception page).
    """
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return None

    # Retrieve blame data from url, return None if fails.
    url = url_map['blame_url'] % (file_path, revision, revision)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return None

    # Parse the annotate page. If it is not well-formed XML, return None,
    # consistent with how ParseChangelog handles parse failures.
    try:
      blame_html = minidom.parseString(data)
    except ExpatError:
      return None

    # If the returned html page has no title or is an exception page,
    # return None.
    title = blame_html.getElementsByTagName('title')
    if not title or not title[0].firstChild:
      return None
    if title[0].firstChild.nodeValue == 'ViewVC Exception':
      return None

    # Each of the blame result is in <tr>.
    blame_results = blame_html.getElementsByTagName('tr')
    try:
      blame_result = blame_results[line]
    except IndexError:
      return None

    # There must be 4 <td> for each <tr>. If not, this page is wrong.
    tds = blame_result.getElementsByTagName('td')
    if len(tds) != 4:
      return None

    # The last <td> has the line content, separated by <span>s. Combine
    # those to get a string of changed line. If it has nothing, the line
    # is empty.
    line_content = ''
    if tds[3].hasChildNodes():
      contents = tds[3].childNodes

      for content in contents:
        # Text nodes contribute their text directly; element nodes (spans)
        # contribute the text of their first child.
        if content.nodeType == minidom.Node.TEXT_NODE:
          line_content += content.nodeValue
        else:
          line_content += content.firstChild.nodeValue

      line_content = line_content.strip()

    # If the current line has the same author/revision as the previous lines,
    # the result is not shown. Propagate up until we find the line with info,
    # bailing out if we run off the top of the table (negative indices would
    # silently wrap around in Python).
    while not tds[1].firstChild:
      line -= 1
      if line < 0:
        return None
      blame_result = blame_results[line]
      tds = blame_result.getElementsByTagName('td')
    author = tds[1].firstChild.nodeValue

    # Revision can either be in hyperlink or plain text.
    try:
      revision = tds[2].getElementsByTagName('a')[0].firstChild.nodeValue
    except IndexError:
      revision = tds[2].firstChild.nodeValue

    # Fetch the commit message of the blamed revision. The revision may be
    # missing from the result if the changelog fetch failed.
    (revision_info, _) = self.ParseChangelog(component, revision, revision)
    revision_object = revision_info.get(int(revision))
    if not revision_object:
      return None
    message = revision_object['message']

    # Return the parsed information.
    revision_url = url_map['revision_url'] % int(revision)
    return (line_content, revision, author, revision_url, message)
251