1# Copyright (c) 2014 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import xml.dom.minidom as minidom
6from xml.parsers.expat import ExpatError
7
8import crash_utils
9from repository_parser_interface import ParserInterface
10
11
# This number is 6 because each linediff page in src.chromium.org should
# contain the following tables: table with revision number, table with actual
# diff, table with dropdown menu, table with legend, a border table and a table
# containing page information.
NUM_TABLES_IN_LINEDIFF_PAGE = 6
# Each row of linediff info should contain 3 <td>s: one for the changed line
# number, and two for the line contents before/after the change.
NUM_TDS_IN_LINEDIFF_PAGE = 3
20
21
class SVNParser(ParserInterface):
  """Parser for SVN repository using chromium.org, for components in config.

  Attributes:
    component_to_urls_map: A map from component name to its url templates
        (changelog, revision, line diff and blame/annotation urls).
  """

  def __init__(self, url_map):
    self.component_to_urls_map = url_map

  def ParseChangelog(self, component, range_start, range_end):
    """Parses the changelog of a component for the given revision range.

    Args:
      component: Name of the component, as parsed from the config file.
      range_start: Start of the revision range, inclusive.
      range_end: End of the revision range, inclusive.

    Returns:
      A tuple (revision_map, file_to_revision_map). revision_map maps a
      revision number to a dict with 'author', 'message' and 'url' keys.
      file_to_revision_map maps a changed file path to a list of
      (revision_number, file_change_type) tuples. Both maps are empty when
      the component is unsupported, the fetch fails or the response is not
      well-formed XML.
    """
    file_to_revision_map = {}
    revision_map = {}

    # Check if the current component is supported by reading the components
    # parsed from config file. If it is not, fail.
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (revision_map, file_to_revision_map)

    # Retrieve data from the url, return empty map if fails.
    revision_range_str = '%s:%s' % (range_start, range_end)
    url = url_map['changelog_url'] % revision_range_str
    response = crash_utils.GetDataFromURL(url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, return empty map.
    try:
      xml_revisions = minidom.parseString(response)
    except ExpatError:
      return (revision_map, file_to_revision_map)

    # Iterate through the returned XML object.
    revisions = xml_revisions.getElementsByTagName('logentry')
    for revision in revisions:
      # Create new revision object for each of the revision.
      revision_object = {}

      # Set author of the CL.
      revision_object['author'] = revision.getElementsByTagName(
          'author')[0].firstChild.nodeValue

      # Get the revision number from xml.
      revision_number = int(revision.getAttribute('revision'))

      # Iterate through the changed paths in the CL.
      paths = revision.getElementsByTagName('paths')
      if paths:
        for changed_path in paths[0].getElementsByTagName('path'):
          # Get path and file change type from the xml.
          file_path = changed_path.firstChild.nodeValue
          file_change_type = changed_path.getAttribute('action')

          # Normalize the path by stripping the SVN '/trunk/' prefix.
          if file_path.startswith('/trunk/'):
            file_path = file_path[len('/trunk/'):]

          # Add file to the map.
          file_to_revision_map.setdefault(file_path, []).append(
              (revision_number, file_change_type))

      # Set commit message of the CL.
      revision_object['message'] = revision.getElementsByTagName('msg')[
          0].firstChild.nodeValue

      # Set url of this CL.
      revision_object['url'] = url_map['revision_url'] % revision_number

      # Add this CL to the revision map.
      revision_map[revision_number] = revision_object

    return (revision_map, file_to_revision_map)

  def ParseLineDiff(self, path, component, file_change_type, revision_number):
    """Parses the line diff of a file changed in the given revision.

    Args:
      path: Path of the changed file, relative to trunk.
      component: Name of the component the file belongs to.
      file_change_type: SVN change action, e.g. 'A' (added) or 'M' (modified).
      revision_number: Revision in which the file was changed.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents). The lists
      are empty when the file was added, the fetch fails or the diff page
      cannot be parsed, in which case the revision url is returned instead of
      the diff url. Returns (None, None, None) for unsupported components.
    """
    changed_line_numbers = []
    changed_line_contents = []

    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (None, None, None)

    # If the file is added (not modified), treat it as if it is not changed.
    backup_url = url_map['revision_url'] % revision_number
    if file_change_type == 'A':
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve data from the url. If no data is retrieved, return empty lists.
    url = url_map['diff_url'] % (path, revision_number - 1,
                                 revision_number, revision_number)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Parse the diff page. If it is not well-formed XML, fall back to the
    # revision url, consistent with how ParseChangelog handles parse failures.
    try:
      line_diff_html = minidom.parseString(data)
    except ExpatError:
      return (backup_url, changed_line_numbers, changed_line_contents)

    tables = line_diff_html.getElementsByTagName('table')
    # If there are not NUM_TABLES tables in the html page, there should be an
    # error in the html page.
    if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Diff content is in the second table. Each line of the diff content
    # is in <tr>.
    trs = tables[1].getElementsByTagName('tr')
    prefix_len = len('vc_diff_')

    # Filter trs so that it only contains diff chunk with contents.
    filtered_trs = []
    for tr in trs:
      tr_class = tr.getAttribute('class')

      # Check for the classes of the <tr>s.
      if tr_class:
        tr_class = tr_class[prefix_len:]

        # Do not have to add header.
        if tr_class == 'header' or tr_class == 'chunk_header':
          continue

        # If the class of tr is empty, this page does not have any change.
        if tr_class == 'empty':
          return (backup_url, changed_line_numbers, changed_line_contents)

      filtered_trs.append(tr)

    # Iterate through filtered trs, and grab line diff information.
    for tr in filtered_trs:
      tds = tr.getElementsByTagName('td')

      # If there aren't 3 tds, this line does not contain line diff.
      if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
        continue

      # If line number information is not in hyperlink, ignore this line.
      try:
        line_num = tds[0].getElementsByTagName('a')[0].firstChild.nodeValue
        left_diff_type = tds[1].getAttribute('class')[prefix_len:]
        right_diff_type = tds[2].getAttribute('class')[prefix_len:]
      except IndexError:
        continue

      # Treat the line as modified only if both left and right diff has type
      # changed or both have different change type, and if the change is not
      # deletion.
      if (left_diff_type != right_diff_type) or (
          left_diff_type == 'change' and right_diff_type == 'change'):

        # Check if the line content is not empty.
        try:
          new_line = tds[2].firstChild.nodeValue
        except AttributeError:
          new_line = ''

        if not (left_diff_type == 'remove' and right_diff_type == 'empty'):
          changed_line_numbers.append(int(line_num))
          changed_line_contents.append(new_line.strip())

    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    """Parses blame (annotation) information for a line of a file.

    Args:
      component: Name of the component the file belongs to.
      file_path: Path of the file to annotate.
      line: 1-based line number, used as an index into the blame table rows.
      revision: Revision at which to annotate the file.

    Returns:
      A tuple (line_content, revision, author, revision_url, message), or
      None on any failure (unsupported component, fetch failure, malformed
      or exception page).
    """
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return None

    # Retrieve blame data from url, return None if fails.
    url = url_map['blame_url'] % (file_path, revision, revision)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return None

    # Parse the annotate page. If it is not well-formed XML, return None,
    # consistent with how ParseChangelog handles parse failures.
    try:
      blame_html = minidom.parseString(data)
    except ExpatError:
      return None

    # If the returned html page has no title or is an exception page,
    # return None.
    title = blame_html.getElementsByTagName('title')
    if not title or not title[0].firstChild:
      return None
    if title[0].firstChild.nodeValue == 'ViewVC Exception':
      return None

    # Each of the blame result is in <tr>.
    blame_results = blame_html.getElementsByTagName('tr')
    try:
      blame_result = blame_results[line]
    except IndexError:
      return None

    # There must be 4 <td> for each <tr>. If not, this page is wrong.
    tds = blame_result.getElementsByTagName('td')
    if len(tds) != 4:
      return None

    # The last <td> has the line content, separated by <span>s. Combine
    # those to get a string of changed line. If it has nothing, the line
    # is empty.
    line_content = ''
    if tds[3].hasChildNodes():
      contents = tds[3].childNodes

      for content in contents:
        # Text nodes contribute their text directly; element nodes (spans)
        # contribute the text of their first child.
        if content.nodeType == minidom.Node.TEXT_NODE:
          line_content += content.nodeValue
        else:
          line_content += content.firstChild.nodeValue

      line_content = line_content.strip()

    # If the current line has the same author/revision as the previous lines,
    # the result is not shown. Propagate up until we find the line with info,
    # bailing out if we run off the top of the table (negative indices would
    # silently wrap around in Python).
    while not tds[1].firstChild:
      line -= 1
      if line < 0:
        return None
      blame_result = blame_results[line]
      tds = blame_result.getElementsByTagName('td')
    author = tds[1].firstChild.nodeValue

    # Revision can either be in hyperlink or plain text.
    try:
      revision = tds[2].getElementsByTagName('a')[0].firstChild.nodeValue
    except IndexError:
      revision = tds[2].firstChild.nodeValue

    # Fetch the commit message of the blamed revision. The revision may be
    # missing from the result if the changelog fetch failed.
    (revision_info, _) = self.ParseChangelog(component, revision, revision)
    revision_object = revision_info.get(int(revision))
    if not revision_object:
      return None
    message = revision_object['message']

    # Return the parsed information.
    revision_url = url_map['revision_url'] % int(revision)
    return (line_content, revision, author, revision_url, message)
251