# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface


# This number is 6 because each linediff page in src.chromium.org should
# contain the following tables: table with revision number, table with actual
# diff, table with dropdown menu, table with legend, a border table and a table
# containing page information.
NUM_TABLES_IN_LINEDIFF_PAGE = 6
# Each of the linediff info should contain 3 tds, one for changed line number,
# and two for line contents before/after.
NUM_TDS_IN_LINEDIFF_PAGE = 3


class SVNParser(ParserInterface):
  """Parser for SVN repository using chromium.org, for components in config.

  Attributes:
    url_map: A map from component to the urls, where urls are for changelog,
        revision, line diff and annotation.
  """

  def __init__(self, url_map):
    self.component_to_urls_map = url_map

  def ParseChangelog(self, component, range_start, range_end):
    """Parses the SVN changelog XML for a component over a revision range.

    Args:
      component: Component name used to look up the url templates in
          self.component_to_urls_map.
      range_start: First revision of the range (inclusive).
      range_end: Last revision of the range (inclusive).

    Returns:
      A tuple (revision_map, file_to_revision_map), where revision_map maps
      revision number -> {'author', 'message', 'url'} and file_to_revision_map
      maps file path -> list of (revision number, file change type) tuples.
      Both maps are empty if the component is unknown, the fetch fails, or the
      response is not well-formed XML.
    """
    file_to_revision_map = {}
    revision_map = {}

    # Check if the current component is supported by reading the components
    # parsed from config file. If it is not, fail.
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (revision_map, file_to_revision_map)

    # Retrieve data from the url, return empty map if fails.
    revision_range_str = '%s:%s' % (range_start, range_end)
    url = url_map['changelog_url'] % revision_range_str
    response = crash_utils.GetDataFromURL(url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, return empty map.
    try:
      xml_revisions = minidom.parseString(response)
    except ExpatError:
      return (revision_map, file_to_revision_map)

    # Iterate through the returned XML object; each CL is one <logentry>.
    revisions = xml_revisions.getElementsByTagName('logentry')
    for revision in revisions:
      # Create new revision object for each of the revision.
      revision_object = {}

      # Set author of the CL.
      revision_object['author'] = revision.getElementsByTagName(
          'author')[0].firstChild.nodeValue

      # Get the revision number from xml.
      revision_number = int(revision.getAttribute('revision'))

      # Iterate through the changed paths in the CL.
      paths = revision.getElementsByTagName('paths')
      if paths:
        for changed_path in paths[0].getElementsByTagName('path'):
          # Get path and file change type from the xml.
          file_path = changed_path.firstChild.nodeValue
          file_change_type = changed_path.getAttribute('action')

          # Normalize away the '/trunk/' prefix so paths match crash data.
          if file_path.startswith('/trunk/'):
            file_path = file_path[len('/trunk/'):]

          # Add the file to the map.
          if file_path not in file_to_revision_map:
            file_to_revision_map[file_path] = []
          file_to_revision_map[file_path].append(
              (revision_number, file_change_type))

      # Set commit message of the CL.
      revision_object['message'] = revision.getElementsByTagName('msg')[
          0].firstChild.nodeValue

      # Set url of this CL.
      revision_url = url_map['revision_url'] % revision_number
      revision_object['url'] = revision_url

      # Add this CL to the revision map.
      revision_map[revision_number] = revision_object

    return (revision_map, file_to_revision_map)

  def ParseLineDiff(self, path, component, file_change_type, revision_number):
    """Parses the ViewVC line-diff page for one file in one revision.

    Args:
      path: Path of the file within the repository.
      component: Component name used to look up url templates.
      file_change_type: SVN change action for the file ('A', 'M', 'D', ...).
      revision_number: Revision to diff against its predecessor.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents). On any
      failure the revision url is returned with empty lists; (None, None,
      None) if the component is unknown.
    """
    changed_line_numbers = []
    changed_line_contents = []

    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (None, None, None)

    # If the file is added (not modified), treat it as if it is not changed.
    backup_url = url_map['revision_url'] % revision_number
    if file_change_type == 'A':
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve data from the url. If no data is retrieved, return empty lists.
    url = url_map['diff_url'] % (path, revision_number - 1,
                                 revision_number, revision_number)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Parse the returned page. ViewVC serves HTML, which is not guaranteed to
    # be well-formed XML, so fail gracefully like ParseChangelog does rather
    # than letting ExpatError propagate.
    try:
      line_diff_html = minidom.parseString(data)
    except ExpatError:
      return (backup_url, changed_line_numbers, changed_line_contents)

    tables = line_diff_html.getElementsByTagName('table')
    # If there are not NUM_TABLES tables in the html page, there should be an
    # error in the html page.
    if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Diff content is in the second table. Each line of the diff content
    # is in <tr>.
    trs = tables[1].getElementsByTagName('tr')
    prefix_len = len('vc_diff_')

    # Filter trs so that it only contains diff chunk with contents.
    filtered_trs = []
    for tr in trs:
      tr_class = tr.getAttribute('class')

      # Check for the classes of the <tr>s; they are prefixed with 'vc_diff_'.
      if tr_class:
        tr_class = tr_class[prefix_len:]

        # Do not have to add header.
        if tr_class == 'header' or tr_class == 'chunk_header':
          continue

        # If the class of tr is empty, this page does not have any change.
        if tr_class == 'empty':
          return (backup_url, changed_line_numbers, changed_line_contents)

      filtered_trs.append(tr)

    # Iterate through filtered trs, and grab line diff information.
    for tr in filtered_trs:
      tds = tr.getElementsByTagName('td')

      # If there aren't 3 tds, this line should not contain line diff.
      if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
        continue

      # If line number information is not in hyperlink, ignore this line.
      try:
        line_num = tds[0].getElementsByTagName('a')[0].firstChild.nodeValue
        left_diff_type = tds[1].getAttribute('class')[prefix_len:]
        right_diff_type = tds[2].getAttribute('class')[prefix_len:]
      except IndexError:
        continue

      # Treat the line as modified only if both left and right diff has type
      # changed or both have different change type, and if the change is not
      # deletion.
      if (left_diff_type != right_diff_type) or (
          left_diff_type == 'change' and right_diff_type == 'change'):

        # Check if the line content is not empty.
        try:
          new_line = tds[2].firstChild.nodeValue
        except AttributeError:
          new_line = ''

        if not (left_diff_type == 'remove' and right_diff_type == 'empty'):
          changed_line_numbers.append(int(line_num))
          changed_line_contents.append(new_line.strip())

    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    """Parses the ViewVC annotate (blame) page for one line of a file.

    Args:
      component: Component name used to look up url templates.
      file_path: Path of the file within the repository.
      line: 1-based line number to look up; doubles as the <tr> index on the
          annotate page, which has one header row before the content rows.
      revision: Revision at which to annotate the file.

    Returns:
      A tuple (line_content, revision, author, revision_url, message), or
      None on any failure.
    """
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return None

    # Retrieve blame data from url, return None if fails.
    url = url_map['blame_url'] % (file_path, revision, revision)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return None

    # As in ParseLineDiff, the page is HTML and may not be well-formed XML.
    try:
      blame_html = minidom.parseString(data)
    except ExpatError:
      return None

    title = blame_html.getElementsByTagName('title')
    # If the page has no title, or is an exception page, return None.
    if not title or not title[0].firstChild:
      return None
    if title[0].firstChild.nodeValue == 'ViewVC Exception':
      return None

    # Each of the blame result is in <tr>.
    blame_results = blame_html.getElementsByTagName('tr')
    try:
      blame_result = blame_results[line]
    except IndexError:
      return None

    # There must be 4 <td> for each <tr>. If not, this page is wrong.
    tds = blame_result.getElementsByTagName('td')
    if len(tds) != 4:
      return None

    # The third <td> has the line content, separated by <span>s. Combine
    # those to get a string of changed line. If it has nothing, the line
    # is empty.
    line_content = ''
    if tds[3].hasChildNodes():
      contents = tds[3].childNodes

      for content in contents:
        # Nodetype 3 means it is text node.
        if content.nodeType == minidom.Node.TEXT_NODE:
          line_content += content.nodeValue
        else:
          line_content += content.firstChild.nodeValue

      line_content = line_content.strip()

    # If the current line has the same author/revision as the previous lines,
    # the result is not shown. Propagate up until we find the line with info.
    # Stop at row 0 (the header row): going below it would wrap around via
    # Python's negative indexing and read rows from the END of the table.
    while not tds[1].firstChild and line > 0:
      line -= 1
      blame_result = blame_results[line]
      tds = blame_result.getElementsByTagName('td')
    if not tds[1].firstChild:
      # Reached the top without finding author info; the page is malformed.
      return None
    author = tds[1].firstChild.nodeValue

    # Revision can either be in hyperlink or plain text.
    try:
      revision = tds[2].getElementsByTagName('a')[0].firstChild.nodeValue
    except IndexError:
      revision = tds[2].firstChild.nodeValue

    # ParseChangelog legitimately returns an empty map on fetch/parse failure,
    # so guard the lookup instead of raising KeyError.
    (revision_info, _) = self.ParseChangelog(component, revision, revision)
    revision_object = revision_info.get(int(revision))
    if not revision_object:
      return None
    message = revision_object['message']

    # Return the parsed information.
    revision_url = url_map['revision_url'] % int(revision)
    return (line_content, revision, author, revision_url, message)