# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import base64
import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface

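# Maps a file action reported by gitiles to the single-letter change type
# used by the SVN parser, so both parsers report file changes uniformly.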
FILE_CHANGE_TYPE_MAP = {
    'add': 'A',
    'copy': 'C',
    'delete': 'D',
    'modify': 'M',
    'rename': 'R'
}


def _ConvertToFileChangeType(file_action):
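  """Converts a file action from gitiles to a single-letter change type.

  For example, 'modify' maps to 'M' and 'delete' maps to 'D', matching the
  notation used by the SVN parser; unmapped actions fall back to their
  capitalized first letter.
  """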
  # TODO(stgao): verify impact on code that checks the file change type.
  return FILE_CHANGE_TYPE_MAP.get(file_action, file_action[0].upper())


class GitParser(ParserInterface):
  """Parser for a Git repository hosted on googlesource.

  Attributes:
    parsed_deps: A map from component path to its repository name, regression,
                 etc.
    url_parts_map: A map from url type to its url parts. These parts are
                   appended to the base url to form different urls.
  """

  def __init__(self, parsed_deps, url_parts_map):
    self.component_to_url_map = parsed_deps
    self.url_parts_map = url_parts_map

  def ParseChangelog(self, component_path, range_start, range_end):
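    """Parses the changelog for the given component and revision range.

    Args:
      component_path: The path of the component to parse the changelog for.
      range_start: Starting revision (git hash) of the regression range.
      range_end: Ending revision (git hash) of the regression range.

    Returns:
      A tuple (revision_map, file_to_revision_map), where revision_map maps
      a git hash to its revision information and file_to_revision_map maps
      a file path to the (git hash, change type) pairs that touched it.
    """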
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url; return empty maps if it fails. The html
    # url is a url from which the changelog can be parsed as html.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, try parsing it
    # as JSON instead.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # The revision information is in the divs from the third to the
    # second-to-last one.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Divs, pres and uls each contain revision information for one CL, so
    # they should all have the same length.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # Iterate through the divs and parse revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create a new revision object for each revision.
      revision = {}

      # There must be three <tr>s. If not, this page is wrong.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve the git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set the author, dropping the trailing '<email>' part.
      author = trs[1].getElementsByTagName(
          'td')[0].firstChild.nodeValue.split('<')[0]
      revision['author'] = author

      # Retrieve and set the message.
      revision['message'] = pre.firstChild.nodeValue

      # Set the url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through the changed files; they are in <li> elements.
      lis = ul.getElementsByTagName('li')
      for li in lis:
        # Retrieve the path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_change_type = li.getElementsByTagName('span')[
            0].getAttribute('class')

        # Normalize the file action so that it matches the SVN parser.
        file_change_type = _ConvertToFileChangeType(file_change_type)

        # Add the changed file to the map.
        if file_path not in file_to_revision_map:
          file_to_revision_map[file_path] = []
        file_to_revision_map[file_path].append((githash, file_change_type))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one revision for the start of the range, because googlesource
    # does not include the start of the range in the changelog.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)

  def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map, file_to_revision_map):
    """Parses the changelog by going over the JSON response.

    Args:
      range_start: Starting range of the regression.
      range_end: Ending range of the regression.
      changelog_url: The url to retrieve the changelog from.
      revision_url: The url to retrieve an individual revision from.
      revision_map: A map from a git hash number to its revision information.
      file_to_revision_map: A map from a file to the git hashes in which it
                            was changed.
    """
    # Compute the URL from the given range and retrieve the changelog.
    # Stop if it fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      return

    # Parse the changelog from the returned string. gitiles prepends the
    # anti-XSSI prefix ")]}'\n" to JSON responses, so skip the first five
    # characters.
    revisions = crash_utils.LoadJSON(response[5:])
    if not revisions:
      return
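
    # The parsed changelog is expected to look roughly like
    # {'log': [{'commit': '<githash>', ...}, ...]}; only the 'commit' field
    # of each entry is used below.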

    # Parse each individual revision in the log.
    for revision in revisions['log']:
      githash = revision['commit']
      self.ParseRevision(revision_url, githash, revision_map,
                         file_to_revision_map)

    # Parse the revision at range_start, because googlesource ignores
    # that one in the changelog.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):
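    """Parses a single revision and adds its information to the maps.

    Args:
      revision_url: The url template to retrieve an individual revision from.
      githash: The git hash of the revision to parse.
      revision_map: A map from a git hash to its revision information.
      file_to_revision_map: A map from a file to the git hashes in which it
                            was changed.
    """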
    # Retrieve data from the URL; return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      return

    # Load the JSON object from the string, skipping the ")]}'\n" prefix.
    # If it fails, terminate the function.
    json_revision = crash_utils.LoadJSON(response[5:])
    if not json_revision:
      return

    # Create a map representing this revision, and get the githash from the
    # JSON object.
    revision = {}
    githash = json_revision['commit']

    # Set the author, message and URL of this CL.
    revision['author'] = json_revision['author']['name']
    revision['message'] = json_revision['message']
    revision['url'] = url

    # Iterate through the changed files.
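    # Each tree_diff entry is expected to look roughly like
    # {'type': 'modify', 'new_path': 'foo/bar.cc', ...}; only these two
    # fields are used here.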
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      file_change_type = diff['type']

      # Normalize the file action so that it fits with svn_repository_parser.
      file_change_type = _ConvertToFileChangeType(file_change_type)

      # Add the file to the map.
      if file_path not in file_to_revision_map:
        file_to_revision_map[file_path] = []
      file_to_revision_map[file_path].append((githash, file_change_type))

    # Add this CL to the map.
    revision_map[githash] = revision

  def ParseLineDiff(self, path, component, file_change_type, githash):
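    """Parses the line diff of a file changed in the given revision.

    Args:
      path: The path of the file to parse the diff of.
      component: The component the file belongs to.
      file_change_type: The single-letter change type, e.g. 'M' or 'A'.
      githash: The git hash of the revision the file was changed in.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents); the line
      lists are empty for added, copied or renamed files.
    """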
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file is added, copied or renamed (not modified), there is no
    # previous version to diff against, so treat it as if it is not changed.
    if file_change_type in ('A', 'C', 'R'):
      # TODO(stgao): Maybe return whole file change for Add, Rename, and Copy?
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from the URL; if it fails, return empty lines.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # The '?format=text' response is base64-encoded; decode it into the
    # lines of the unified diff.
    diff = base64.b64decode(data).splitlines()

    # Iterate through the lines in the diff. current_line stays -1 until we
    # enter the first diff chunk, so we know whether we are inside one.
    current_line = -1
    for line in diff:
      line = line.strip()

      # If the line starts with @@, a new chunk starts.
      if line.startswith('@@'):
        current_line = int(line.split('+')[1].split(',')[0])
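        # For example, the hunk header '@@ -120,6 +121,8 @@' sets
        # current_line to 121, the chunk's start line in the new file.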

      # If we are in a chunk.
      elif current_line != -1:
        # If the line is added or modified, record its number and content.
        # The '+' prefix is a single character, so content starts at index 1.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[1:])

        # Do not increment the current line if the change is a delete.
        if not line.startswith('-'):
          current_line += 1

    # Return the url without the '?format=text' suffix.
    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
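    """Parses the blame information for the given file and line.

    Args:
      component: The component the file belongs to.
      file_path: The path of the file to get the blame information of.
      line: The line number to get the blame information of.
      revision: The revision (git hash) to get the blame information at.

    Returns:
      A tuple (content, revision, author, revision_url, message) for the
      region covering the line, or None if no region covers it.
    """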
    base_url = self.component_to_url_map[component]['repository']

    # Retrieve the blame JSON from googlesource. If it fails, return None.
    url_part = self.url_parts_map['blame_url'] % (revision, file_path)
    blame_url = base_url + url_part
    json_string = crash_utils.GetDataFromURL(blame_url)
    if not json_string:
      return

    # Parse the JSON object from the string. gitiles prepends the anti-XSSI
    # prefix ")]}'\n", so start from the 6th character.
    annotation = crash_utils.LoadJSON(json_string[5:])
    if not annotation:
      return

    # Go through the regions; each region is a run of consecutive lines
    # with the same author/revision.
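    # Each region is expected to look roughly like
    # {'start': 120, 'count': 5, 'commit': '<githash>',
    #  'author': {'name': '<author>'}}.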
    for blame_line in annotation['regions']:
      start = blame_line['start']
      count = blame_line['count']

      # For each region, check whether the line we want the blame info of
      # falls in this region.
      if start <= line <= start + count - 1:
        # If we are in the right region, get the information from the line.
        revision = blame_line['commit']
        author = blame_line['author']['name']
        revision_url_parts = self.url_parts_map['revision_url'] % revision
        revision_url = base_url + revision_url_parts
        # TODO(jeun): Add a way to get the content from the JSON object.
        content = None

        (revision_info, _) = self.ParseChangelog(component, revision, revision)
        message = revision_info[revision]['message']
        return (content, revision, author, revision_url, message)

    # Return None if no region covers the line.
    return None