1#!/usr/bin/python2
2
3# Copyright 2014 Google Inc.
4#
5# Use of this source code is governed by a BSD-style license that can be
6# found in the LICENSE file.
7
8"""Skia's Chromium Codereview Comparison Script.
9
10This script takes two Codereview URLs, looks at the trybot results for
11the two codereviews and compares the results.
12
13Usage:
14  compare_codereview.py CONTROL_URL ROLL_URL
15"""
16
17import collections
18import os
19import re
20import sys
21import urllib2
22import HTMLParser
23
24
class CodeReviewHTMLParser(HTMLParser.HTMLParser):
  """Extracts trybot statuses from a CodeReview web page.

  Use the CodeReviewHTMLParser.parse static function to make use of
  this class.

  Once a <div id="tryjobdiv*"> start tag has been seen, every
  <a class="build-result" status="..." href="..."> anchor is
  recorded: the anchor text becomes the bot name and the (status,
  href) pair becomes its Status.  The end of the div is not tracked,
  so matching anchors anywhere later in the page are recorded too.

  This uses the HTMLParser class because it's the best thing in
  Python's standard library.  We need a little more power than a
  regex.  (Search for "You can't parse [X]HTML with regex." for more
  information.)
  """
  # pylint: disable=I0011,R0904
  @staticmethod
  def parse(url):
    """Fetches and parses a CodeReview web page.

    Args:
      url (string), a codereview URL like this:
        'https://codereview.chromium.org/?????????'.

    Returns:
      A dictionary; the keys are bot_name strings, the values
      are CodeReviewHTMLParser.Status objects.  If the page cannot
      be fetched (urllib2.URLError), a message is written to stderr
      and None is returned instead.
    """
    parser = CodeReviewHTMLParser()
    try:
      response = urllib2.urlopen(url)
      try:
        parser.feed(response.read())
      finally:
        # Release the connection even if reading or parsing fails.
        response.close()
    except urllib2.URLError:
      sys.stderr.write('Error getting %s\n' % (url,))
      return None
    parser.close()
    return parser.statuses

  # namedtuples are like lightweight structs in Python.  The low
  # overhead of a tuple, but the ease of use of an object.
  Status = collections.namedtuple('Status', ['status', 'url'])

  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    # id of the most recent <div id="tryjobdiv*">; None until one
    # has been seen.
    self._id = None
    # status/href attributes of the build-result anchor being read.
    self._status = None
    self._href = None
    # Text accumulated inside the build-result anchor being read.
    self._anchor_data = ''
    # True while we are between <a class="build-result"> and </a>.
    self._currently_parsing_trybotdiv = False
    # statuses is a dictionary of CodeReviewHTMLParser.Status
    self.statuses = {}

  def handle_starttag(self, tag, attrs):
    """Overrides the HTMLParser method; called for every start tag.

    Args:
      tag: (string) lower-cased tag name.
      attrs: list of (name, value) pairs for the tag's attributes.
    """
    attrs = dict(attrs)
    if tag == 'div':
      # We are looking for <div id="tryjobdiv*">.  An attribute
      # written without a value is reported as None, hence `or ''`.
      id_attr = attrs.get('id') or ''
      if id_attr.startswith('tryjobdiv'):
        self._id = id_attr
      return
    if tag != 'a' or not self._id:
      return
    if 'build-result' not in (attrs.get('class') or '').split():
      return
    # We have already seen a <div id="tryjobdiv*"> and this is a
    # link of the form <a class="build-result" href="*">.  Save the
    # (non-standard) status attribute and the URL.
    self._status = attrs.get('status')
    self._href = attrs.get('href')
    # Start saving anchor data from a clean slate.
    self._anchor_data = ''
    self._currently_parsing_trybotdiv = True

  def handle_data(self, data):
    """Overrides the HTMLParser method; called for text nodes.

    Args:
      data: (string) a chunk of text from the document.
    """
    # Save the text inside the <a></a> tags.  Assume <a> tags
    # aren't nested.
    if self._currently_parsing_trybotdiv:
      self._anchor_data += data

  def handle_endtag(self, tag):
    """Overrides the HTMLParser method; called for every end tag.

    Args:
      tag: (string) lower-cased tag name.
    """
    # Key off the "inside a build-result anchor" flag rather than
    # the status: an anchor without a status attribute must still
    # end the capture, otherwise its text (and everything after it)
    # would be glued onto the next bot's name.
    if tag != 'a' or not self._currently_parsing_trybotdiv:
      return
    # We take the accumulated self._anchor_data and save it as
    # the bot name.
    bot = self._anchor_data.strip()
    if bot and self._status:
      # Add to accumulating dictionary.
      self.statuses[bot] = CodeReviewHTMLParser.Status(
          status=self._status, url=self._href)
    # Reset state to search for the next bot.
    self._currently_parsing_trybotdiv = False
    self._anchor_data = ''
    self._status = None
    self._href = None
145
146
class BuilderHTMLParser(HTMLParser.HTMLParser):
  """Extracts failure descriptions from a Trybot web page.

  Use the BuilderHTMLParser.parse static function to make use of
  this class.

  A failure is a <div class="failure result"> found inside a
  (possibly nested) <li>.  All text from that div up to the end of
  the outermost enclosing <li> is collected, tidied, and stored as a
  BuilderHTMLParser.Result together with the href of a link ending
  in '/logs/stdio', if one was seen.

  This uses the HTMLParser class because it's the best thing in
  Python's standard library.  We need a little more power than a
  regex.  (Search for "You can't parse [X]HTML with regex." for more
  information.)
  """
  # pylint: disable=I0011,R0904
  @staticmethod
  def parse(url):
    """Fetches and parses a Trybot web page.

    Args:
      url (string), a trybot result URL.

    Returns:
      An array of BuilderHTMLParser.Results, each a description
      of failure results, along with an optional url.  If the page
      cannot be fetched (urllib2.URLError), a message is written to
      stderr and an empty list is returned.
    """
    parser = BuilderHTMLParser()
    try:
      response = urllib2.urlopen(url)
      try:
        parser.feed(response.read())
      finally:
        # Release the connection even if reading or parsing fails.
        response.close()
    except urllib2.URLError:
      sys.stderr.write('Error getting %s\n' % (url,))
      return []
    parser.close()
    return parser.failure_results

  Result = collections.namedtuple('Result', ['text', 'url'])

  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    # Accumulated BuilderHTMLParser.Result objects.
    self.failure_results = []
    # Current <li> nesting depth.
    self._li_level = 0
    # Text collected for the failure currently being read.
    self._li_data = ''
    # True once a failure div has been seen in the current <li>.
    self._current_failure = False
    # href of the stdio log link for the current failure, if any.
    self._failure_results_url = ''

  @staticmethod
  def _clean_failure_text(text):
    """Tidies the raw text collected for one failure.

    Args:
      text: (string) the concatenated text nodes of the failure.

    Returns:
      The stripped text with a repeated leading word collapsed to a
      single occurrence, anything from 'unexpected flaky' to the end
      of that line removed, and the words 'preamble' and 'stdio'
      removed.
    """
    result = text.strip()
    words = result.split()
    if words:
      # Sometimes, it repeats the same thing multiple times.  The
      # word is escaped because it may contain regex metacharacters
      # (e.g. "c++"), and the lookahead keeps a longer word that
      # merely starts with it ("foo foobar") intact.
      first = re.escape(words[0])
      result = re.sub(r'^%s(?:\s+%s)+(?!\S)' % (first, first),
                      lambda _match: words[0], result)
    result = re.sub(r'unexpected flaky.*', '', result)
    # Remove some extra unnecessary text.
    result = re.sub(r'\bpreamble\b', '', result)
    result = re.sub(r'\bstdio\b', '', result)
    return result

  def handle_starttag(self, tag, attrs):
    """Overrides the HTMLParser method; called for every start tag.

    Args:
      tag: (string) lower-cased tag name.
      attrs: list of (name, value) pairs for the tag's attributes.
    """
    attrs = dict(attrs)
    if tag == 'li':
      # <li> tags can be nested.  So we have to count the
      # nest-level for backing out.
      self._li_level += 1
      return
    if tag == 'div' and attrs.get('class') == 'failure result':
      # We care about this sort of thing:
      # <li>
      #   <li>
      #   <li>
      #     <div class="failure result">...</div>
      #   </li>
      #   </li>
      #   We want this text here.
      # </li>
      if self._li_level > 0:
        self._current_failure = True  # Tells us to keep text.
      return

    if tag == 'a' and self._current_failure:
      # Anchors without an href (or with a valueless one) are
      # simply not log links.
      href = attrs.get('href') or ''
      # Sometimes we want to keep the stdio url.  We always
      # return it, just in case.
      if href.endswith('/logs/stdio'):
        self._failure_results_url = href

  def handle_data(self, data):
    """Overrides the HTMLParser method; called for text nodes.

    Args:
      data: (string) a chunk of text from the document.
    """
    if self._current_failure:
      self._li_data += data

  def handle_endtag(self, tag):
    """Overrides the HTMLParser method; called for every end tag.

    When the outermost <li> closes and a failure was seen inside it,
    the collected text is cleaned up and appended to
    self.failure_results.

    Args:
      tag: (string) lower-cased tag name.
    """
    if tag != 'li' or self._li_level <= 0:
      # Not an </li>, or a stray </li> with no matching <li>.  The
      # latter must not push the depth below zero, or every later
      # failure on the page would be missed.
      return
    self._li_level -= 1
    if self._li_level:
      return  # Still inside an outer <li>; keep collecting.
    if self._current_failure:
      self.failure_results.append(BuilderHTMLParser.Result(
          BuilderHTMLParser._clean_failure_text(self._li_data),
          self._failure_results_url))
    # Reset the state.
    self._current_failure = False
    self._li_data = ''
    self._failure_results_url = ''
281
282
def printer(indent, string):
  """Write `string` to stdout, indented and word-wrapped.

  Each newline-separated line of `string` is wrapped independently to
  68 - 2 * indent columns.  Every output line is prefixed with
  `indent` two-space units; continuation lines produced by wrapping
  get one extra unit.  Lines containing no words produce no output.

  Args:
    indent: (int) indentation level, in units of two spaces.
    string: (string) the text to print.
  """
  def wrap_to(line, columns):
    """Greedily pack the words of `line` into rows of at most
    `columns` characters; return the rows as a list of strings.  A
    single word longer than `columns` gets a row of its own.
    """
    rows = []
    current = ''
    for word in line.split():
      if not current:
        current = word
      elif len(current) + len(word) + 1 <= columns:
        current = current + ' ' + word
      else:
        rows.append(current)
        current = word
    if current:
      rows.append(current)
    return rows

  unit = '  '
  width = 68 - (2 * indent)
  stream = sys.stdout
  for paragraph in string.split('\n'):
    for index, row in enumerate(wrap_to(paragraph, width)):
      # Continuation rows are indented one unit deeper.
      depth = indent + 1 if index > 0 else indent
      stream.write(unit * depth + row + '\n')
  stream.flush()
314
315
def _failure_lines(status, url):
  """Return the (indent, text) lines describing one trybot run.

  Args:
    status: (string) the run's status as shown on the codereview page.
    url: (string) the run's trybot result URL.

  Returns:
    A list of (indent, text) tuples suitable for printer().  Empty
    unless status is 'failure'.
  """
  lines = []
  if status != 'failure':
    return lines
  for result in BuilderHTMLParser.parse(url):
    # Put every *.html name on a line of its own, prefixed with '__'.
    formatted = re.sub(r'(\S*\.html) ', '\n__\\g<1>\n', result.text)
    # Strip runtimes.  Note the match is greedy: everything from the
    # first '(' to the last ')' on a line is removed.
    formatted = re.sub(r'\(.*\)', '', formatted)
    lines.append((2, formatted))
    if 'compile' in result.text or '...and more' in result.text:
      # Point at the log: result.url is joined onto the run URL with
      # its last path component removed.
      lines.append((3, re.sub('/[^/]*$', '/', url) + result.url))
  return lines


def _print_summary(out, all_bots, control, roll):
  """Write the CONTROL/ROLL/DIFF/BOT table for all common bots.

  Args:
    out: file-like object to write to.
    all_bots: iterable of bot names present in both dictionaries.
    control, roll: dictionaries mapping bot name to Status.
  """
  out.write('%11s %11s %4s %s\n\n' %
            ('CONTROL', 'ROLL', 'DIFF', 'BOT'))
  for bot in sorted(all_bots):
    control_status = control[bot].status
    roll_status = roll[bot].status
    if roll_status == 'success':
      diff = ''
    elif control_status == 'success' and roll_status == 'failure':
      diff = '!!!!'  # Passed on control, failed on roll.
    elif 'pending' in control_status or 'pending' in roll_status:
      diff = '....'
    else:
      diff = '****'
    out.write('%11s %11s %4s %s\n' % (
        control_status, roll_status, diff, bot))
  out.write('\n')
  out.flush()


def main(control_url, roll_url, verbosity=1):
  """Compare two Codereview URLs

  For every trybot present on both codereviews, prints the failures
  of the control and of the roll (or a single entry when both failed
  identically), followed by a summary table.

  Args:
    control_url, roll_url: (strings) URL of the format
      https://codereview.chromium.org/?????????

    verbosity: (int) verbose level.  0, 1, or 2.  Above 1, bots that
      succeeded on the roll are listed as OK; above 0, the summary
      table is printed.
  """
  # pylint: disable=I0011,R0914,R0912
  control = CodeReviewHTMLParser.parse(control_url)
  roll = CodeReviewHTMLParser.parse(roll_url)
  if control is None or roll is None:
    # parse() has already reported which URL could not be fetched.
    sys.stderr.write('Error:  could not retrieve trybot results.\n')
    return
  all_bots = set(control) & set(roll)  # Set intersection.
  if not all_bots:
    sys.stderr.write(
        'Error:  control %s and roll %s have no common trybots.\n'
        % (list(control), list(roll)))
    return

  control_name = '[control %s]' % control_url.split('/')[-1]
  roll_name = '[roll %s]' % roll_url.split('/')[-1]

  out = sys.stdout

  for bot in sorted(all_bots):
    control_status = control[bot].status
    roll_status = roll[bot].status
    if roll_status == 'success':
      if verbosity > 1:
        printer(0, '==%s==' % bot)
        printer(1, 'OK')
      continue

    if control_status != 'failure' and roll_status != 'failure':
      continue
    printer(0, '==%s==' % bot)

    runs = [(control_status, control_name, control[bot].url),
            (roll_status, roll_name, roll[bot].url)]
    formatted_results = [
        _failure_lines(status, url) for (status, _, url) in runs]

    # Two runs "failed identically" only if both actually failed; a
    # failed run with no parsed results must not be equated with a
    # run that simply has not failed (both yield an empty list).
    identical = (control_status == 'failure'
                 and roll_status == 'failure'
                 and formatted_results[0] == formatted_results[1])

    for (lines, (status, name, _)) in zip(formatted_results, runs):
      if status != 'failure':
        printer(1, name)
        printer(2, status)
        continue
      if identical:
        printer(1, control_name + ' and ' + roll_name + ' failed identically')
      else:
        printer(1, name)
      for (indent, line) in lines:
        printer(indent, line)
      if identical:
        break  # One listing covers both runs.
    out.write('\n')

  if verbosity > 0:
    # Print out summary of all of the bots.
    _print_summary(out, all_bots, control, roll)
407
if __name__ == '__main__':
  # Script entry point: expects CONTROL_URL and ROLL_URL arguments; the
  # verbosity level is read from the COMPARE_CODEREVIEW_VERBOSITY
  # environment variable (default 1).
  if len(sys.argv) < 3:
    sys.stderr.write('%s\n' % __doc__)
    # sys.exit rather than the site-provided exit() helper, which is
    # intended for interactive use.
    sys.exit(1)
  try:
    _verbosity = int(os.environ.get('COMPARE_CODEREVIEW_VERBOSITY', 1))
  except ValueError:
    sys.stderr.write(
        'Error:  COMPARE_CODEREVIEW_VERBOSITY must be an integer.\n')
    sys.exit(1)
  main(sys.argv[1], sys.argv[2], _verbosity)
414
415