1#!/usr/bin/env python
2#
3# Copyright (c) 2012 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Helper script to shard build bot steps and save results to disk.
8
9Our buildbot infrastructure requires each slave to run steps serially.
10This is sub-optimal for android, where these steps can run independently on
11multiple connected devices.
12
13The buildbots will run this script multiple times per cycle:
- First: all steps listed in the -s file will be executed in parallel using all
15connected devices. Step results will be pickled to disk. Each step has a unique
16name. The result code will be ignored if the step name is listed in
17--flaky_steps.
18The buildbot will treat this step as a regular step, and will not process any
19graph data.
20
21- Then, with -p STEP_NAME: at this stage, we'll simply print the file with the
22step results previously saved. The buildbot will then process the graph data
23accordingly.
24
25The JSON steps file contains a dictionary in the format:
26{
27  "step_name_foo": "script_to_execute foo",
28  "step_name_bar": "script_to_execute bar"
29}
30
The JSON flaky steps file contains a list of step names whose results should
32be ignored:
33[
34  "step_name_foo",
35  "step_name_bar"
36]
37
Note that script_to_execute must accept at least the following
39options:
40  --device: the serial number to be passed to all adb commands.
41  --keep_test_server_ports: indicates it's being run as a shard, and shouldn't
42  reset test server port allocation.
43"""
44
45
46import datetime
47import json
48import logging
49import multiprocessing
50import optparse
51import pexpect
52import pickle
53import os
54import signal
55import shutil
56import sys
57import time
58
59from pylib import android_commands
60from pylib import cmd_helper
61from pylib import constants
62from pylib import forwarder
63from pylib import ports
64
65
# Directory holding the pickled result of each step, one file per step name.
# It is wiped and recreated at the start of every sharded run.
_OUTPUT_DIR = os.path.join(constants.DIR_SOURCE_ROOT, 'out', 'step_results')
67
68
def _SaveResult(result):
  """Pickles a step result into _OUTPUT_DIR, in a file named after the step.

  Args:
    result: A dictionary describing a finished step. Its 'name' entry is used
        as the file name.
  """
  result_path = os.path.join(_OUTPUT_DIR, result['name'])
  with open(result_path, 'w') as result_file:
    pickle.dump(result, result_file)
72
73
def _RunStepsPerDevice(steps):
  """Runs a list of steps one after the other and saves each result to disk.

  Args:
    steps: A list of dictionaries, each holding the keys 'name', 'cmd',
        'device' and 'is_flaky'.

  Returns:
    A list with one result dictionary per step, holding the keys 'name',
    'output', 'exit_code', 'total_time' (in seconds) and 'device'. The exit
    code of a flaky step is always reported as 0.
  """
  results = []
  for step in steps:
    start_time = datetime.datetime.now()
    print('Starting %s: %s %s at %s' % (step['name'], step['cmd'],
                                        start_time, step['device']))
    output, exit_code = pexpect.run(
        step['cmd'], cwd=os.path.abspath(constants.DIR_SOURCE_ROOT),
        withexitstatus=True, logfile=sys.stdout, timeout=1800,
        env=os.environ)
    # pexpect yields None instead of an integer when the command has no exit
    # status to report, e.g. it was killed after hitting the timeout above.
    # That must count as a failure rather than be mistaken for a clean exit.
    if exit_code is None:
      exit_code = -1
    end_time = datetime.datetime.now()
    exit_msg = '%s %s' % (exit_code,
                          '(ignored, flaky step)' if step['is_flaky'] else '')
    print('Finished %s: %s %s %s at %s' % (step['name'], exit_msg, step['cmd'],
                                           end_time, step['device']))
    # The real exit code was logged above; flaky steps never fail the build.
    if step['is_flaky']:
      exit_code = 0
    result = {'name': step['name'],
              'output': output,
              'exit_code': exit_code,
              'total_time': (end_time - start_time).seconds,
              'device': step['device']}
    # Saved right away so the result survives even if a later step dies.
    _SaveResult(result)
    results.append(result)
  return results
100
101
def _RunShardedSteps(steps, flaky_steps, devices):
  """Splits the steps across the devices and runs the shards in parallel.

  Step names are sorted and cut into contiguous chunks, one chunk per device,
  so the same steps and devices always produce the same assignment.
  _OUTPUT_DIR is wiped first so that no result of an earlier run is left over.

  Args:
    steps: A dictionary mapping each step name to the command to execute.
    flaky_steps: A collection of step names whose exit code is ignored.
    devices: A list of device serial numbers.

  Returns:
    Always 0. The exit code of each step is reported later, when its saved
    result is printed.
  """
  assert steps
  assert devices, 'No devices connected?'
  if os.path.exists(_OUTPUT_DIR):
    # Sanity check before deleting a whole directory tree.
    assert '/step_results' in _OUTPUT_DIR
    shutil.rmtree(_OUTPUT_DIR)
  if not os.path.exists(_OUTPUT_DIR):
    os.makedirs(_OUTPUT_DIR)
  step_names = sorted(steps)
  num_devices = len(devices)
  # Ceiling division: every step lands in a shard even when the number of
  # steps is not a multiple of the number of devices.
  shard_size = (len(step_names) + num_devices - 1) // num_devices
  all_params = []
  for i, device in enumerate(devices):
    shard = step_names[i * shard_size:(i + 1) * shard_size]
    all_params.append([{'name': name,
                        'device': device,
                        'is_flaky': name in flaky_steps,
                        'cmd': steps[name] + ' --device ' + device +
                               ' --keep_test_server_ports'}
                       for name in shard])
  print('Start sharding (note: output is not synchronized...)')
  print('*' * 80)
  start_time = datetime.datetime.now()
  pool = multiprocessing.Pool(processes=num_devices)
  try:
    async_results = pool.map_async(_RunStepsPerDevice, all_params)
    # NOTE(review): presumably a timeout is passed so that the wait stays
    # interruptible (e.g. by Ctrl-C) -- confirm.
    results_per_device = async_results.get(999999)
  finally:
    # Release the worker processes whether or not the run succeeded.
    pool.terminate()
    pool.join()
  end_time = datetime.datetime.now()
  print('*' * 80)
  print('Finished sharding.')
  print('Summary')
  total_time = 0
  for results in results_per_device:
    for result in results:
      print('%s : exit_code=%d in %d secs at %s' %
            (result['name'], result['exit_code'], result['total_time'],
             result['device']))
      total_time += result['total_time']
  print('Step time: %d secs' % ((end_time - start_time).seconds))
  print('Bots time: %d secs' % total_time)
  # No exit_code for the sharding step: the individual _PrintResults step
  # will return the corresponding exit_code.
  return 0
145
146
def _PrintStepOutput(step_name):
  """Prints the saved output of a step that was run earlier.

  Args:
    step_name: Name of the step, which is also the name of its result file
        inside _OUTPUT_DIR.

  Returns:
    The exit code stored with the step result, or 1 when there is no result
    file for the step.
  """
  result_path = os.path.join(_OUTPUT_DIR, step_name)
  if os.path.exists(result_path):
    with open(result_path, 'r') as result_file:
      # NOTE(review): unpickling is only safe for files this script wrote
      # itself; never point it at data coming from an untrusted source.
      saved = pickle.load(result_file)
    print(saved['output'])
    return saved['exit_code']
  print('%s %s' % ('File not found ', result_path))
  return 1
156
157
def _PrintAllStepsOutput(steps):
  """Prints the saved output of every step named in a JSON steps file.

  Args:
    steps: Path to the JSON file whose keys are the step names.

  Returns:
    The bitwise OR of the exit codes of all the steps, so it is 0 only when
    every step reported 0.
  """
  with open(steps, 'r') as steps_file:
    steps_by_name = json.load(steps_file)
  combined_exit_code = 0
  for name in steps_by_name.keys():
    combined_exit_code = combined_exit_code | _PrintStepOutput(name)
  return combined_exit_code
165
166
def _KillPendingServers():
  """Kills leftover server processes and restarts the adb server.

  Makes five passes over the processes matching 'lighttpd' and
  'web-page-replay', sending SIGQUIT to each of them; failures are logged and
  otherwise ignored. The adb server is then restarted and the function waits,
  with growing delays, for at least one device to show up.
  """
  server_names = ['lighttpd', 'web-page-replay']
  for _ in range(5):
    for server in server_names:
      pgrep_output = cmd_helper.GetCmdOutput(['pgrep', '-f', server])
      stripped_lines = [line.strip() for line in pgrep_output.split('\n')]
      for pid in [line for line in stripped_lines if line]:
        try:
          logging.warning('Killing %s %s', server, pid)
          os.kill(int(pid), signal.SIGQUIT)
        except Exception as e:
          logging.warning('Failed killing %s %s %s', server, pid, e)
  # Restart the adb server with taskset to set a single CPU affinity.
  restart_commands = (
      ['adb', 'kill-server'],
      ['taskset', '-c', '0', 'adb', 'start-server'],
      ['taskset', '-c', '0', 'adb', 'root'],
  )
  for command in restart_commands:
    cmd_helper.RunCmd(command)
  # Check for devices before each delay; give up after the 8 second one.
  for delay in (1, 2, 4, 8):
    if android_commands.GetAttachedDevices():
      break
    time.sleep(delay)
189
def main(argv):
  """Runs all the sharded steps, or prints the results of earlier ones.

  With -p or -P only the previously saved results are printed. Otherwise the
  steps of the --steps file are sharded across the attached devices. A missing
  --steps is rejected before anything is killed or restarted.

  Args:
    argv: The command line, including the program name.

  Returns:
    The exit code for the process.

  Raises:
    Exception: if the test server port allocation cannot be reset.
  """
  parser = optparse.OptionParser()
  parser.add_option('-s', '--steps',
                    help='A JSON file containing all the steps to be '
                         'sharded.')
  parser.add_option('--flaky_steps',
                    help='A JSON file containing steps that are flaky and '
                         'will have its exit code ignored.')
  parser.add_option('-p', '--print_results',
                    help='Only prints the results for the previously '
                         'executed step, do not run it again.')
  parser.add_option('-P', '--print_all',
                    help='Only prints the results for the previously '
                         'executed steps, do not run them again.')
  options, _ = parser.parse_args(argv)
  if options.print_results:
    return _PrintStepOutput(options.print_results)
  if options.print_all:
    return _PrintAllStepsOutput(options.print_all)

  # Fail fast, with a usage message, rather than crash on a missing file name
  # after the servers were already killed and adb restarted.
  if not options.steps:
    parser.error('--steps is required unless -p or -P is given.')

  # At this point, we should kill everything that may have been left over from
  # previous runs.
  _KillPendingServers()

  forwarder.Forwarder.UseMultiprocessing()

  # Reset the test port allocation. It's important to do it before starting
  # to dispatch any step.
  if not ports.ResetTestServerPortAllocation():
    raise Exception('Failed to reset test server port.')

  # Sort the devices so that we'll try to always run a step in the same device.
  devices = sorted(android_commands.GetAttachedDevices())
  if not devices:
    print('You must attach a device')
    return 1

  with open(options.steps, 'r') as f:
    steps = json.load(f)
  flaky_steps = []
  if options.flaky_steps:
    with open(options.flaky_steps, 'r') as f:
      flaky_steps = json.load(f)
  return _RunShardedSteps(steps, flaky_steps, devices)
234
235
# Script entry point: run main() on the full command line and use its return
# value as the exit code of the process.
if __name__ == '__main__':
  sys.exit(main(sys.argv))
238