#!/usr/bin/env python
#
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Helper script to shard build bot steps and save results to disk.

Our buildbot infrastructure requires each slave to run steps serially.
This is sub-optimal for android, where these steps can run independently on
multiple connected devices.

The buildbots will run this script multiple times per cycle:
- First: all steps listed in the -s file will be executed in parallel using
all connected devices. Step results will be pickled to disk. Each step has a
unique name. The result code will be ignored if the step name is listed in
--flaky_steps.
The buildbot will treat this step as a regular step, and will not process any
graph data.

- Then, with -p STEP_NAME: at this stage, we'll simply print the file with the
step results previously saved. The buildbot will then process the graph data
accordingly.

The JSON steps file contains a dictionary in the format:
{
  "step_name_foo": "script_to_execute foo",
  "step_name_bar": "script_to_execute bar"
}

The JSON flaky steps file contains a list with the names of the steps whose
results should be ignored:
[
  "step_name_foo",
  "step_name_bar"
]

Note that script_to_execute must accept at least the following options:
  --device: the serial number to be passed to all adb commands.
  --keep_test_server_ports: indicates it's being run as a shard, and shouldn't
  reset test server port allocation.
"""


import datetime
import json
import logging
import multiprocessing
import optparse
import pexpect
import pickle
import os
import signal
import shutil
import sys
import time

from pylib import android_commands
from pylib import cmd_helper
from pylib import constants
from pylib import forwarder
from pylib import ports


# Directory where one pickled result file per step is stored.
_OUTPUT_DIR = os.path.join(constants.DIR_SOURCE_ROOT, 'out', 'step_results')


def _SaveResult(result):
  """Pickles a step result to a file named after the step in _OUTPUT_DIR.

  Args:
    result: A dictionary describing a finished step; its 'name' entry is used
        as the file name.
  """
  # Pickle data is binary, so the file is opened in binary mode.
  with open(os.path.join(_OUTPUT_DIR, result['name']), 'wb') as f:
    f.write(pickle.dumps(result))


def _RunStepsPerDevice(steps):
  """Runs the given steps one after another and saves each result to disk.

  Args:
    steps: A list of dictionaries with the keys 'name', 'cmd', 'device' and
        'is_flaky'.

  Returns:
    A list with one result dictionary per step, with the keys 'name',
    'output', 'exit_code', 'total_time' (in seconds) and 'device'.
  """
  results = []
  for step in steps:
    start_time = datetime.datetime.now()
    print('Starting %s: %s %s at %s' % (step['name'], step['cmd'],
                                        start_time, step['device']))
    output, exit_code = pexpect.run(
        step['cmd'], cwd=os.path.abspath(constants.DIR_SOURCE_ROOT),
        withexitstatus=True, logfile=sys.stdout, timeout=1800,
        env=os.environ)
    # NOTE(review): a falsy exit status (including None, presumably when
    # pexpect could not obtain one, e.g. after the timeout) is reported as 0.
    # Confirm that this is intended.
    exit_code = exit_code or 0
    end_time = datetime.datetime.now()
    exit_msg = '%s %s' % (exit_code,
                          '(ignored, flaky step)' if step['is_flaky'] else '')
    print('Finished %s: %s %s %s at %s' % (step['name'], exit_msg,
                                           step['cmd'], end_time,
                                           step['device']))
    # The real exit code has been printed above; flaky steps never fail.
    if step['is_flaky']:
      exit_code = 0
    result = {'name': step['name'],
              'output': output,
              'exit_code': exit_code,
              # total_seconds() also accounts for whole days, which the
              # 'seconds' attribute alone would drop.
              'total_time': int((end_time - start_time).total_seconds()),
              'device': step['device']}
    _SaveResult(result)
    results.append(result)
  return results


def _RunShardedSteps(steps, flaky_steps, devices):
  """Distributes the steps across the devices and runs the shards in parallel.

  Args:
    steps: A dictionary mapping each step name to the command to execute.
    flaky_steps: A collection with the names of the steps whose exit code
        must be ignored.
    devices: A list of device serial numbers; one shard is run per device.

  Returns:
    Always 0: the exit code of each step is reported later, when its saved
    result is printed.
  """
  assert steps
  assert devices, 'No devices connected?'
  if os.path.exists(_OUTPUT_DIR):
    # Sanity check before recursively deleting the directory.
    assert '/step_results' in _OUTPUT_DIR
    shutil.rmtree(_OUTPUT_DIR)
  if not os.path.exists(_OUTPUT_DIR):
    os.makedirs(_OUTPUT_DIR)
  # Shard over the sorted names so that, for the same inputs, a step is always
  # assigned to the same device regardless of dictionary ordering.
  step_names = sorted(steps.keys())
  all_params = []
  num_devices = len(devices)
  # Ceiling division: every step is assigned even if the split is uneven.
  shard_size = (len(step_names) + num_devices - 1) // num_devices
  for i, device in enumerate(devices):
    shard = step_names[i * shard_size:(i + 1) * shard_size]
    steps_per_device = [{'name': s,
                         'device': device,
                         'is_flaky': s in flaky_steps,
                         'cmd': steps[s] + ' --device ' + device +
                                ' --keep_test_server_ports'}
                        for s in shard]
    all_params.append(steps_per_device)
  print('Start sharding (note: output is not synchronized...)')
  print('*' * 80)
  start_time = datetime.datetime.now()
  pool = multiprocessing.Pool(processes=num_devices)
  async_results = pool.map_async(_RunStepsPerDevice, all_params)
  results_per_device = async_results.get(999999)
  end_time = datetime.datetime.now()
  print('*' * 80)
  print('Finished sharding.')
  print('Summary')
  total_time = 0
  for results in results_per_device:
    for result in results:
      print('%s : exit_code=%d in %d secs at %s' %
            (result['name'], result['exit_code'], result['total_time'],
             result['device']))
      total_time += result['total_time']
  print('Step time: %d secs' % int((end_time - start_time).total_seconds()))
  print('Bots time: %d secs' % total_time)
  # No exit_code for the sharding step: the individual _PrintResults step
  # will return the corresponding exit_code.
  return 0


def _PrintStepOutput(step_name):
  """Prints the saved output of a step.

  Args:
    step_name: The name of the step whose result file must be printed.

  Returns:
    The saved exit code of the step, or 1 if there is no result file for it.
  """
  file_name = os.path.join(_OUTPUT_DIR, step_name)
  if not os.path.exists(file_name):
    print('File not found  %s' % file_name)
    return 1
  # NOTE(review): unpickling is only safe for trusted data; this assumes the
  # files under _OUTPUT_DIR were written by _SaveResult.
  with open(file_name, 'rb') as f:
    result = pickle.loads(f.read())
  print(result['output'])
  return result['exit_code']


def _PrintAllStepsOutput(steps):
  """Prints the saved output of every step listed in a JSON steps file.

  Args:
    steps: Path to the JSON file containing the steps dictionary.

  Returns:
    The bitwise OR of the exit codes of all the steps.
  """
  with open(steps, 'r') as f:
    step_map = json.load(f)
  ret = 0
  # Sorted so that the output order is deterministic.
  for step_name in sorted(step_map):
    ret |= _PrintStepOutput(step_name)
  return ret


def _KillPendingServers():
  """Kills leftover test servers and restarts the adb server."""
  # Several passes are made over the process list.
  for _ in range(5):
    for server in ['lighttpd', 'web-page-replay']:
      pgrep_output = cmd_helper.GetCmdOutput(['pgrep', '-f', server])
      pids = [pid.strip() for pid in pgrep_output.split('\n') if pid.strip()]
      for pid in pids:
        try:
          logging.warning('Killing %s %s', server, pid)
          os.kill(int(pid), signal.SIGQUIT)
        except Exception as e:
          # Best effort: a failure to kill a process is only logged.
          logging.warning('Failed killing %s %s %s', server, pid, e)
  # Restart the adb server with taskset to set a single CPU affinity.
  cmd_helper.RunCmd(['adb', 'kill-server'])
  cmd_helper.RunCmd(['taskset', '-c', '0', 'adb', 'start-server'])
  cmd_helper.RunCmd(['taskset', '-c', '0', 'adb', 'root'])
  # Wait for the devices to show up again, sleeping 1, 2, 4 and 8 seconds
  # before giving up.
  i = 1
  while not android_commands.GetAttachedDevices():
    time.sleep(i)
    i *= 2
    if i > 10:
      break


def main(argv):
  """Parses the command line and either runs the steps or prints results.

  Args:
    argv: The full command line, including the program name.

  Returns:
    The exit code for the process.

  Raises:
    Exception: If the test server port allocation could not be reset.
  """
  parser = optparse.OptionParser()
  parser.add_option('-s', '--steps',
                    help='A JSON file containing all the steps to be '
                         'sharded.')
  parser.add_option('--flaky_steps',
                    help='A JSON file containing steps that are flaky and '
                         'will have its exit code ignored.')
  parser.add_option('-p', '--print_results',
                    help='Only prints the results for the previously '
                         'executed step, do not run it again.')
  parser.add_option('-P', '--print_all',
                    help='Only prints the results for the previously '
                         'executed steps, do not run them again.')
  options, _ = parser.parse_args(argv)
  if options.print_results:
    return _PrintStepOutput(options.print_results)
  if options.print_all:
    return _PrintAllStepsOutput(options.print_all)

  # Fail early, before touching any server or device, if there is nothing to
  # run.
  if not options.steps:
    parser.error('--steps is required unless -p or -P is given.')

  # At this point, we should kill everything that may have been left over from
  # previous runs.
  _KillPendingServers()

  forwarder.Forwarder.UseMultiprocessing()

  # Reset the test port allocation. It's important to do it before starting
  # to dispatch any step.
  if not ports.ResetTestServerPortAllocation():
    raise Exception('Failed to reset test server port.')

  # Sort the devices so that we'll try to always run a step in the same device.
  devices = sorted(android_commands.GetAttachedDevices())
  if not devices:
    print('You must attach a device')
    return 1

  with open(options.steps, 'r') as f:
    steps = json.load(f)
  flaky_steps = []
  if options.flaky_steps:
    with open(options.flaky_steps, 'r') as f:
      flaky_steps = json.load(f)
  return _RunShardedSteps(steps, flaky_steps, devices)


if __name__ == '__main__':
  sys.exit(main(sys.argv))