autoserv.py revision ca76bcccd6b8029bb0c3fcf72b1c4649e7fad9b1
#!/usr/bin/python -u
# Copyright 2007-2008 Martin J. Bligh <mbligh@google.com>, Google Inc.
# Released under the GPL v2

"""
Run a control file through the server side engine
"""

import sys, os, re, traceback, signal, time, logging, getpass

import common

from autotest_lib.client.common_lib import control_data
from autotest_lib.client.common_lib import global_config
# NOTE(review): this value is read with default=True but is not used anywhere
# in this file; the ImportError handler below re-reads the same option with
# default=False. Confirm which default is intended before unifying them.
require_atfork = global_config.global_config.get_config_value(
        'AUTOSERV', 'require_atfork_module', type=bool, default=True)


try:
    import atfork
    atfork.monkeypatch_os_fork_functions()
    import atfork.stdlib_fixer
    # Fix the Python standard library for threading+fork safety with its
    # internal locks.  http://code.google.com/p/python-atfork/
    import warnings
    warnings.filterwarnings('ignore', 'logging module already imported')
    atfork.stdlib_fixer.fix_logging_module()
except ImportError as e:
    from autotest_lib.client.common_lib import global_config
    # Only treat a missing atfork module as fatal when the config asks for it.
    if global_config.global_config.get_config_value(
            'AUTOSERV', 'require_atfork_module', type=bool, default=False):
        sys.stderr.write('Please run utils/build_externals.py\n')
        print(e)
        sys.exit(1)

from autotest_lib.server import frontend
from autotest_lib.server import server_logging_config
from autotest_lib.server import server_job, utils, autoserv_parser, autotest
from autotest_lib.server import utils as server_utils

from autotest_lib.client.common_lib import pidfile, logging_manager
from autotest_lib.site_utils.graphite import stats


def _parse_machines(machines_option):
    """Split a comma and/or whitespace separated machine string into a list.

    @param machines_option: String such as 'host1,host2 host3'.

    @returns List of the non-empty machine names, in their original order.
    """
    return machines_option.replace(',', ' ').strip().split()


def log_alarm(signum, frame):
    """Signal handler for SIGALRM: log an error and exit with status 1.

    @param signum: Signal number passed in by the signal module (unused).
    @param frame: Current stack frame passed in by the signal module (unused).
    """
    # NOTE(review): the message claims the signal is ignored, yet the handler
    # exits right after logging it. Confirm which of the two is intended.
    logging.error("Received SIGALARM. Ignoring and continuing on.")
    sys.exit(1)


def run_autoserv(pid_file_manager, results, parser):
    """Build a server_job from the parsed options, run it, and exit.

    Redirects stdin to os.devnull, moves the process into its own process
    group, installs the signal handlers, validates the command line options
    and then runs either a special task (repair, verify, provision, reset)
    or the regular job. Ends by calling sys.exit() with 0 on success or 1
    when the job raised.

    @param pid_file_manager: Object used to record the exit status in the
            pid file, or a false value when no pid file is written.
    @param results: Results directory handed to server_job (main() passes
            None when logging is disabled).
    @param parser: The autoserv parser, with options and args already
            parsed.
    """
    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group
    os.setpgrp()

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')
        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        pass

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being set
    # but depending on how you launch your autotest scheduler it may not be set.
    os.environ['USER'] = getpass.getuser()

    if parser.options.machines:
        machines = _parse_machines(parser.options.machines)
    else:
        machines = []
    machines_file = parser.options.machines_file
    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    install_before = parser.options.install_before
    install_after = parser.options.install_after
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    parse_job = parser.options.parse_job
    execution_tag = parser.options.execution_tag
    if not execution_tag:
        execution_tag = parse_job
    host_protection = parser.options.host_protection
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    test_retry = parser.options.test_retry
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error("Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo or
                       provision or reset)
    if len(parser.args) < 1 and not is_special_task:
        parser.parser.error("Missing argument: control file")

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    # We have a control file unless it's just a verify/repair/cleanup job
    if len(parser.args) > 0:
        control = parser.args[0]
    else:
        control = None

    if machines_file:
        # A machines file replaces whatever was given on the command line.
        machines = []
        # Use a context manager so the file is closed once it has been read.
        with open(machines_file, 'r') as machines_fh:
            for line in machines_fh:
                # remove comments, spaces
                entry = re.sub(r'#.*', '', line).strip()
                if entry:
                    machines.append(entry)
        print("Read list of machines from file: %s" % machines_file)
        print(','.join(machines))

    if machines:
        for machine in machines:
            if not machine or re.search(r'\s', machine):
                parser.parser.error("Invalid machine: %s" % str(machine))
        # Drop duplicates and keep a stable, sorted order.
        machines = sorted(set(machines))

    if group_name and len(machines) < 2:
        parser.parser.error("-G %r may only be supplied with more than one machine."
               % group_name)

    kwargs = {'group_name': group_name, 'tag': execution_tag,
              'disable_sysinfo': parser.options.disable_sysinfo}
    if control_filename:
        kwargs['control_filename'] = control_filename
    job = server_job.server_job(control, parser.args[1:], results, label,
                                user, machines, client, parse_job,
                                ssh_user, ssh_port, ssh_pass,
                                ssh_verbosity_flag, ssh_options,
                                test_retry, **kwargs)
    job.logging.start_logging()
    job.init_parser()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    try:
        try:
            if repair:
                job.repair(host_protection, job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            else:
                job.run(cleanup, install_before, install_after,
                        verify_job_repo_url=verify_job_repo_url,
                        only_collect_crashinfo=collect_crashinfo,
                        skip_crash_collection=skip_crash_collection,
                        job_labels=job_labels)
        finally:
            # Close every host the job opened, whether or not it failed.
            while job.hosts:
                host = job.hosts.pop()
                host.close()
    except:
        # Intentionally broad: any failure at all (including SystemExit and
        # KeyboardInterrupt) is reported as exit code 1 so that the pid file
        # and parser cleanup below still run.
        exit_code = 1
        traceback.print_exc()

    if pid_file_manager:
        pid_file_manager.num_tests_failed = job.num_tests_failed
        pid_file_manager.close_file(exit_code)
    job.cleanup_parser()

    sys.exit(exit_code)


def main():
    """Parse the command line, set up results and logging, run autoserv.

    Prepares the results directory and logging, optionally opens the pid
    file and starts a run time timer for white-listed tests, then delegates
    to run_autoserv(). Always finishes by calling sys.exit() with the exit
    code collected from run_autoserv() (1 for unexpected exceptions).
    """
    # White list of tests with run time measurement enabled.
    measure_run_time_tests_names = global_config.global_config.get_config_value(
                        'AUTOSERV', 'measure_run_time_tests', type=str)
    if measure_run_time_tests_names:
        measure_run_time_tests = [t.strip() for t in
                                  measure_run_time_tests_names.split(',')]
    else:
        measure_run_time_tests = []
    # grab the parser
    parser = autoserv_parser.autoserv_parser
    parser.parse_args()

    if len(sys.argv) == 1:
        parser.parser.print_help()
        sys.exit(1)

    # Defined up front so the use_existing_results check further down cannot
    # hit an unbound name when logging (and thus the results dir) is disabled.
    resultdir_exists = False
    if parser.options.no_logging:
        results = None
    else:
        results = parser.options.results
        if not results:
            results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S')
        results = os.path.abspath(results)
        for filename in ('control.srv', 'status.log', '.autoserv_execute'):
            if os.path.exists(os.path.join(results, filename)):
                resultdir_exists = True
        if not parser.options.use_existing_results and resultdir_exists:
            error = "Error: results directory already exists: %s\n" % results
            sys.stderr.write(error)
            sys.exit(1)

        # Now that we certified that there's no leftover results dir from
        # previous jobs, lets create the result dir since the logging system
        # needs to create the log file in there.
        if not os.path.isdir(results):
            os.makedirs(results)

    logging_manager.configure_logging(
            server_logging_config.ServerLoggingConfig(), results_dir=results,
            use_console=not parser.options.no_tee,
            verbose=parser.options.verbose,
            no_console_prefix=parser.options.no_console_prefix)
    if results:
        logging.info("Results placed in %s", results)

        # wait until now to perform this check, so it get properly logged
        if parser.options.use_existing_results and not resultdir_exists:
            logging.error("No existing results directory found: %s", results)
            sys.exit(1)

    logging.debug('autoserv command was: %s', ' '.join(sys.argv))

    if parser.options.write_pidfile:
        pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label,
                                                  results)
        pid_file_manager.open_file()
    else:
        pid_file_manager = None

    autotest.BaseAutotest.set_install_in_tmpdir(
        parser.options.install_in_tmpdir)

    timer = None
    try:
        # Take the first argument as control file name, get the test name from
        # the control file. If the test name exists in the list of tests with
        # run time measurement enabled, start a timer to begin measurement.
        if (len(parser.args) > 0 and parser.args[0] != '' and
            parser.options.machines):
            try:
                test_name = control_data.parse_control(parser.args[0],
                                                       raise_warnings=True).name
            except control_data.ControlVariableException:
                logging.debug('Failed to retrieve test name from control file.')
                test_name = None
            if test_name in measure_run_time_tests:
                machines = _parse_machines(parser.options.machines)
                afe = frontend.AFE()
                board = server_utils.get_board_from_afe(machines[0], afe)
                timer = stats.Timer('autoserv_run_time.%s.%s' %
                                    (board, test_name))
                timer.start()
    except control_data.ControlVariableException as e:
        logging.error(str(e))
    exit_code = 0
    try:
        try:
            run_autoserv(pid_file_manager, results, parser)
        except SystemExit as e:
            # run_autoserv() reports its status through sys.exit().
            exit_code = e.code
            if exit_code:
                logging.exception(e)
        except Exception as e:
            # If we don't know what happened, we'll classify it as
            # an 'abort' and return 1.
            logging.exception(e)
            exit_code = 1
    finally:
        if pid_file_manager:
            pid_file_manager.close_file(exit_code)
        if timer:
            timer.stop()
    sys.exit(exit_code)


if __name__ == '__main__':
    main()