autoserv.py revision 22dd226625255110c079e979113dcda1f4fa5ea8
1#!/usr/bin/python -u 2# Copyright 2007-2008 Martin J. Bligh <mbligh@google.com>, Google Inc. 3# Released under the GPL v2 4 5""" 6Run a control file through the server side engine 7""" 8 9import ast 10import datetime 11import getpass 12import logging 13import os 14import re 15import signal 16import sys 17import traceback 18import time 19import urllib2 20 21import common 22 23from autotest_lib.client.common_lib import control_data 24from autotest_lib.client.common_lib import global_config 25try: 26 from autotest_lib.puppylab import results_mocker 27except ImportError: 28 results_mocker = None 29 30require_atfork = global_config.global_config.get_config_value( 31 'AUTOSERV', 'require_atfork_module', type=bool, default=True) 32 33 34# Number of seconds to wait before returning if testing mode is enabled 35TESTING_MODE_SLEEP_SECS = 1 36 37 38try: 39 import atfork 40 atfork.monkeypatch_os_fork_functions() 41 import atfork.stdlib_fixer 42 # Fix the Python standard library for threading+fork safety with its 43 # internal locks. http://code.google.com/p/python-atfork/ 44 import warnings 45 warnings.filterwarnings('ignore', 'logging module already imported') 46 atfork.stdlib_fixer.fix_logging_module() 47except ImportError, e: 48 from autotest_lib.client.common_lib import global_config 49 if global_config.global_config.get_config_value( 50 'AUTOSERV', 'require_atfork_module', type=bool, default=False): 51 print >>sys.stderr, 'Please run utils/build_externals.py' 52 print e 53 sys.exit(1) 54 55from autotest_lib.server import frontend 56from autotest_lib.server import server_logging_config 57from autotest_lib.server import server_job, utils, autoserv_parser, autotest 58from autotest_lib.server import utils as server_utils 59from autotest_lib.site_utils import job_overhead 60from autotest_lib.client.common_lib import pidfile, logging_manager 61from autotest_lib.client.common_lib.cros.graphite import stats 62 63def log_alarm(signum, frame): 64 logging.error("Received SIGALARM. Ignoring and continuing on.") 65 sys.exit(1) 66 67def run_autoserv(pid_file_manager, results, parser): 68 # send stdin to /dev/null 69 dev_null = os.open(os.devnull, os.O_RDONLY) 70 os.dup2(dev_null, sys.stdin.fileno()) 71 os.close(dev_null) 72 73 # Create separate process group 74 os.setpgrp() 75 76 # Implement SIGTERM handler 77 def handle_sigterm(signum, frame): 78 logging.debug('Received SIGTERM') 79 if pid_file_manager: 80 pid_file_manager.close_file(1, signal.SIGTERM) 81 logging.debug('Finished writing to pid_file. Killing process.') 82 # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved. 83 # This sleep allows the pending output to be logged before the kill 84 # signal is sent. 85 time.sleep(.1) 86 os.killpg(os.getpgrp(), signal.SIGKILL) 87 88 # Set signal handler 89 signal.signal(signal.SIGTERM, handle_sigterm) 90 91 # faulthandler is only needed to debug in the Lab and is not avaliable to 92 # be imported in the chroot as part of VMTest, so Try-Except it. 93 try: 94 import faulthandler 95 faulthandler.register(signal.SIGTERM, all_threads=True, chain=True) 96 logging.debug('faulthandler registered on SIGTERM.') 97 except ImportError: 98 pass 99 100 # Ignore SIGTTOU's generated by output from forked children. 101 signal.signal(signal.SIGTTOU, signal.SIG_IGN) 102 103 # If we received a SIGALARM, let's be loud about it. 104 signal.signal(signal.SIGALRM, log_alarm) 105 106 # Server side tests that call shell scripts often depend on $USER being set 107 # but depending on how you launch your autotest scheduler it may not be set. 108 os.environ['USER'] = getpass.getuser() 109 110 if parser.options.machines: 111 machines = parser.options.machines.replace(',', ' ').strip().split() 112 else: 113 machines = [] 114 machines_file = parser.options.machines_file 115 label = parser.options.label 116 group_name = parser.options.group_name 117 user = parser.options.user 118 client = parser.options.client 119 server = parser.options.server 120 install_before = parser.options.install_before 121 install_after = parser.options.install_after 122 verify = parser.options.verify 123 repair = parser.options.repair 124 cleanup = parser.options.cleanup 125 provision = parser.options.provision 126 reset = parser.options.reset 127 job_labels = parser.options.job_labels 128 no_tee = parser.options.no_tee 129 parse_job = parser.options.parse_job 130 execution_tag = parser.options.execution_tag 131 if not execution_tag: 132 execution_tag = parse_job 133 host_protection = parser.options.host_protection 134 ssh_user = parser.options.ssh_user 135 ssh_port = parser.options.ssh_port 136 ssh_pass = parser.options.ssh_pass 137 collect_crashinfo = parser.options.collect_crashinfo 138 control_filename = parser.options.control_filename 139 test_retry = parser.options.test_retry 140 verify_job_repo_url = parser.options.verify_job_repo_url 141 skip_crash_collection = parser.options.skip_crash_collection 142 ssh_verbosity = int(parser.options.ssh_verbosity) 143 ssh_options = parser.options.ssh_options 144 145 # can't be both a client and a server side test 146 if client and server: 147 parser.parser.error("Can not specify a test as both server and client!") 148 149 if provision and client: 150 parser.parser.error("Cannot specify provisioning and client!") 151 152 is_special_task = (verify or repair or cleanup or collect_crashinfo or 153 provision or reset) 154 if len(parser.args) < 1 and not is_special_task: 155 parser.parser.error("Missing argument: control file") 156 157 if ssh_verbosity > 0: 158 # ssh_verbosity is an integer between 0 and 3, inclusive 159 ssh_verbosity_flag = '-' + 'v' * ssh_verbosity 160 else: 161 ssh_verbosity_flag = '' 162 163 # We have a control file unless it's just a verify/repair/cleanup job 164 if len(parser.args) > 0: 165 control = parser.args[0] 166 else: 167 control = None 168 169 if machines_file: 170 machines = [] 171 for m in open(machines_file, 'r').readlines(): 172 # remove comments, spaces 173 m = re.sub('#.*', '', m).strip() 174 if m: 175 machines.append(m) 176 print "Read list of machines from file: %s" % machines_file 177 print ','.join(machines) 178 179 if machines: 180 for machine in machines: 181 if not machine or re.search('\s', machine): 182 parser.parser.error("Invalid machine: %s" % str(machine)) 183 machines = list(set(machines)) 184 machines.sort() 185 186 if group_name and len(machines) < 2: 187 parser.parser.error("-G %r may only be supplied with more than one machine." 188 % group_name) 189 190 kwargs = {'group_name': group_name, 'tag': execution_tag, 191 'disable_sysinfo': parser.options.disable_sysinfo} 192 if control_filename: 193 kwargs['control_filename'] = control_filename 194 job = server_job.server_job(control, parser.args[1:], results, label, 195 user, machines, client, parse_job, 196 ssh_user, ssh_port, ssh_pass, 197 ssh_verbosity_flag, ssh_options, 198 test_retry, **kwargs) 199 job.logging.start_logging() 200 job.init_parser() 201 202 # perform checks 203 job.precheck() 204 205 # run the job 206 exit_code = 0 207 try: 208 try: 209 if repair: 210 job.repair(host_protection, job_labels) 211 elif verify: 212 job.verify(job_labels) 213 elif provision: 214 job.provision(job_labels) 215 elif reset: 216 job.reset(job_labels) 217 elif cleanup: 218 job.cleanup(job_labels) 219 else: 220 job.run(install_before, install_after, 221 verify_job_repo_url=verify_job_repo_url, 222 only_collect_crashinfo=collect_crashinfo, 223 skip_crash_collection=skip_crash_collection, 224 job_labels=job_labels) 225 finally: 226 while job.hosts: 227 host = job.hosts.pop() 228 host.close() 229 except: 230 exit_code = 1 231 traceback.print_exc() 232 233 if pid_file_manager: 234 pid_file_manager.num_tests_failed = job.num_tests_failed 235 pid_file_manager.close_file(exit_code) 236 job.cleanup_parser() 237 238 sys.exit(exit_code) 239 240 241def _get_job_id_or_task_id(result_dir, machine, is_special_task): 242 """Extract job id or special task id from result_dir 243 244 @param result_dir: path to the result dir. 245 @param machine: hostname of the machine. 246 @param is_special_task: True/False, whether it is a special task. 247 248 @returns: integer representing the job id or task id. 249 """ 250 if not result_dir: 251 return 252 result_dir = os.path.abspath(result_dir) 253 if is_special_task: 254 # special task result dir is like 255 # /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup 256 pattern = '.*/hosts/%s/(\d+)-[^/]+' % machine 257 else: 258 # non-special task result dir is like 259 # /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6 260 pattern ='.*/(\d+)-[^/]+/%s' % machine 261 m = re.match(pattern, result_dir) 262 return int(m.group(1)) if m else None 263 264 265def record_autoserv(options, duration_secs): 266 """Record autoserv end-to-end time in metadata db. 267 268 @param options: parser options. 269 @param duration_secs: How long autoserv has taken, in secs. 270 """ 271 # Get machine hostname 272 machines = options.machines.replace( 273 ',', ' ').strip().split() if options.machines else [] 274 num_machines = len(machines) 275 if num_machines > 1: 276 # Skip the case where atomic group is used. 277 return 278 elif num_machines == 0: 279 machines.append('hostless') 280 281 # Determine the status that will be reported. 282 s = job_overhead.STATUS 283 task_mapping = { 284 'reset': s.RESETTING, 'verify': s.VERIFYING, 285 'provision': s.PROVISIONING, 'repair': s.REPAIRING, 286 'cleanup': s.CLEANING, 'collect_crashinfo': s.GATHERING} 287 # option_dict will be like {'reset': True, 'repair': False, ...} 288 option_dict = ast.literal_eval(str(options)) 289 match = filter(lambda task: option_dict.get(task) == True, task_mapping) 290 status = task_mapping[match[0]] if match else s.RUNNING 291 is_special_task = status not in [s.RUNNING, s.GATHERING] 292 job_or_task_id = _get_job_id_or_task_id( 293 options.results, machines[0], is_special_task) 294 job_overhead.record_state_duration( 295 job_or_task_id, machines[0], status, duration_secs, 296 is_special_task=is_special_task) 297 298 299def main(): 300 start_time = datetime.datetime.now() 301 # White list of tests with run time measurement enabled. 302 measure_run_time_tests_names = global_config.global_config.get_config_value( 303 'AUTOSERV', 'measure_run_time_tests', type=str) 304 if measure_run_time_tests_names: 305 measure_run_time_tests = [t.strip() for t in 306 measure_run_time_tests_names.split(',')] 307 else: 308 measure_run_time_tests = [] 309 # grab the parser 310 parser = autoserv_parser.autoserv_parser 311 parser.parse_args() 312 313 if len(sys.argv) == 1: 314 parser.parser.print_help() 315 sys.exit(1) 316 317 if parser.options.no_logging: 318 results = None 319 else: 320 results = parser.options.results 321 if not results: 322 results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S') 323 results = os.path.abspath(results) 324 resultdir_exists = False 325 for filename in ('control.srv', 'status.log', '.autoserv_execute'): 326 if os.path.exists(os.path.join(results, filename)): 327 resultdir_exists = True 328 if not parser.options.use_existing_results and resultdir_exists: 329 error = "Error: results directory already exists: %s\n" % results 330 sys.stderr.write(error) 331 sys.exit(1) 332 333 # Now that we certified that there's no leftover results dir from 334 # previous jobs, lets create the result dir since the logging system 335 # needs to create the log file in there. 336 if not os.path.isdir(results): 337 os.makedirs(results) 338 339 logging_manager.configure_logging( 340 server_logging_config.ServerLoggingConfig(), results_dir=results, 341 use_console=not parser.options.no_tee, 342 verbose=parser.options.verbose, 343 no_console_prefix=parser.options.no_console_prefix) 344 if results: 345 logging.info("Results placed in %s" % results) 346 347 # wait until now to perform this check, so it get properly logged 348 if parser.options.use_existing_results and not resultdir_exists: 349 logging.error("No existing results directory found: %s", results) 350 sys.exit(1) 351 352 logging.debug('autoserv command was: %s', ' '.join(sys.argv)) 353 354 if parser.options.write_pidfile: 355 pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label, 356 results) 357 pid_file_manager.open_file() 358 else: 359 pid_file_manager = None 360 361 autotest.BaseAutotest.set_install_in_tmpdir( 362 parser.options.install_in_tmpdir) 363 364 timer = None 365 try: 366 # Take the first argument as control file name, get the test name from 367 # the control file. If the test name exists in the list of tests with 368 # run time measurement enabled, start a timer to begin measurement. 369 if (len(parser.args) > 0 and parser.args[0] != '' and 370 parser.options.machines): 371 try: 372 test_name = control_data.parse_control(parser.args[0], 373 raise_warnings=True).name 374 except control_data.ControlVariableException: 375 logging.debug('Failed to retrieve test name from control file.') 376 test_name = None 377 if test_name in measure_run_time_tests: 378 machines = parser.options.machines.replace(',', ' ' 379 ).strip().split() 380 try: 381 afe = frontend.AFE() 382 board = server_utils.get_board_from_afe(machines[0], afe) 383 timer = stats.Timer('autoserv_run_time.%s.%s' % 384 (board, test_name)) 385 timer.start() 386 except (urllib2.HTTPError, urllib2.URLError): 387 # Ignore error if RPC failed to get board 388 pass 389 except control_data.ControlVariableException as e: 390 logging.error(str(e)) 391 exit_code = 0 392 # TODO(beeps): Extend this to cover different failure modes. 393 # Testing exceptions are matched against labels sent to autoserv. Eg, 394 # to allow only the hostless job to run, specify 395 # testing_exceptions: test_suite in the shadow_config. To allow both 396 # the hostless job and dummy_Pass to run, specify 397 # testing_exceptions: test_suite,dummy_Pass. You can figure out 398 # what label autoserv is invoked with by looking through the logs of a test 399 # for the autoserv command's -l option. 400 testing_exceptions = global_config.global_config.get_config_value( 401 'AUTOSERV', 'testing_exceptions', type=list, default=[]) 402 test_mode = global_config.global_config.get_config_value( 403 'AUTOSERV', 'testing_mode', type=bool, default=False) 404 test_mode = (results_mocker and test_mode and not 405 any([ex in parser.options.label 406 for ex in testing_exceptions])) 407 is_task = (parser.options.verify or parser.options.repair or 408 parser.options.provision or parser.options.reset or 409 parser.options.cleanup or parser.options.collect_crashinfo) 410 try: 411 try: 412 if test_mode: 413 # The parser doesn't run on tasks anyway, so we can just return 414 # happy signals without faking results. 415 if not is_task: 416 machine = parser.options.results.split('/')[-1] 417 418 # TODO(beeps): The proper way to do this would be to 419 # refactor job creation so we can invoke job.record 420 # directly. To do that one needs to pipe the test_name 421 # through run_autoserv and bail just before invoking 422 # the server job. See the comment in 423 # puppylab/results_mocker for more context. 424 results_mocker.ResultsMocker( 425 test_name if test_name else 'unknown-test', 426 parser.options.results, machine 427 ).mock_results() 428 return 429 else: 430 run_autoserv(pid_file_manager, results, parser) 431 except SystemExit as e: 432 exit_code = e.code 433 if exit_code: 434 logging.exception(e) 435 except Exception as e: 436 # If we don't know what happened, we'll classify it as 437 # an 'abort' and return 1. 438 logging.exception(e) 439 exit_code = 1 440 finally: 441 if pid_file_manager: 442 pid_file_manager.close_file(exit_code) 443 if timer: 444 timer.stop() 445 # Record the autoserv duration time. Must be called 446 # just before the system exits to ensure accuracy. 447 duration_secs = (datetime.datetime.now() - start_time).total_seconds() 448 record_autoserv(parser.options, duration_secs) 449 sys.exit(exit_code) 450 451 452if __name__ == '__main__': 453 main() 454