#!/usr/bin/python -u
# Copyright 2007-2008 Martin J. Bligh <mbligh@google.com>, Google Inc.
# Released under the GPL v2
#
# autoserv revision c68fefb789c6d6bf936ea5e4270401cca6411ac4

"""
Run a control file through the server side engine
"""

import ast
import datetime
import getpass
import logging
import os
import re
import signal
import socket
import sys
import traceback
import time
import urllib2

import common

from autotest_lib.client.common_lib import control_data
from autotest_lib.client.common_lib import global_config
try:
    from autotest_lib.puppylab import results_mocker
except ImportError:
    results_mocker = None

require_atfork = global_config.global_config.get_config_value(
        'AUTOSERV', 'require_atfork_module', type=bool, default=True)


# Number of seconds to wait before returning if testing mode is enabled
TESTING_MODE_SLEEP_SECS = 1

try:
    import atfork
    atfork.monkeypatch_os_fork_functions()
    import atfork.stdlib_fixer
    # Fix the Python standard library for threading+fork safety with its
    # internal locks.  http://code.google.com/p/python-atfork/
    import warnings
    warnings.filterwarnings('ignore', 'logging module already imported')
    atfork.stdlib_fixer.fix_logging_module()
except ImportError as e:
    from autotest_lib.client.common_lib import global_config
    if global_config.global_config.get_config_value(
            'AUTOSERV', 'require_atfork_module', type=bool, default=False):
        sys.stderr.write('Please run utils/build_externals.py\n')
        print(e)
        sys.exit(1)

from autotest_lib.server import frontend
from autotest_lib.server import server_logging_config
from autotest_lib.server import server_job, utils, autoserv_parser, autotest
from autotest_lib.server import utils as server_utils
from autotest_lib.site_utils import job_directories
from autotest_lib.site_utils import job_overhead
from autotest_lib.site_utils import lxc
from autotest_lib.client.common_lib import pidfile, logging_manager
from autotest_lib.client.common_lib.cros.graphite import autotest_stats

# Control segment to stage server-side package.
STAGE_SERVER_SIDE_PACKAGE_CONTROL_FILE = server_job._control_segment_path(
        'stage_server_side_package')


def log_alarm(signum, frame):
    """Signal handler for SIGALRM: log the event and exit with status 1.

    @param signum: Signal number delivered to the handler.
    @param frame: Stack frame that was interrupted by the signal.
    """
    # NOTE(review): the message claims the signal is ignored, but the process
    # exits right after logging. The text is kept as-is because log consumers
    # may match on it.
    logging.error("Received SIGALARM. Ignoring and continuing on.")
    sys.exit(1)


def _get_machines(parser):
    """Get a list of machine names from command line arg -m or a file.

    A machines file (-M), when given, takes precedence over -m. In the file,
    everything after a '#' on a line is treated as a comment, and blank lines
    are skipped.

    @param parser: Parser for the command line arguments.

    @return: A sorted, de-duplicated list of machine names from command line
             arg -m or the machines file specified in the command line arg -M.
    """
    if parser.options.machines:
        machines = parser.options.machines.replace(',', ' ').strip().split()
    else:
        machines = []
    machines_file = parser.options.machines_file
    if machines_file:
        machines = []
        # Use a context manager so the file handle is always released.
        with open(machines_file, 'r') as machines_fd:
            for line in machines_fd:
                # remove comments, spaces
                name = re.sub('#.*', '', line).strip()
                if name:
                    machines.append(name)
        logging.debug('Read list of machines from file: %s', machines_file)
        logging.debug('Machines: %s', ','.join(machines))

    if machines:
        for machine in machines:
            if not machine or re.search(r'\s', machine):
                parser.parser.error("Invalid machine: %s" % str(machine))
        machines = sorted(set(machines))
    return machines


def _stage_ssp(parser):
    """Stage server-side package.

    This function calls a control segment to stage server-side package based on
    the job and autoserv command line option. The detail implementation could
    be different for each host type. Currently, only CrosHost has
    stage_server_side_package function defined.
    The script returns None if no server-side package is available. However,
    it may raise exception if it failed for reasons other than artifact (the
    server-side package) not found.

    @param parser: Command line arguments parser passed in the autoserv process.

    @return: url of the staged server-side package. Return None if server-
             side package is not found for the build.
    """
    namespace = {'machines': _get_machines(parser),
                 'image': parser.options.image}
    script_locals = {}
    # The control segment is expected to bind `ssp_url` in its local scope.
    execfile(STAGE_SERVER_SIDE_PACKAGE_CONTROL_FILE, namespace, script_locals)
    return script_locals['ssp_url']


def _run_with_ssp(container_name, job_id, results, parser, ssp_url):
    """Run the server job with server-side packaging.

    The current autoserv command line is replayed inside a test container,
    with host paths (control file, results dir, parse dir) swapped for their
    in-container equivalents. The container is destroyed when the run ends,
    whether or not it succeeded.

    @param container_name: Name of the container to run the test.
    @param job_id: ID of the test job.
    @param results: Folder to store results. This could be different from
                    parser.options.results:
                    parser.options.results can be set to None for results to be
                    stored in a temp folder.
                    results can be None for autoserv run requires no logging.
    @param parser: Command line parser that contains the options.
    @param ssp_url: url of the staged server-side package.
    """
    bucket = lxc.ContainerBucket()
    control = (parser.args[0] if len(parser.args) > 0 and parser.args[0] != ''
               else None)
    test_container = bucket.setup_test(container_name, job_id, ssp_url, results,
                                       control=control)
    args = sys.argv[:]
    # The run inside the container must not try to set up SSP again.
    args.remove('--require-ssp')

    # A dictionary of paths to replace in the command line. Key is the path to
    # be replaced with the one in value.
    paths_to_replace = {}
    # Replace the control file path with the one in container.
    if control:
        container_control_filename = os.path.join(
                lxc.CONTROL_TEMP_PATH, os.path.basename(control))
        paths_to_replace[control] = container_control_filename
    # Update result directory with the one in container.
    if parser.options.results:
        container_result_dir = os.path.join(lxc.RESULT_DIR_FMT % job_id)
        paths_to_replace[parser.options.results] = container_result_dir
    # Update parse_job directory with the one in container. The assumption is
    # that the result folder to be parsed is always the same as the results_dir.
    if parser.options.parse_job:
        container_parse_dir = os.path.join(lxc.RESULT_DIR_FMT % job_id)
        # Use the value computed in this branch; container_result_dir is only
        # bound when --results was supplied.
        paths_to_replace[parser.options.parse_job] = container_parse_dir

    args = [paths_to_replace.get(arg, arg) for arg in args]

    # Apply --use-existing-results, results directory is already created and
    # mounted in container. Apply this arg to avoid exception being raised.
    if '--use-existing-results' not in args:
        args.append('--use-existing-results')

    # Make sure autoserv running in container using a different pid file.
    if '--pidfile-label' not in args:
        args.extend(['--pidfile-label', 'container_autoserv'])

    # NOTE(review): arguments are joined without any shell quoting; confirm
    # that attach_run handles arguments containing whitespace.
    cmd_line = ' '.join(args)
    logging.info('Run command in container: %s', cmd_line)
    try:
        test_container.attach_run(cmd_line)
    finally:
        test_container.destroy()


def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp):
    """Run server job with given options.

    This function always terminates the process via sys.exit().

    @param pid_file_manager: PidFileManager used to monitor the autoserv process
    @param results: Folder to store results.
    @param parser: Parser for the command line arguments.
    @param ssp_url: Url to server-side package.
    @param use_ssp: Set to True to run with server-side packaging.
    """
    if parser.options.warn_no_ssp:
        # Post a warning in the log.
        logging.warning('Autoserv is required to run with server-side '
                        'packaging. However, no drone is found to support '
                        'server-side packaging. The test will be executed in '
                        'a drone without server-side packaging supported.')

    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group
    os.setpgrp()

    # Container name is predefined so the container can be destroyed in
    # handle_sigterm.
    job_or_task_id = job_directories.get_job_id_or_task_id(
            parser.options.results)
    container_name = (lxc.TEST_CONTAINER_NAME_FMT %
                      (job_or_task_id, time.time()))

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        """Record the abort, tear down the container and kill the group."""
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')
        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        if use_ssp:
            logging.debug('Destroy container %s before aborting the autoserv '
                          'process.', container_name)
            try:
                bucket = lxc.ContainerBucket()
                container = bucket.get(container_name)
                if container:
                    container.destroy()
                else:
                    logging.debug('Container %s is not found.', container_name)
            except:
                # Handle any exception so the autoserv process can be aborted.
                logging.error('Failed to destroy container %s. Error: %s',
                              container_name, sys.exc_info())

        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        pass

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being set
    # but depending on how you launch your autotest scheduler it may not be set.
    os.environ['USER'] = getpass.getuser()

    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    install_before = parser.options.install_before
    install_after = parser.options.install_after
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    no_tee = parser.options.no_tee
    parse_job = parser.options.parse_job
    execution_tag = parser.options.execution_tag
    if not execution_tag:
        execution_tag = parse_job
    host_protection = parser.options.host_protection
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    test_retry = parser.options.test_retry
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options
    no_use_packaging = parser.options.no_use_packaging

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error("Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo or
                       provision or reset)
    if len(parser.args) < 1 and not is_special_task:
        parser.parser.error("Missing argument: control file")

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    # We have a control file unless it's just a verify/repair/cleanup job
    if len(parser.args) > 0:
        control = parser.args[0]
    else:
        control = None

    machines = _get_machines(parser)
    if group_name and len(machines) < 2:
        parser.parser.error('-G %r may only be supplied with more than one '
                            'machine.' % group_name)

    kwargs = {'group_name': group_name, 'tag': execution_tag,
              'disable_sysinfo': parser.options.disable_sysinfo}
    if control_filename:
        kwargs['control_filename'] = control_filename
    job = server_job.server_job(control, parser.args[1:], results, label,
                                user, machines, client, parse_job,
                                ssh_user, ssh_port, ssh_pass,
                                ssh_verbosity_flag, ssh_options,
                                test_retry, **kwargs)

    job.logging.start_logging()
    job.init_parser()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    try:
        try:
            if repair:
                job.repair(host_protection, job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                if use_ssp:
                    try:
                        _run_with_ssp(container_name, job_or_task_id, results,
                                      parser, ssp_url)
                    finally:
                        # Update the ownership of files in result folder.
                        # TODO(dshi): crbug.com/459344 Skip following action
                        # when test container can be unprivileged container.
                        if results:
                            lxc.run('chown -R %s %s' % (os.getuid(), results))
                            lxc.run('chgrp -R %s %s' % (os.getgid(), results))
                else:
                    job.run(install_before, install_after,
                            verify_job_repo_url=verify_job_repo_url,
                            only_collect_crashinfo=collect_crashinfo,
                            skip_crash_collection=skip_crash_collection,
                            job_labels=job_labels,
                            use_packaging=(not no_use_packaging))
        finally:
            while job.hosts:
                host = job.hosts.pop()
                host.close()
    except:
        # Deliberately broad: any failure (including SystemExit raised by the
        # job) is reported as exit code 1 after printing the traceback.
        exit_code = 1
        traceback.print_exc()

    if pid_file_manager:
        pid_file_manager.num_tests_failed = job.num_tests_failed
        pid_file_manager.close_file(exit_code)
    job.cleanup_parser()

    sys.exit(exit_code)


def record_autoserv(options, duration_secs):
    """Record autoserv end-to-end time in metadata db.

    Nothing is recorded when more than one machine is given (atomic group).

    @param options: parser options.
    @param duration_secs: How long autoserv has taken, in secs.
    """
    # Get machine hostname
    machines = options.machines.replace(
            ',', ' ').strip().split() if options.machines else []
    num_machines = len(machines)
    if num_machines > 1:
        # Skip the case where atomic group is used.
        return
    elif num_machines == 0:
        machines.append('hostless')

    # Determine the status that will be reported.
    s = job_overhead.STATUS
    task_mapping = {
            'reset': s.RESETTING, 'verify': s.VERIFYING,
            'provision': s.PROVISIONING, 'repair': s.REPAIRING,
            'cleanup': s.CLEANING, 'collect_crashinfo': s.GATHERING}
    # option_dict will be like {'reset': True, 'repair': False, ...}
    # Read the attributes directly instead of round-tripping through
    # str()/literal_eval, which breaks on any option value that is not a
    # Python literal.
    option_dict = vars(options)
    match = [task for task in task_mapping if option_dict.get(task) == True]
    status = task_mapping[match[0]] if match else s.RUNNING
    is_special_task = status not in [s.RUNNING, s.GATHERING]
    job_or_task_id = job_directories.get_job_id_or_task_id(options.results)
    job_overhead.record_state_duration(
            job_or_task_id, machines[0], status, duration_secs,
            is_special_task=is_special_task)


def main():
    """Entry point: parse arguments, set up results/logging, run autoserv.

    Exits the process with the job's exit code. In testing mode the function
    returns after mocking results instead of running the job.
    """
    start_time = datetime.datetime.now()
    # White list of tests with run time measurement enabled.
    measure_run_time_tests_names = global_config.global_config.get_config_value(
            'AUTOSERV', 'measure_run_time_tests', type=str)
    if measure_run_time_tests_names:
        measure_run_time_tests = [t.strip() for t in
                                  measure_run_time_tests_names.split(',')]
    else:
        measure_run_time_tests = []
    # grab the parser
    parser = autoserv_parser.autoserv_parser
    parser.parse_args()

    if len(sys.argv) == 1:
        parser.parser.print_help()
        sys.exit(1)

    # If the job requires to run with server-side package, try to stage server-
    # side package first. If that fails with error that autotest server package
    # does not exist, fall back to run the job without using server-side
    # packaging. If option warn_no_ssp is specified, that means autoserv is
    # running in a drone does not support SSP, thus no need to stage server-side
    # package.
    ssp_url = None
    if (not parser.options.warn_no_ssp and parser.options.require_ssp):
        ssp_url = _stage_ssp(parser)
        if not ssp_url:
            # The build does not have autotest server package. Fall back to not
            # to use server-side package, reset logging to log in results
            # folder.
            logging.warning(
                    'Autoserv is required to run with server-side packaging. '
                    'However, no server-side package can be found based on '
                    '`--image`, host attribute job_repo_url or host label of '
                    'cros-version. The test will be executed without '
                    'server-side packaging supported.')

    # Bound up front so the --use-existing-results check below never hits an
    # unbound name when --no-logging skips the results-directory probing.
    resultdir_exists = False
    if parser.options.no_logging:
        results = None
    else:
        results = parser.options.results
        if not results:
            results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S')
        results = os.path.abspath(results)
        resultdir_exists = any(
                os.path.exists(os.path.join(results, filename))
                for filename in ('control.srv', 'status.log',
                                 '.autoserv_execute'))
        if not parser.options.use_existing_results and resultdir_exists:
            error = "Error: results directory already exists: %s\n" % results
            sys.stderr.write(error)
            sys.exit(1)

        # Now that we certified that there's no leftover results dir from
        # previous jobs, lets create the result dir since the logging system
        # needs to create the log file in there.
        if not os.path.isdir(results):
            os.makedirs(results)

    # Server-side packaging will only be used if it's required and the package
    # is available. If warn_no_ssp is specified, it means that autoserv is
    # running in a drone does not have SSP supported and a warning will be logs.
    # Therefore, it should not run with SSP.
    use_ssp = (not parser.options.warn_no_ssp and parser.options.require_ssp
               and ssp_url)
    if use_ssp:
        log_dir = os.path.join(results, 'wrapper') if results else None
        if log_dir and not os.path.exists(log_dir):
            os.makedirs(log_dir)
    else:
        log_dir = results
    logging_manager.configure_logging(
            server_logging_config.ServerLoggingConfig(),
            results_dir=log_dir,
            use_console=not parser.options.no_tee,
            verbose=parser.options.verbose,
            no_console_prefix=parser.options.no_console_prefix)

    if results:
        logging.info("Results placed in %s", results)

        # wait until now to perform this check, so it get properly logged
        if (parser.options.use_existing_results and not resultdir_exists and
            not lxc.is_in_container()):
            logging.error("No existing results directory found: %s", results)
            sys.exit(1)

    logging.debug('autoserv is running in drone %s.', socket.gethostname())
    logging.debug('autoserv command was: %s', ' '.join(sys.argv))

    if parser.options.write_pidfile and results:
        pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label,
                                                  results)
        pid_file_manager.open_file()
    else:
        pid_file_manager = None

    autotest.BaseAutotest.set_install_in_tmpdir(
            parser.options.install_in_tmpdir)

    timer = None
    # Bound up front: testing mode below reads test_name even when no control
    # file or no machine was given and the lookup is skipped.
    test_name = None
    try:
        # Take the first argument as control file name, get the test name from
        # the control file. If the test name exists in the list of tests with
        # run time measurement enabled, start a timer to begin measurement.
        if (len(parser.args) > 0 and parser.args[0] != '' and
            parser.options.machines):
            try:
                test_name = control_data.parse_control(parser.args[0],
                                                       raise_warnings=True).name
            except control_data.ControlVariableException:
                logging.debug('Failed to retrieve test name from control file.')
                test_name = None
            if test_name in measure_run_time_tests:
                machines = parser.options.machines.replace(',', ' '
                                                           ).strip().split()
                try:
                    afe = frontend.AFE()
                    board = server_utils.get_board_from_afe(machines[0], afe)
                    timer = autotest_stats.Timer('autoserv_run_time.%s.%s' %
                                                 (board, test_name))
                    timer.start()
                except (urllib2.HTTPError, urllib2.URLError):
                    # Ignore error if RPC failed to get board
                    pass
    except control_data.ControlVariableException as e:
        logging.error(str(e))
    exit_code = 0
    # TODO(beeps): Extend this to cover different failure modes.
    # Testing exceptions are matched against labels sent to autoserv. Eg,
    # to allow only the hostless job to run, specify
    # testing_exceptions: test_suite in the shadow_config. To allow both
    # the hostless job and dummy_Pass to run, specify
    # testing_exceptions: test_suite,dummy_Pass. You can figure out
    # what label autoserv is invoked with by looking through the logs of a test
    # for the autoserv command's -l option.
    testing_exceptions = global_config.global_config.get_config_value(
            'AUTOSERV', 'testing_exceptions', type=list, default=[])
    test_mode = global_config.global_config.get_config_value(
            'AUTOSERV', 'testing_mode', type=bool, default=False)
    test_mode = (results_mocker and test_mode and not
                 any([ex in parser.options.label
                      for ex in testing_exceptions]))
    is_task = (parser.options.verify or parser.options.repair or
               parser.options.provision or parser.options.reset or
               parser.options.cleanup or parser.options.collect_crashinfo)
    try:
        try:
            if test_mode:
                # The parser doesn't run on tasks anyway, so we can just return
                # happy signals without faking results.
                if not is_task:
                    machine = parser.options.results.split('/')[-1]

                    # TODO(beeps): The proper way to do this would be to
                    # refactor job creation so we can invoke job.record
                    # directly. To do that one needs to pipe the test_name
                    # through run_autoserv and bail just before invoking
                    # the server job. See the comment in
                    # puppylab/results_mocker for more context.
                    results_mocker.ResultsMocker(
                            test_name if test_name else 'unknown-test',
                            parser.options.results, machine
                            ).mock_results()
                # The finally clause below still runs before returning.
                return
            else:
                run_autoserv(pid_file_manager, results, parser, ssp_url,
                             use_ssp)
        except SystemExit as e:
            # run_autoserv always ends with sys.exit(); capture its code.
            exit_code = e.code
            if exit_code:
                logging.exception(e)
        except Exception as e:
            # If we don't know what happened, we'll classify it as
            # an 'abort' and return 1.
            logging.exception(e)
            exit_code = 1
    finally:
        if pid_file_manager:
            pid_file_manager.close_file(exit_code)
        if timer:
            timer.stop()
        # Record the autoserv duration time. Must be called
        # just before the system exits to ensure accuracy.
        duration_secs = (datetime.datetime.now() - start_time).total_seconds()
        record_autoserv(parser.options, duration_secs)
    sys.exit(exit_code)


if __name__ == '__main__':
    main()