autoserv revision d68d51c731ef83a45c173497b59a26b2e6870c1b
1#!/usr/bin/python -u 2# Copyright 2007-2008 Martin J. Bligh <mbligh@google.com>, Google Inc. 3# Released under the GPL v2 4 5""" 6Run a control file through the server side engine 7""" 8 9import ast 10import datetime 11import getpass 12import logging 13import os 14import re 15import signal 16import socket 17import sys 18import traceback 19import time 20import urllib2 21 22import common 23 24from autotest_lib.client.common_lib import control_data 25from autotest_lib.client.common_lib import global_config 26from autotest_lib.client.common_lib.cros.graphite import autotest_es 27from autotest_lib.client.common_lib.cros.graphite import autotest_stats 28try: 29 from autotest_lib.puppylab import results_mocker 30except ImportError: 31 results_mocker = None 32 33require_atfork = global_config.global_config.get_config_value( 34 'AUTOSERV', 'require_atfork_module', type=bool, default=True) 35 36 37# Number of seconds to wait before returning if testing mode is enabled 38TESTING_MODE_SLEEP_SECS = 1 39 40try: 41 import atfork 42 atfork.monkeypatch_os_fork_functions() 43 import atfork.stdlib_fixer 44 # Fix the Python standard library for threading+fork safety with its 45 # internal locks. http://code.google.com/p/python-atfork/ 46 import warnings 47 warnings.filterwarnings('ignore', 'logging module already imported') 48 atfork.stdlib_fixer.fix_logging_module() 49except ImportError, e: 50 from autotest_lib.client.common_lib import global_config 51 if global_config.global_config.get_config_value( 52 'AUTOSERV', 'require_atfork_module', type=bool, default=False): 53 print >>sys.stderr, 'Please run utils/build_externals.py' 54 print e 55 sys.exit(1) 56 57from autotest_lib.server import frontend 58from autotest_lib.server import server_logging_config 59from autotest_lib.server import server_job, utils, autoserv_parser, autotest 60from autotest_lib.server import utils as server_utils 61from autotest_lib.site_utils import job_directories 62from autotest_lib.site_utils import job_overhead 63from autotest_lib.site_utils import lxc 64from autotest_lib.client.common_lib import pidfile, logging_manager 65from autotest_lib.client.common_lib.cros.graphite import autotest_stats 66 67# Control segment to stage server-side package. 68STAGE_SERVER_SIDE_PACKAGE_CONTROL_FILE = server_job._control_segment_path( 69 'stage_server_side_package') 70 71def log_alarm(signum, frame): 72 logging.error("Received SIGALARM. Ignoring and continuing on.") 73 sys.exit(1) 74 75 76def _get_machines(parser): 77 """Get a list of machine names from command line arg -m or a file. 78 79 @param parser: Parser for the command line arguments. 80 81 @return: A list of machine names from command line arg -m or the 82 machines file specified in the command line arg -M. 83 """ 84 if parser.options.machines: 85 machines = parser.options.machines.replace(',', ' ').strip().split() 86 else: 87 machines = [] 88 machines_file = parser.options.machines_file 89 if machines_file: 90 machines = [] 91 for m in open(machines_file, 'r').readlines(): 92 # remove comments, spaces 93 m = re.sub('#.*', '', m).strip() 94 if m: 95 machines.append(m) 96 logging.debug('Read list of machines from file: %s', machines_file) 97 logging.debug('Machines: %s', ','.join(machines)) 98 99 if machines: 100 for machine in machines: 101 if not machine or re.search('\s', machine): 102 parser.parser.error("Invalid machine: %s" % str(machine)) 103 machines = list(set(machines)) 104 machines.sort() 105 return machines 106 107 108def _stage_ssp(parser): 109 """Stage server-side package. 110 111 This function calls a control segment to stage server-side package based on 112 the job and autoserv command line option. The detail implementation could 113 be different for each host type. Currently, only CrosHost has 114 stage_server_side_package function defined. 115 The script returns None if no server-side package is available. However, 116 it may raise exception if it failed for reasons other than artifact (the 117 server-side package) not found. 118 119 @param parser: Command line arguments parser passed in the autoserv process. 120 121 @return: url of the staged server-side package. Return None if server- 122 side package is not found for the build. 123 """ 124 namespace = {'machines': _get_machines(parser), 125 'image': parser.options.image} 126 script_locals = {} 127 execfile(STAGE_SERVER_SIDE_PACKAGE_CONTROL_FILE, namespace, script_locals) 128 return script_locals['ssp_url'] 129 130 131def _run_with_ssp(container_name, job_id, results, parser, ssp_url): 132 """Run the server job with server-side packaging. 133 134 @param container_name: Name of the container to run the test. 135 @param job_id: ID of the test job. 136 @param results: Folder to store results. This could be different from 137 parser.options.results: 138 parser.options.results can be set to None for results to be 139 stored in a temp folder. 140 results can be None for autoserv run requires no logging. 141 @param parser: Command line parser that contains the options. 142 @param ssp_url: url of the staged server-side package. 143 """ 144 bucket = lxc.ContainerBucket() 145 control = (parser.args[0] if len(parser.args) > 0 and parser.args[0] != '' 146 else None) 147 test_container = bucket.setup_test(container_name, job_id, ssp_url, results, 148 control=control) 149 args = sys.argv[:] 150 args.remove('--require-ssp') 151 152 # A dictionary of paths to replace in the command line. Key is the path to 153 # be replaced with the one in value. 154 paths_to_replace = {} 155 # Replace the control file path with the one in container. 156 if control: 157 container_control_filename = os.path.join( 158 lxc.CONTROL_TEMP_PATH, os.path.basename(control)) 159 paths_to_replace[control] = container_control_filename 160 # Update result directory with the one in container. 161 if parser.options.results: 162 container_result_dir = os.path.join(lxc.RESULT_DIR_FMT % job_id) 163 paths_to_replace[parser.options.results] = container_result_dir 164 # Update parse_job directory with the one in container. The assumption is 165 # that the result folder to be parsed is always the same as the results_dir. 166 if parser.options.parse_job: 167 container_parse_dir = os.path.join(lxc.RESULT_DIR_FMT % job_id) 168 paths_to_replace[parser.options.parse_job] = container_result_dir 169 170 args = [paths_to_replace.get(arg, arg) for arg in args] 171 172 # Apply --use-existing-results, results directory is aready created and 173 # mounted in container. Apply this arg to avoid exception being raised. 174 if not '--use-existing-results' in args: 175 args.append('--use-existing-results') 176 177 # Make sure autoserv running in container using a different pid file. 178 if not '--pidfile-label' in args: 179 args.extend(['--pidfile-label', 'container_autoserv']) 180 181 cmd_line = ' '.join(["'%s'" % arg if ' ' in arg else arg for arg in args]) 182 logging.info('Run command in container: %s', cmd_line) 183 success = False 184 try: 185 test_container.attach_run(cmd_line) 186 success = True 187 finally: 188 counter_key = '%s.%s' % (lxc.STATS_KEY, 189 'success' if success else 'fail') 190 autotest_stats.Counter(counter_key).increment() 191 # metadata is uploaded separately so it can use http to upload. 192 metadata = {'drone': socket.gethostname(), 193 'job_id': job_id, 194 'success': success} 195 autotest_es.post(use_http=True, 196 type_str=lxc.CONTAINER_RUN_TEST_METADB_TYPE, 197 metadata=metadata) 198 test_container.destroy() 199 200 201def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp): 202 """Run server job with given options. 203 204 @param pid_file_manager: PidFileManager used to monitor the autoserv process 205 @param results: Folder to store results. 206 @param parser: Parser for the command line arguments. 207 @param ssp_url: Url to server-side package. 208 @param use_ssp: Set to True to run with server-side packaging. 209 """ 210 if parser.options.warn_no_ssp: 211 # Post a warning in the log. 212 logging.warn('Autoserv is required to run with server-side packaging. ' 213 'However, no drone is found to support server-side ' 214 'packaging. The test will be executed in a drone without ' 215 'server-side packaging supported.') 216 217 # send stdin to /dev/null 218 dev_null = os.open(os.devnull, os.O_RDONLY) 219 os.dup2(dev_null, sys.stdin.fileno()) 220 os.close(dev_null) 221 222 # Create separate process group 223 os.setpgrp() 224 225 # Container name is predefined so the container can be destroyed in 226 # handle_sigterm. 227 job_or_task_id = job_directories.get_job_id_or_task_id( 228 parser.options.results) 229 container_name = (lxc.TEST_CONTAINER_NAME_FMT % 230 (job_or_task_id, time.time(), os.getpid())) 231 232 # Implement SIGTERM handler 233 def handle_sigterm(signum, frame): 234 logging.debug('Received SIGTERM') 235 if pid_file_manager: 236 pid_file_manager.close_file(1, signal.SIGTERM) 237 logging.debug('Finished writing to pid_file. Killing process.') 238 # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved. 239 # This sleep allows the pending output to be logged before the kill 240 # signal is sent. 241 time.sleep(.1) 242 if use_ssp: 243 logging.debug('Destroy container %s before aborting the autoserv ' 244 'process.', container_name) 245 try: 246 bucket = lxc.ContainerBucket() 247 container = bucket.get(container_name) 248 if container: 249 container.destroy() 250 else: 251 logging.debug('Container %s is not found.', container_name) 252 except: 253 # Handle any exception so the autoserv process can be aborted. 254 logging.error('Failed to destroy container %s. Error: %s', 255 container_name, sys.exc_info()) 256 257 os.killpg(os.getpgrp(), signal.SIGKILL) 258 259 # Set signal handler 260 signal.signal(signal.SIGTERM, handle_sigterm) 261 262 # faulthandler is only needed to debug in the Lab and is not avaliable to 263 # be imported in the chroot as part of VMTest, so Try-Except it. 264 try: 265 import faulthandler 266 faulthandler.register(signal.SIGTERM, all_threads=True, chain=True) 267 logging.debug('faulthandler registered on SIGTERM.') 268 except ImportError: 269 pass 270 271 # Ignore SIGTTOU's generated by output from forked children. 272 signal.signal(signal.SIGTTOU, signal.SIG_IGN) 273 274 # If we received a SIGALARM, let's be loud about it. 275 signal.signal(signal.SIGALRM, log_alarm) 276 277 # Server side tests that call shell scripts often depend on $USER being set 278 # but depending on how you launch your autotest scheduler it may not be set. 279 os.environ['USER'] = getpass.getuser() 280 281 label = parser.options.label 282 group_name = parser.options.group_name 283 user = parser.options.user 284 client = parser.options.client 285 server = parser.options.server 286 install_before = parser.options.install_before 287 install_after = parser.options.install_after 288 verify = parser.options.verify 289 repair = parser.options.repair 290 cleanup = parser.options.cleanup 291 provision = parser.options.provision 292 reset = parser.options.reset 293 job_labels = parser.options.job_labels 294 no_tee = parser.options.no_tee 295 parse_job = parser.options.parse_job 296 execution_tag = parser.options.execution_tag 297 if not execution_tag: 298 execution_tag = parse_job 299 host_protection = parser.options.host_protection 300 ssh_user = parser.options.ssh_user 301 ssh_port = parser.options.ssh_port 302 ssh_pass = parser.options.ssh_pass 303 collect_crashinfo = parser.options.collect_crashinfo 304 control_filename = parser.options.control_filename 305 test_retry = parser.options.test_retry 306 verify_job_repo_url = parser.options.verify_job_repo_url 307 skip_crash_collection = parser.options.skip_crash_collection 308 ssh_verbosity = int(parser.options.ssh_verbosity) 309 ssh_options = parser.options.ssh_options 310 no_use_packaging = parser.options.no_use_packaging 311 312 # can't be both a client and a server side test 313 if client and server: 314 parser.parser.error("Can not specify a test as both server and client!") 315 316 if provision and client: 317 parser.parser.error("Cannot specify provisioning and client!") 318 319 is_special_task = (verify or repair or cleanup or collect_crashinfo or 320 provision or reset) 321 if len(parser.args) < 1 and not is_special_task: 322 parser.parser.error("Missing argument: control file") 323 324 if ssh_verbosity > 0: 325 # ssh_verbosity is an integer between 0 and 3, inclusive 326 ssh_verbosity_flag = '-' + 'v' * ssh_verbosity 327 else: 328 ssh_verbosity_flag = '' 329 330 # We have a control file unless it's just a verify/repair/cleanup job 331 if len(parser.args) > 0: 332 control = parser.args[0] 333 else: 334 control = None 335 336 machines = _get_machines(parser) 337 if group_name and len(machines) < 2: 338 parser.parser.error('-G %r may only be supplied with more than one ' 339 'machine.' % group_name) 340 341 kwargs = {'group_name': group_name, 'tag': execution_tag, 342 'disable_sysinfo': parser.options.disable_sysinfo} 343 if control_filename: 344 kwargs['control_filename'] = control_filename 345 job = server_job.server_job(control, parser.args[1:], results, label, 346 user, machines, client, parse_job, 347 ssh_user, ssh_port, ssh_pass, 348 ssh_verbosity_flag, ssh_options, 349 test_retry, **kwargs) 350 351 job.logging.start_logging() 352 job.init_parser() 353 354 # perform checks 355 job.precheck() 356 357 # run the job 358 exit_code = 0 359 try: 360 try: 361 if repair: 362 job.repair(host_protection, job_labels) 363 elif verify: 364 job.verify(job_labels) 365 elif provision: 366 job.provision(job_labels) 367 elif reset: 368 job.reset(job_labels) 369 elif cleanup: 370 job.cleanup(job_labels) 371 else: 372 if use_ssp: 373 try: 374 _run_with_ssp(container_name, job_or_task_id, results, 375 parser, ssp_url) 376 finally: 377 # Update the ownership of files in result folder. 378 # TODO(dshi): crbug.com/459344 Skip following action 379 # when test container can be unprivileged container. 380 if results: 381 lxc.run('chown -R %s %s' % (os.getuid(), results)) 382 lxc.run('chgrp -R %s %s' % (os.getgid(), results)) 383 else: 384 job.run(install_before, install_after, 385 verify_job_repo_url=verify_job_repo_url, 386 only_collect_crashinfo=collect_crashinfo, 387 skip_crash_collection=skip_crash_collection, 388 job_labels=job_labels, 389 use_packaging=(not no_use_packaging)) 390 finally: 391 while job.hosts: 392 host = job.hosts.pop() 393 host.close() 394 except: 395 exit_code = 1 396 traceback.print_exc() 397 398 if pid_file_manager: 399 pid_file_manager.num_tests_failed = job.num_tests_failed 400 pid_file_manager.close_file(exit_code) 401 job.cleanup_parser() 402 403 sys.exit(exit_code) 404 405 406def record_autoserv(options, duration_secs): 407 """Record autoserv end-to-end time in metadata db. 408 409 @param options: parser options. 410 @param duration_secs: How long autoserv has taken, in secs. 411 """ 412 # Get machine hostname 413 machines = options.machines.replace( 414 ',', ' ').strip().split() if options.machines else [] 415 num_machines = len(machines) 416 if num_machines > 1: 417 # Skip the case where atomic group is used. 418 return 419 elif num_machines == 0: 420 machines.append('hostless') 421 422 # Determine the status that will be reported. 423 s = job_overhead.STATUS 424 task_mapping = { 425 'reset': s.RESETTING, 'verify': s.VERIFYING, 426 'provision': s.PROVISIONING, 'repair': s.REPAIRING, 427 'cleanup': s.CLEANING, 'collect_crashinfo': s.GATHERING} 428 # option_dict will be like {'reset': True, 'repair': False, ...} 429 option_dict = ast.literal_eval(str(options)) 430 match = filter(lambda task: option_dict.get(task) == True, task_mapping) 431 status = task_mapping[match[0]] if match else s.RUNNING 432 is_special_task = status not in [s.RUNNING, s.GATHERING] 433 job_or_task_id = job_directories.get_job_id_or_task_id(options.results) 434 job_overhead.record_state_duration( 435 job_or_task_id, machines[0], status, duration_secs, 436 is_special_task=is_special_task) 437 438 439def main(): 440 start_time = datetime.datetime.now() 441 # White list of tests with run time measurement enabled. 442 measure_run_time_tests_names = global_config.global_config.get_config_value( 443 'AUTOSERV', 'measure_run_time_tests', type=str) 444 if measure_run_time_tests_names: 445 measure_run_time_tests = [t.strip() for t in 446 measure_run_time_tests_names.split(',')] 447 else: 448 measure_run_time_tests = [] 449 # grab the parser 450 parser = autoserv_parser.autoserv_parser 451 parser.parse_args() 452 453 if len(sys.argv) == 1: 454 parser.parser.print_help() 455 sys.exit(1) 456 457 # If the job requires to run with server-side package, try to stage server- 458 # side package first. If that fails with error that autotest server package 459 # does not exist, fall back to run the job without using server-side 460 # packaging. If option warn_no_ssp is specified, that means autoserv is 461 # running in a drone does not support SSP, thus no need to stage server-side 462 # package. 463 ssp_url = None 464 ssp_url_warning = False 465 if (not parser.options.warn_no_ssp and parser.options.require_ssp): 466 ssp_url = _stage_ssp(parser) 467 # The build does not have autotest server package. Fall back to not 468 # to use server-side package. Logging is postponed until logging being 469 # set up. 470 ssp_url_warning = not ssp_url 471 472 if parser.options.no_logging: 473 results = None 474 else: 475 results = parser.options.results 476 if not results: 477 results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S') 478 results = os.path.abspath(results) 479 resultdir_exists = False 480 for filename in ('control.srv', 'status.log', '.autoserv_execute'): 481 if os.path.exists(os.path.join(results, filename)): 482 resultdir_exists = True 483 if not parser.options.use_existing_results and resultdir_exists: 484 error = "Error: results directory already exists: %s\n" % results 485 sys.stderr.write(error) 486 sys.exit(1) 487 488 # Now that we certified that there's no leftover results dir from 489 # previous jobs, lets create the result dir since the logging system 490 # needs to create the log file in there. 491 if not os.path.isdir(results): 492 os.makedirs(results) 493 494 # Server-side packaging will only be used if it's required and the package 495 # is available. If warn_no_ssp is specified, it means that autoserv is 496 # running in a drone does not have SSP supported and a warning will be logs. 497 # Therefore, it should not run with SSP. 498 use_ssp = (not parser.options.warn_no_ssp and parser.options.require_ssp 499 and ssp_url) 500 if use_ssp: 501 log_dir = os.path.join(results, 'wrapper') if results else None 502 if log_dir and not os.path.exists(log_dir): 503 os.makedirs(log_dir) 504 else: 505 log_dir = results 506 logging_manager.configure_logging( 507 server_logging_config.ServerLoggingConfig(), 508 results_dir=log_dir, 509 use_console=not parser.options.no_tee, 510 verbose=parser.options.verbose, 511 no_console_prefix=parser.options.no_console_prefix) 512 513 if ssp_url_warning: 514 logging.warn( 515 'Autoserv is required to run with server-side packaging. ' 516 'However, no server-side package can be found based on ' 517 '`--image`, host attribute job_repo_url or host label of ' 518 'cros-version. The test will be executed without ' 519 'server-side packaging supported.') 520 521 if results: 522 logging.info("Results placed in %s" % results) 523 524 # wait until now to perform this check, so it get properly logged 525 if (parser.options.use_existing_results and not resultdir_exists and 526 not lxc.is_in_container()): 527 logging.error("No existing results directory found: %s", results) 528 sys.exit(1) 529 530 logging.debug('autoserv is running in drone %s.', socket.gethostname()) 531 logging.debug('autoserv command was: %s', ' '.join(sys.argv)) 532 533 if parser.options.write_pidfile and results: 534 pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label, 535 results) 536 pid_file_manager.open_file() 537 else: 538 pid_file_manager = None 539 540 autotest.BaseAutotest.set_install_in_tmpdir( 541 parser.options.install_in_tmpdir) 542 543 timer = None 544 try: 545 # Take the first argument as control file name, get the test name from 546 # the control file. If the test name exists in the list of tests with 547 # run time measurement enabled, start a timer to begin measurement. 548 if (len(parser.args) > 0 and parser.args[0] != '' and 549 parser.options.machines): 550 try: 551 test_name = control_data.parse_control(parser.args[0], 552 raise_warnings=True).name 553 except control_data.ControlVariableException: 554 logging.debug('Failed to retrieve test name from control file.') 555 test_name = None 556 if test_name in measure_run_time_tests: 557 machines = parser.options.machines.replace(',', ' ' 558 ).strip().split() 559 try: 560 afe = frontend.AFE() 561 board = server_utils.get_board_from_afe(machines[0], afe) 562 timer = autotest_stats.Timer('autoserv_run_time.%s.%s' % 563 (board, test_name)) 564 timer.start() 565 except (urllib2.HTTPError, urllib2.URLError): 566 # Ignore error if RPC failed to get board 567 pass 568 except control_data.ControlVariableException as e: 569 logging.error(str(e)) 570 exit_code = 0 571 # TODO(beeps): Extend this to cover different failure modes. 572 # Testing exceptions are matched against labels sent to autoserv. Eg, 573 # to allow only the hostless job to run, specify 574 # testing_exceptions: test_suite in the shadow_config. To allow both 575 # the hostless job and dummy_Pass to run, specify 576 # testing_exceptions: test_suite,dummy_Pass. You can figure out 577 # what label autoserv is invoked with by looking through the logs of a test 578 # for the autoserv command's -l option. 579 testing_exceptions = global_config.global_config.get_config_value( 580 'AUTOSERV', 'testing_exceptions', type=list, default=[]) 581 test_mode = global_config.global_config.get_config_value( 582 'AUTOSERV', 'testing_mode', type=bool, default=False) 583 test_mode = (results_mocker and test_mode and not 584 any([ex in parser.options.label 585 for ex in testing_exceptions])) 586 is_task = (parser.options.verify or parser.options.repair or 587 parser.options.provision or parser.options.reset or 588 parser.options.cleanup or parser.options.collect_crashinfo) 589 try: 590 try: 591 if test_mode: 592 # The parser doesn't run on tasks anyway, so we can just return 593 # happy signals without faking results. 594 if not is_task: 595 machine = parser.options.results.split('/')[-1] 596 597 # TODO(beeps): The proper way to do this would be to 598 # refactor job creation so we can invoke job.record 599 # directly. To do that one needs to pipe the test_name 600 # through run_autoserv and bail just before invoking 601 # the server job. See the comment in 602 # puppylab/results_mocker for more context. 603 results_mocker.ResultsMocker( 604 test_name if test_name else 'unknown-test', 605 parser.options.results, machine 606 ).mock_results() 607 return 608 else: 609 run_autoserv(pid_file_manager, results, parser, ssp_url, 610 use_ssp) 611 except SystemExit as e: 612 exit_code = e.code 613 if exit_code: 614 logging.exception(e) 615 except Exception as e: 616 # If we don't know what happened, we'll classify it as 617 # an 'abort' and return 1. 618 logging.exception(e) 619 exit_code = 1 620 finally: 621 if pid_file_manager: 622 pid_file_manager.close_file(exit_code) 623 if timer: 624 timer.stop() 625 # Record the autoserv duration time. Must be called 626 # just before the system exits to ensure accuracy. 627 duration_secs = (datetime.datetime.now() - start_time).total_seconds() 628 record_autoserv(parser.options, duration_secs) 629 sys.exit(exit_code) 630 631 632if __name__ == '__main__': 633 main() 634