autoserv revision c68fefb789c6d6bf936ea5e4270401cca6411ac4
1#!/usr/bin/python -u
2# Copyright 2007-2008 Martin J. Bligh <mbligh@google.com>, Google Inc.
3# Released under the GPL v2
4
5"""
6Run a control file through the server side engine
7"""
8
9import ast
10import datetime
11import getpass
12import logging
13import os
14import re
15import signal
16import socket
17import sys
18import traceback
19import time
20import urllib2
21
22import common
23
24from autotest_lib.client.common_lib import control_data
25from autotest_lib.client.common_lib import global_config
26try:
27    from autotest_lib.puppylab import results_mocker
28except ImportError:
29    results_mocker = None
30
31require_atfork = global_config.global_config.get_config_value(
32        'AUTOSERV', 'require_atfork_module', type=bool, default=True)
33
34
35# Number of seconds to wait before returning if testing mode is enabled
36TESTING_MODE_SLEEP_SECS = 1
37
# Fix the Python standard library for threading+fork safety with its
# internal locks.  http://code.google.com/p/python-atfork/
try:
    import atfork
    atfork.monkeypatch_os_fork_functions()
    import atfork.stdlib_fixer
    import warnings
    # The logging module is already imported at this point; suppress the
    # warning whose message starts with 'logging module already imported'.
    warnings.filterwarnings('ignore', 'logging module already imported')
    atfork.stdlib_fixer.fix_logging_module()
except ImportError as e:
    from autotest_lib.client.common_lib import global_config
    # atfork is optional unless the config explicitly requires it.
    if global_config.global_config.get_config_value(
            'AUTOSERV', 'require_atfork_module', type=bool, default=False):
        # Send both the hint and the import error to stderr so they stay
        # together and do not pollute stdout.
        sys.stderr.write('Please run utils/build_externals.py\n')
        sys.stderr.write('%s\n' % (e,))
        sys.exit(1)
54
55from autotest_lib.server import frontend
56from autotest_lib.server import server_logging_config
57from autotest_lib.server import server_job, utils, autoserv_parser, autotest
58from autotest_lib.server import utils as server_utils
59from autotest_lib.site_utils import job_directories
60from autotest_lib.site_utils import job_overhead
61from autotest_lib.site_utils import lxc
62from autotest_lib.client.common_lib import pidfile, logging_manager
63from autotest_lib.client.common_lib.cros.graphite import autotest_stats
64
65# Control segment to stage server-side package.
66STAGE_SERVER_SIDE_PACKAGE_CONTROL_FILE = server_job._control_segment_path(
67        'stage_server_side_package')
68
def log_alarm(signum, frame):
    """Signal handler for SIGALRM: log the event and terminate the process.

    @param signum: Number of the signal being handled (unused).
    @param frame: Stack frame interrupted by the signal (unused).

    @raises SystemExit: Always, with exit status 1.
    """
    # The handler does not resume execution, it exits; say so in the log
    # rather than claiming the signal is ignored.
    logging.error("Received SIGALARM. Exiting with status 1.")
    sys.exit(1)
72
73
def _get_machines(parser):
    """Get a list of machine names from command line arg -m or a file.

    When a machines file is given it takes precedence: the names passed with
    -m are discarded and the list is read from the file instead. In the file,
    everything after a '#' on a line is a comment and blank lines are skipped.

    @param parser: Parser for the command line arguments.

    @return: A sorted list of unique machine names from command line arg -m
             or the machines file specified in the command line arg -M.
             Empty list if neither is given.
    """
    machines_file = parser.options.machines_file
    if machines_file:
        # Use a context manager so the file handle is closed deterministically.
        with open(machines_file, 'r') as machines_fd:
            # remove comments, spaces
            stripped = [re.sub('#.*', '', line).strip()
                        for line in machines_fd]
        machines = [name for name in stripped if name]
        logging.debug('Read list of machines from file: %s', machines_file)
        logging.debug('Machines: %s', ','.join(machines))
    elif parser.options.machines:
        # Machine names may be separated by commas and/or whitespace.
        machines = parser.options.machines.replace(',', ' ').strip().split()
    else:
        machines = []

    for machine in machines:
        if not machine or re.search(r'\s', machine):
            parser.parser.error("Invalid machine: %s" % str(machine))
    # De-duplicate and return in a stable order.
    return sorted(set(machines))
104
105
def _stage_ssp(parser):
    """Stage server-side package.

    Executes the stage_server_side_package control segment with the job's
    machines and the --image option, then reads the resulting url back from
    the variables the segment assigned. How staging is done may differ per
    host type; currently only CrosHost defines stage_server_side_package.
    The segment yields None when no server-side package is available, but it
    may raise if staging failed for a reason other than the artifact (the
    server-side package) not being found.

    @param parser: Command line arguments parser passed in the autoserv process.

    @return: url of the staged server-side package. Return None if server-
             side package is not found for the build.
    """
    # Names exposed to the control segment as its globals.
    segment_globals = {
        'machines': _get_machines(parser),
        'image': parser.options.image,
    }
    # Receives the names the control segment assigns at its top level.
    segment_locals = {}
    execfile(STAGE_SERVER_SIDE_PACKAGE_CONTROL_FILE, segment_globals,
             segment_locals)
    return segment_locals['ssp_url']
127
128
def _run_with_ssp(container_name, job_id, results, parser, ssp_url):
    """Run the server job with server-side packaging.

    Sets up a test container, rebuilds the current autoserv command line with
    host paths swapped for their in-container equivalents, runs that command
    inside the container and finally destroys the container.

    @param container_name: Name of the container to run the test.
    @param job_id: ID of the test job.
    @param results: Folder to store results. This could be different from
                    parser.options.results:
                    parser.options.results  can be set to None for results to be
                    stored in a temp folder.
                    results can be None for autoserv run requires no logging.
    @param parser: Command line parser that contains the options.
    @param ssp_url: url of the staged server-side package.
    """
    bucket = lxc.ContainerBucket()
    control = (parser.args[0] if len(parser.args) > 0 and parser.args[0] != ''
               else None)
    test_container = bucket.setup_test(container_name, job_id, ssp_url, results,
                                       control=control)

    # The command is re-run inside the container, so the flag that requested
    # the container is dropped. Filtering (rather than list.remove) does not
    # raise when the literal flag is absent from sys.argv.
    args = [arg for arg in sys.argv if arg != '--require-ssp']

    # Result directory as seen from inside the container.
    container_result_dir = lxc.RESULT_DIR_FMT % job_id

    # A dictionary of paths to replace in the command line. Key is the path to
    # be replaced with the one in value.
    paths_to_replace = {}
    # Replace the control file path with the one in container.
    if control:
        container_control_filename = os.path.join(
                lxc.CONTROL_TEMP_PATH, os.path.basename(control))
        paths_to_replace[control] = container_control_filename
    # Update result directory with the one in container.
    if parser.options.results:
        paths_to_replace[parser.options.results] = container_result_dir
    # Update parse_job directory with the one in container. The assumption is
    # that the result folder to be parsed is always the same as the results_dir.
    # container_result_dir is computed unconditionally above so this branch
    # works even when no results option was given.
    if parser.options.parse_job:
        paths_to_replace[parser.options.parse_job] = container_result_dir

    args = [paths_to_replace.get(arg, arg) for arg in args]

    # Apply --use-existing-results, results directory is already created and
    # mounted in container. Apply this arg to avoid exception being raised.
    if '--use-existing-results' not in args:
        args.append('--use-existing-results')

    # Make sure autoserv running in container using a different pid file.
    if '--pidfile-label' not in args:
        args.extend(['--pidfile-label', 'container_autoserv'])

    # NOTE(review): arguments are joined without shell quoting, so an argument
    # containing whitespace would be split by the receiving shell -- confirm
    # how attach_run interprets the command line.
    cmd_line = ' '.join(args)
    logging.info('Run command in container: %s', cmd_line)
    try:
        test_container.attach_run(cmd_line)
    finally:
        # Always tear the container down, even if the run raised.
        test_container.destroy()
185
186
def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp):
    """Run server job with given options.

    Detaches stdin, puts the process in its own process group, installs the
    signal handlers, builds a server_job from the command line options and
    runs either one special task (repair/verify/provision/reset/cleanup) or
    the job itself, with or without server-side packaging. This function does
    not return: it always ends in sys.exit().

    @param pid_file_manager: PidFileManager used to monitor the autoserv process
    @param results: Folder to store results.
    @param parser: Parser for the command line arguments.
    @param ssp_url: Url to server-side package.
    @param use_ssp: Set to True to run with server-side packaging.

    @raises SystemExit: Always. The exit code is 0 on success and 1 if running
                        the job raised any exception.
    """
    if parser.options.warn_no_ssp:
        # Post a warning in the log.
        logging.warn('Autoserv is required to run with server-side packaging. '
                     'However, no drone is found to support server-side '
                     'packaging. The test will be executed in a drone without '
                     'server-side packaging supported.')

    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group
    os.setpgrp()

    # Container name is predefined so the container can be destroyed in
    # handle_sigterm.
    job_or_task_id = job_directories.get_job_id_or_task_id(
            parser.options.results)
    container_name = (lxc.TEST_CONTAINER_NAME_FMT %
                      (job_or_task_id, time.time()))

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')
        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        if use_ssp:
            logging.debug('Destroy container %s before aborting the autoserv '
                          'process.', container_name)
            try:
                bucket = lxc.ContainerBucket()
                container = bucket.get(container_name)
                if container:
                    container.destroy()
                else:
                    logging.debug('Container %s is not found.', container_name)
            except:
                # Handle any exception so the autoserv process can be aborted.
                logging.error('Failed to destroy container %s. Error: %s',
                              container_name, sys.exc_info())

        # SIGKILL the whole process group created by os.setpgrp() above; this
        # includes the autoserv process itself.
        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        # chain=True is passed so that the SIGTERM handler installed above is
        # still invoked after faulthandler dumps the tracebacks.
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        pass

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being set
    # but depending on how you launch your autotest scheduler it may not be set.
    os.environ['USER'] = getpass.getuser()

    # Pull the command line options into locals.
    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    install_before = parser.options.install_before
    install_after = parser.options.install_after
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    no_tee = parser.options.no_tee
    parse_job = parser.options.parse_job
    execution_tag = parser.options.execution_tag
    # Fall back to the parse_job value when no execution tag was given.
    if not execution_tag:
        execution_tag = parse_job
    host_protection = parser.options.host_protection
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    test_retry = parser.options.test_retry
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options
    no_use_packaging = parser.options.no_use_packaging

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error("Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    # Special tasks do not need a control file argument.
    is_special_task = (verify or repair or cleanup or collect_crashinfo or
                       provision or reset)
    if len(parser.args) < 1 and not is_special_task:
        parser.parser.error("Missing argument: control file")

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    # We have a control file unless it's just a verify/repair/cleanup job
    if len(parser.args) > 0:
        control = parser.args[0]
    else:
        control = None

    machines = _get_machines(parser)
    if group_name and len(machines) < 2:
        parser.parser.error('-G %r may only be supplied with more than one '
                            'machine.' % group_name)

    kwargs = {'group_name': group_name, 'tag': execution_tag,
              'disable_sysinfo': parser.options.disable_sysinfo}
    # Only pass control_filename through when it was given on the command line.
    if control_filename:
        kwargs['control_filename'] = control_filename
    job = server_job.server_job(control, parser.args[1:], results, label,
                                user, machines, client, parse_job,
                                ssh_user, ssh_port, ssh_pass,
                                ssh_verbosity_flag, ssh_options,
                                test_retry, **kwargs)

    job.logging.start_logging()
    job.init_parser()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    try:
        try:
            # At most one special task is run; repair takes priority, then
            # verify, provision, reset and cleanup.
            if repair:
                job.repair(host_protection, job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                if use_ssp:
                    try:
                        _run_with_ssp(container_name, job_or_task_id, results,
                                      parser, ssp_url)
                    finally:
                        # Update the ownership of files in result folder.
                        # TODO(dshi): crbug.com/459344 Skip following action
                        # when test container can be unprivileged container.
                        if results:
                            lxc.run('chown -R %s %s' % (os.getuid(), results))
                            lxc.run('chgrp -R %s %s' % (os.getgid(), results))
                else:
                    job.run(install_before, install_after,
                            verify_job_repo_url=verify_job_repo_url,
                            only_collect_crashinfo=collect_crashinfo,
                            skip_crash_collection=skip_crash_collection,
                            job_labels=job_labels,
                            use_packaging=(not no_use_packaging))
        finally:
            # Close every host the job opened, whether or not the run failed.
            while job.hosts:
                host = job.hosts.pop()
                host.close()
    except:
        # The bare except also catches SystemExit and KeyboardInterrupt; any
        # failure is reported as exit code 1 after printing the traceback.
        exit_code = 1
        traceback.print_exc()

    if pid_file_manager:
        pid_file_manager.num_tests_failed = job.num_tests_failed
        pid_file_manager.close_file(exit_code)
    job.cleanup_parser()

    sys.exit(exit_code)
390
391
def record_autoserv(options, duration_secs):
    """Record autoserv end-to-end time in metadata db.

    Nothing is recorded when more than one machine is given. With no machine
    the duration is recorded against the pseudo host name 'hostless'.

    @param options: parser options.
    @param duration_secs: How long autoserv has taken, in secs.
    """
    # Get machine hostname
    machines = (options.machines.replace(',', ' ').strip().split()
                if options.machines else [])
    if len(machines) > 1:
        # Skip the case where atomic group is used.
        return
    hostname = machines[0] if machines else 'hostless'

    # Determine the status that will be reported.
    s = job_overhead.STATUS
    # Kept as an ordered sequence so that the reported status is deterministic
    # should more than one task flag be set.
    task_statuses = (
            ('reset', s.RESETTING), ('verify', s.VERIFYING),
            ('provision', s.PROVISIONING), ('repair', s.REPAIRING),
            ('cleanup', s.CLEANING), ('collect_crashinfo', s.GATHERING))
    status = s.RUNNING
    for task, task_status in task_statuses:
        # Read the flag straight off the options object instead of
        # round-tripping it through str()/ast.literal_eval(), which breaks as
        # soon as any option value is not a Python literal.
        if getattr(options, task, None) is True:
            status = task_status
            break

    # Running a test and gathering crash info are not counted as special tasks.
    is_special_task = status not in [s.RUNNING, s.GATHERING]
    job_or_task_id = job_directories.get_job_id_or_task_id(options.results)
    job_overhead.record_state_duration(
            job_or_task_id, hostname, status, duration_secs,
            is_special_task=is_special_task)
423
424
def main():
    """Entry point of autoserv.

    Parses the command line, optionally stages the server-side package,
    prepares the results directory and logging, opens the pid file, optionally
    starts a run time timer, and then either mocks the results (testing mode)
    or hands over to run_autoserv(). On the way out it closes the pid file,
    stops the timer and records the total duration.

    @raises SystemExit: With status 1 when no arguments are given, when the
                        results directory already holds results and
                        --use-existing-results is not set, or when
                        --use-existing-results is set but no results exist
                        (outside a container); otherwise with the exit code of
                        the run. In testing mode the function returns normally
                        instead.
    """
    start_time = datetime.datetime.now()
    # White list of tests with run time measurement enabled.
    measure_run_time_tests_names = global_config.global_config.get_config_value(
                        'AUTOSERV', 'measure_run_time_tests', type=str)
    if measure_run_time_tests_names:
        measure_run_time_tests = [t.strip() for t in
                                  measure_run_time_tests_names.split(',')]
    else:
        measure_run_time_tests = []
    # grab the parser
    parser = autoserv_parser.autoserv_parser
    parser.parse_args()

    if len(sys.argv) == 1:
        parser.parser.print_help()
        sys.exit(1)

    # If the job requires to run with server-side package, try to stage server-
    # side package first. If that fails with error that autotest server package
    # does not exist, fall back to run the job without using server-side
    # packaging. If option warn_no_ssp is specified, that means autoserv is
    # running in a drone does not support SSP, thus no need to stage server-side
    # package.
    ssp_url = None
    if (not parser.options.warn_no_ssp and parser.options.require_ssp):
        ssp_url = _stage_ssp(parser)
        if not ssp_url:
            # The build does not have autotest server package. Fall back to not
            # to use server-side package, reset logging to log in results
            # folder.
            logging.warn(
                    'Autoserv is required to run with server-side packaging. '
                    'However, no server-side package can be found based on '
                    '`--image`, host attribute job_repo_url or host label of '
                    'cros-version. The test will be executed without '
                    'server-side packaging supported.')

    if parser.options.no_logging:
        results = None
    else:
        results = parser.options.results
        if not results:
            results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S')
        results = os.path.abspath(results)
        # Any of these files marks the directory as holding a previous run.
        resultdir_exists = False
        for filename in ('control.srv', 'status.log', '.autoserv_execute'):
            if os.path.exists(os.path.join(results, filename)):
                resultdir_exists = True
        if not parser.options.use_existing_results and resultdir_exists:
            error = "Error: results directory already exists: %s\n" % results
            sys.stderr.write(error)
            sys.exit(1)

        # Now that we certified that there's no leftover results dir from
        # previous jobs, lets create the result dir since the logging system
        # needs to create the log file in there.
        if not os.path.isdir(results):
            os.makedirs(results)

    # Server-side packaging will only be used if it's required and the package
    # is available. If warn_no_ssp is specified, it means that autoserv is
    # running in a drone does not have SSP supported and a warning will be logs.
    # Therefore, it should not run with SSP.
    use_ssp = (not parser.options.warn_no_ssp and parser.options.require_ssp
               and ssp_url)
    if use_ssp:
        # With SSP this wrapper process logs to a 'wrapper' sub-directory of
        # the results directory.
        log_dir = os.path.join(results, 'wrapper') if results else None
        if log_dir and not os.path.exists(log_dir):
            os.makedirs(log_dir)
    else:
        log_dir = results
    logging_manager.configure_logging(
            server_logging_config.ServerLoggingConfig(),
            results_dir=log_dir,
            use_console=not parser.options.no_tee,
            verbose=parser.options.verbose,
            no_console_prefix=parser.options.no_console_prefix)

    if results:
        logging.info("Results placed in %s", results)

        # wait until now to perform this check, so it get properly logged
        if (parser.options.use_existing_results and not resultdir_exists and
            not lxc.is_in_container()):
            logging.error("No existing results directory found: %s", results)
            sys.exit(1)

    logging.debug('autoserv is running in drone %s.', socket.gethostname())
    logging.debug('autoserv command was: %s', ' '.join(sys.argv))

    if parser.options.write_pidfile and results:
        pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label,
                                                  results)
        pid_file_manager.open_file()
    else:
        pid_file_manager = None

    autotest.BaseAutotest.set_install_in_tmpdir(
        parser.options.install_in_tmpdir)

    timer = None
    # test_name is read later in testing mode even when the control file is
    # never parsed (no control file or no -m given), so it must always be
    # bound.
    test_name = None
    try:
        # Take the first argument as control file name, get the test name from
        # the control file. If the test name exists in the list of tests with
        # run time measurement enabled, start a timer to begin measurement.
        if (len(parser.args) > 0 and parser.args[0] != '' and
            parser.options.machines):
            try:
                test_name = control_data.parse_control(parser.args[0],
                                                       raise_warnings=True).name
            except control_data.ControlVariableException:
                logging.debug('Failed to retrieve test name from control file.')
                test_name = None
            if test_name in measure_run_time_tests:
                machines = parser.options.machines.replace(',', ' '
                                                           ).strip().split()
                try:
                    afe = frontend.AFE()
                    board = server_utils.get_board_from_afe(machines[0], afe)
                    timer = autotest_stats.Timer('autoserv_run_time.%s.%s' %
                                                 (board, test_name))
                    timer.start()
                except (urllib2.HTTPError, urllib2.URLError):
                    # Ignore error if RPC failed to get board
                    pass
    except control_data.ControlVariableException as e:
        logging.error(str(e))
    exit_code = 0
    # TODO(beeps): Extend this to cover different failure modes.
    # Testing exceptions are matched against labels sent to autoserv. Eg,
    # to allow only the hostless job to run, specify
    # testing_exceptions: test_suite in the shadow_config. To allow both
    # the hostless job and dummy_Pass to run, specify
    # testing_exceptions: test_suite,dummy_Pass. You can figure out
    # what label autoserv is invoked with by looking through the logs of a test
    # for the autoserv command's -l option.
    testing_exceptions = global_config.global_config.get_config_value(
            'AUTOSERV', 'testing_exceptions', type=list, default=[])
    test_mode = global_config.global_config.get_config_value(
            'AUTOSERV', 'testing_mode', type=bool, default=False)
    # Testing mode needs the results mocker to be importable and is switched
    # off for jobs whose label matches one of the testing exceptions.
    test_mode = (results_mocker and test_mode and not
                 any(ex in parser.options.label
                     for ex in testing_exceptions))
    is_task = (parser.options.verify or parser.options.repair or
               parser.options.provision or parser.options.reset or
               parser.options.cleanup or parser.options.collect_crashinfo)
    try:
        try:
            if test_mode:
                # The parser doesn't run on tasks anyway, so we can just return
                # happy signals without faking results.
                if not is_task:
                    machine = parser.options.results.split('/')[-1]

                    # TODO(beeps): The proper way to do this would be to
                    # refactor job creation so we can invoke job.record
                    # directly. To do that one needs to pipe the test_name
                    # through run_autoserv and bail just before invoking
                    # the server job. See the comment in
                    # puppylab/results_mocker for more context.
                    results_mocker.ResultsMocker(
                            test_name if test_name else 'unknown-test',
                            parser.options.results, machine
                            ).mock_results()
                return
            else:
                run_autoserv(pid_file_manager, results, parser, ssp_url,
                             use_ssp)
        except SystemExit as e:
            # run_autoserv() always finishes via sys.exit(); pick up its code.
            exit_code = e.code
            if exit_code:
                logging.exception(e)
        except Exception as e:
            # If we don't know what happened, we'll classify it as
            # an 'abort' and return 1.
            logging.exception(e)
            exit_code = 1
    finally:
        if pid_file_manager:
            pid_file_manager.close_file(exit_code)
        if timer:
            timer.stop()
        # Record the autoserv duration time. Must be called
        # just before the system exits to ensure accuracy.
        duration_secs = (datetime.datetime.now() - start_time).total_seconds()
        record_autoserv(parser.options, duration_secs)
    sys.exit(exit_code)
613
614
615if __name__ == '__main__':
616    main()
617