revision 22dd226625255110c079e979113dcda1f4fa5ea8
1#!/usr/bin/python -u
2# Copyright 2007-2008 Martin J. Bligh <>, Google Inc.
3# Released under the GPL v2
6Run a control file through the server side engine
9import ast
10import datetime
11import getpass
12import logging
13import os
14import re
15import signal
16import sys
17import traceback
18import time
19import urllib2
21import common
23from autotest_lib.client.common_lib import control_data
24from autotest_lib.client.common_lib import global_config
26    from autotest_lib.puppylab import results_mocker
27except ImportError:
28    results_mocker = None
30require_atfork = global_config.global_config.get_config_value(
31        'AUTOSERV', 'require_atfork_module', type=bool, default=True)
34# Number of seconds to wait before returning if testing mode is enabled
39    import atfork
40    atfork.monkeypatch_os_fork_functions()
41    import atfork.stdlib_fixer
42    # Fix the Python standard library for threading+fork safety with its
43    # internal locks.
44    import warnings
45    warnings.filterwarnings('ignore', 'logging module already imported')
46    atfork.stdlib_fixer.fix_logging_module()
47except ImportError, e:
48    from autotest_lib.client.common_lib import global_config
49    if global_config.global_config.get_config_value(
50            'AUTOSERV', 'require_atfork_module', type=bool, default=False):
51        print >>sys.stderr, 'Please run utils/'
52        print e
53        sys.exit(1)
55from autotest_lib.server import frontend
56from autotest_lib.server import server_logging_config
57from autotest_lib.server import server_job, utils, autoserv_parser, autotest
58from autotest_lib.server import utils as server_utils
59from autotest_lib.site_utils import job_overhead
60from autotest_lib.client.common_lib import pidfile, logging_manager
61from autotest_lib.client.common_lib.cros.graphite import stats
def log_alarm(signum, frame):
    """Signal handler for SIGALRM: log the event loudly and exit.

    @param signum: number of the signal being handled (unused).
    @param frame: stack frame that was interrupted (unused).

    @raises SystemExit: always, with exit status 1.
    """
    # The message used to claim the signal was being ignored, which
    # contradicted the sys.exit(1) that immediately follows.
    logging.error("Received SIGALARM. Exiting with status 1.")
    sys.exit(1)
def run_autoserv(pid_file_manager, results, parser):
    """Build a server job from the parsed options and run it.

    Redirects stdin to /dev/null, moves the process into its own process
    group, installs the signal handlers, validates the options, creates the
    server_job and runs either a special task (repair, verify, provision,
    reset, cleanup) or the control file.  This function never returns
    normally; it always ends in sys.exit().

    @param pid_file_manager: object used to record the exit status in the
            pidfile, or a false value when no pidfile is written.
    @param results: results directory handed to server_job.
    @param parser: autoserv parser whose `options` and `args` are read.

    @raises SystemExit: always; the code is 0 on success and 1 if the job
            raised.  Invalid options are reported via parser.parser.error().
    """
    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group
    os.setpgrp()

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')
        # TODO (sbasi) - remove the time.sleep once pending output is
        # reliably logged before the kill (the bug reference originally
        # cited here is missing from this copy of the file).
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        pass

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being set
    # but depending on how you launch your autotest scheduler it may not be set.
    os.environ['USER'] = getpass.getuser()

    options = parser.options
    if options.machines:
        machines = options.machines.replace(',', ' ').strip().split()
    else:
        machines = []
    machines_file = options.machines_file
    label = options.label
    group_name = options.group_name
    user = options.user
    client = options.client
    server = options.server
    install_before = options.install_before
    install_after = options.install_after
    verify = options.verify
    repair = options.repair
    cleanup = options.cleanup
    provision = options.provision
    reset = options.reset
    job_labels = options.job_labels
    parse_job = options.parse_job
    # Fall back to the parse job name when no explicit tag was given.
    execution_tag = options.execution_tag
    if not execution_tag:
        execution_tag = parse_job
    host_protection = options.host_protection
    ssh_user = options.ssh_user
    ssh_port = options.ssh_port
    ssh_pass = options.ssh_pass
    collect_crashinfo = options.collect_crashinfo
    control_filename = options.control_filename
    test_retry = options.test_retry
    verify_job_repo_url = options.verify_job_repo_url
    skip_crash_collection = options.skip_crash_collection
    ssh_verbosity = int(options.ssh_verbosity)
    ssh_options = options.ssh_options

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error("Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo or
                       provision or reset)
    if not parser.args and not is_special_task:
        parser.parser.error("Missing argument: control file")

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    # We have a control file unless it's just a verify/repair/cleanup job
    if parser.args:
        control = parser.args[0]
    else:
        control = None

    if machines_file:
        # A machines file replaces any machines given on the command line.
        machines = []
        with open(machines_file, 'r') as machines_fh:
            for line in machines_fh:
                # remove comments, spaces
                line = re.sub('#.*', '', line).strip()
                if line:
                    machines.append(line)
        # Single parenthesized argument: same output on Python 2 and 3.
        print("Read list of machines from file: %s" % machines_file)
        print(','.join(machines))

    if machines:
        for machine in machines:
            if not machine or re.search(r'\s', machine):
                parser.parser.error("Invalid machine: %s" % str(machine))
        # De-duplicate and put the hostnames in a stable order.
        machines = sorted(set(machines))

    if group_name and len(machines) < 2:
        parser.parser.error("-G %r may only be supplied with more than one machine."
               % group_name)

    kwargs = {'group_name': group_name, 'tag': execution_tag,
              'disable_sysinfo': options.disable_sysinfo}
    if control_filename:
        kwargs['control_filename'] = control_filename
    job = server_job.server_job(control, parser.args[1:], results, label,
                                user, machines, client, parse_job,
                                ssh_user, ssh_port, ssh_pass,
                                ssh_verbosity_flag, ssh_options,
                                test_retry, **kwargs)
    job.logging.start_logging()
    job.init_parser()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    try:
        try:
            if repair:
                job.repair(host_protection, job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                job.run(install_before, install_after,
                        verify_job_repo_url=verify_job_repo_url,
                        only_collect_crashinfo=collect_crashinfo,
                        skip_crash_collection=skip_crash_collection,
                        job_labels=job_labels)
        finally:
            # Close every host the job opened, even if the job raised.
            while job.hosts:
                host = job.hosts.pop()
                host.close()
    except:
        # Deliberately bare: any failure of the job, including SystemExit
        # and KeyboardInterrupt, is reported as exit status 1 so that the
        # pidfile below still gets written.
        exit_code = 1
        traceback.print_exc()

    if pid_file_manager:
        pid_file_manager.num_tests_failed = job.num_tests_failed
        pid_file_manager.close_file(exit_code)
    job.cleanup_parser()

    sys.exit(exit_code)
241def _get_job_id_or_task_id(result_dir, machine, is_special_task):
242    """Extract job id or special task id from result_dir
244    @param result_dir: path to the result dir.
245    @param machine: hostname of the machine.
246    @param is_special_task: True/False, whether it is a special task.
248    @returns: integer representing the job id or task id.
249    """
250    if not result_dir:
251        return
252    result_dir = os.path.abspath(result_dir)
253    if is_special_task:
254        # special task result dir is like
255        # /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
256        pattern = '.*/hosts/%s/(\d+)-[^/]+' % machine
257    else:
258        # non-special task result dir is like
259        # /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
260        pattern ='.*/(\d+)-[^/]+/%s' % machine
261    m = re.match(pattern, result_dir)
262    return int( if m else None
def record_autoserv(options, duration_secs):
    """Record autoserv end-to-end time in metadata db.

    Nothing is recorded when more than one machine was given.  When no
    machine was given the duration is recorded against 'hostless'.

    @param options: parser options.
    @param duration_secs: How long autoserv has taken, in secs.
    """
    # Get machine hostname
    if options.machines:
        machines = options.machines.replace(',', ' ').strip().split()
    else:
        machines = []
    if len(machines) > 1:
        # Skip the case where atomic group is used.
        return
    if not machines:
        machines.append('hostless')
    hostname = machines[0]

    # Determine the status that will be reported.
    s = job_overhead.STATUS
    # A tuple of pairs (rather than a dict) so the lookup order is fixed
    # should more than one task option ever be set.
    task_statuses = (
            ('reset', s.RESETTING), ('verify', s.VERIFYING),
            ('provision', s.PROVISIONING), ('repair', s.REPAIRING),
            ('cleanup', s.CLEANING), ('collect_crashinfo', s.GATHERING))
    status = s.RUNNING
    for task, task_status in task_statuses:
        # Read the option directly instead of parsing str(options), which
        # fails if any option value is not a plain literal.  '== True' is
        # kept on purpose: only a real True/1 value selects the task, not
        # an arbitrary truthy one.
        if getattr(options, task, None) == True:
            status = task_status
            break
    is_special_task = status not in [s.RUNNING, s.GATHERING]
    job_or_task_id = _get_job_id_or_task_id(
            options.results, hostname, is_special_task)
    job_overhead.record_state_duration(
            job_or_task_id, hostname, status, duration_secs,
            is_special_task=is_special_task)
def main():
    """Parse the command line, set up results and logging, and run autoserv.

    Prepares the results directory and logging, optionally opens a pidfile
    and starts a run time timer for whitelisted tests, then either mocks the
    results (testing mode) or calls run_autoserv().  On the way out the
    pidfile is closed, the timer is stopped and the total duration is
    recorded.  Ends in sys.exit(), except in testing mode where it simply
    returns.
    """
    start_time = datetime.datetime.now()
    # White list of tests with run time measurement enabled.
    measure_run_time_tests_names = global_config.global_config.get_config_value(
                        'AUTOSERV', 'measure_run_time_tests', type=str)
    if measure_run_time_tests_names:
        measure_run_time_tests = [t.strip() for t in
                                  measure_run_time_tests_names.split(',')]
    else:
        measure_run_time_tests = []
    # grab the parser
    parser = autoserv_parser.autoserv_parser
    parser.parse_args()

    if len(sys.argv) == 1:
        parser.parser.print_help()
        sys.exit(1)

    if parser.options.no_logging:
        results = None
    else:
        results = parser.options.results
        if not results:
            results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S')
        results = os.path.abspath(results)
        # Any of these files marks the directory as used by an earlier run.
        resultdir_exists = False
        for filename in ('control.srv', 'status.log', '.autoserv_execute'):
            if os.path.exists(os.path.join(results, filename)):
                resultdir_exists = True
        if not parser.options.use_existing_results and resultdir_exists:
            error = "Error: results directory already exists: %s\n" % results
            sys.stderr.write(error)
            sys.exit(1)

        # Now that we certified that there's no leftover results dir from
        # previous jobs, lets create the result dir since the logging system
        # needs to create the log file in there.
        if not os.path.isdir(results):
            os.makedirs(results)

    logging_manager.configure_logging(
            server_logging_config.ServerLoggingConfig(), results_dir=results,
            use_console=not parser.options.no_tee,
            verbose=parser.options.verbose,
            no_console_prefix=parser.options.no_console_prefix)
    if results:
        logging.info("Results placed in %s", results)

        # wait until now to perform this check, so it get properly logged
        if parser.options.use_existing_results and not resultdir_exists:
            logging.error("No existing results directory found: %s", results)
            sys.exit(1)

    logging.debug('autoserv command was: %s', ' '.join(sys.argv))

    if parser.options.write_pidfile:
        pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label,
                                                  results)
        pid_file_manager.open_file()
    else:
        pid_file_manager = None

    autotest.BaseAutotest.set_install_in_tmpdir(
        parser.options.install_in_tmpdir)

    timer = None
    # Must be bound before the testing-mode branch below reads it; it used
    # to be left unbound when no control file or no machines were given.
    test_name = None
    try:
        # Take the first argument as control file name, get the test name from
        # the control file. If the test name exists in the list of tests with
        # run time measurement enabled, start a timer to begin measurement.
        if (len(parser.args) > 0 and parser.args[0] != '' and
            parser.options.machines):
            try:
                test_name = control_data.parse_control(parser.args[0],
                                                       raise_warnings=True).name
            except control_data.ControlVariableException:
                logging.debug('Failed to retrieve test name from control file.')
                test_name = None
            if test_name in measure_run_time_tests:
                machines = parser.options.machines.replace(',', ' '
                                                           ).strip().split()
                try:
                    afe = frontend.AFE()
                    board = server_utils.get_board_from_afe(machines[0], afe)
                    timer = stats.Timer('autoserv_run_time.%s.%s' %
                                        (board, test_name))
                    timer.start()
                except (urllib2.HTTPError, urllib2.URLError):
                    # Ignore error if RPC failed to get board
                    pass
    except control_data.ControlVariableException as e:
        logging.error(str(e))
    exit_code = 0
    # TODO(beeps): Extend this to cover different failure modes.
    # Testing exceptions are matched against labels sent to autoserv. Eg,
    # to allow only the hostless job to run, specify
    # testing_exceptions: test_suite in the shadow_config. To allow both
    # the hostless job and dummy_Pass to run, specify
    # testing_exceptions: test_suite,dummy_Pass. You can figure out
    # what label autoserv is invoked with by looking through the logs of a test
    # for the autoserv command's -l option.
    testing_exceptions = global_config.global_config.get_config_value(
            'AUTOSERV', 'testing_exceptions', type=list, default=[])
    test_mode = global_config.global_config.get_config_value(
            'AUTOSERV', 'testing_mode', type=bool, default=False)
    test_mode = (results_mocker and test_mode and not
                 any([ex in parser.options.label
                      for ex in testing_exceptions]))
    is_task = (parser.options.verify or parser.options.repair or
               parser.options.provision or parser.options.reset or
               parser.options.cleanup or parser.options.collect_crashinfo)
    try:
        try:
            if test_mode:
                # The parser doesn't run on tasks anyway, so we can just return
                # happy signals without faking results.
                if not is_task:
                    machine = parser.options.results.split('/')[-1]

                    # TODO(beeps): The proper way to do this would be to
                    # refactor job creation so we can invoke job.record
                    # directly. To do that one needs to pipe the test_name
                    # through run_autoserv and bail just before invoking
                    # the server job. See the comment in
                    # puppylab/results_mocker for more context.
                    results_mocker.ResultsMocker(
                            test_name if test_name else 'unknown-test',
                            parser.options.results, machine
                            ).mock_results()
                return
            else:
                run_autoserv(pid_file_manager, results, parser)
        except SystemExit as e:
            # run_autoserv() always finishes through sys.exit(); pick up its
            # status and only log when it signals a failure.
            exit_code = e.code
            if exit_code:
                logging.exception(e)
        except Exception as e:
            # If we don't know what happened, we'll classify it as
            # an 'abort' and return 1.
            logging.exception(e)
            exit_code = 1
    finally:
        if pid_file_manager:
            pid_file_manager.close_file(exit_code)
        if timer:
            timer.stop()
        # Record the autoserv duration time. Must be called
        # just before the system exits to ensure accuracy.
        duration_secs = (datetime.datetime.now() - start_time).total_seconds()
        record_autoserv(parser.options, duration_secs)
    sys.exit(exit_code)
# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()