autoserv.py revision 22dd226625255110c079e979113dcda1f4fa5ea8
1#!/usr/bin/python -u
2# Copyright 2007-2008 Martin J. Bligh <mbligh@google.com>, Google Inc.
3# Released under the GPL v2
4
5"""
6Run a control file through the server side engine
7"""
8
9import ast
10import datetime
11import getpass
12import logging
13import os
14import re
15import signal
16import sys
17import traceback
18import time
19import urllib2
20
21import common
22
23from autotest_lib.client.common_lib import control_data
24from autotest_lib.client.common_lib import global_config
25try:
26    from autotest_lib.puppylab import results_mocker
27except ImportError:
28    results_mocker = None
29
30require_atfork = global_config.global_config.get_config_value(
31        'AUTOSERV', 'require_atfork_module', type=bool, default=True)
32
33
34# Number of seconds to wait before returning if testing mode is enabled
35TESTING_MODE_SLEEP_SECS = 1
36
37
38try:
39    import atfork
40    atfork.monkeypatch_os_fork_functions()
41    import atfork.stdlib_fixer
42    # Fix the Python standard library for threading+fork safety with its
43    # internal locks.  http://code.google.com/p/python-atfork/
44    import warnings
45    warnings.filterwarnings('ignore', 'logging module already imported')
46    atfork.stdlib_fixer.fix_logging_module()
47except ImportError, e:
48    from autotest_lib.client.common_lib import global_config
49    if global_config.global_config.get_config_value(
50            'AUTOSERV', 'require_atfork_module', type=bool, default=False):
51        print >>sys.stderr, 'Please run utils/build_externals.py'
52        print e
53        sys.exit(1)
54
55from autotest_lib.server import frontend
56from autotest_lib.server import server_logging_config
57from autotest_lib.server import server_job, utils, autoserv_parser, autotest
58from autotest_lib.server import utils as server_utils
59from autotest_lib.site_utils import job_overhead
60from autotest_lib.client.common_lib import pidfile, logging_manager
61from autotest_lib.client.common_lib.cros.graphite import stats
62
63def log_alarm(signum, frame):
64    logging.error("Received SIGALARM. Ignoring and continuing on.")
65    sys.exit(1)
66
67def run_autoserv(pid_file_manager, results, parser):
68    # send stdin to /dev/null
69    dev_null = os.open(os.devnull, os.O_RDONLY)
70    os.dup2(dev_null, sys.stdin.fileno())
71    os.close(dev_null)
72
73    # Create separate process group
74    os.setpgrp()
75
76    # Implement SIGTERM handler
77    def handle_sigterm(signum, frame):
78        logging.debug('Received SIGTERM')
79        if pid_file_manager:
80            pid_file_manager.close_file(1, signal.SIGTERM)
81        logging.debug('Finished writing to pid_file. Killing process.')
82        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
83        # This sleep allows the pending output to be logged before the kill
84        # signal is sent.
85        time.sleep(.1)
86        os.killpg(os.getpgrp(), signal.SIGKILL)
87
88    # Set signal handler
89    signal.signal(signal.SIGTERM, handle_sigterm)
90
91    # faulthandler is only needed to debug in the Lab and is not avaliable to
92    # be imported in the chroot as part of VMTest, so Try-Except it.
93    try:
94        import faulthandler
95        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
96        logging.debug('faulthandler registered on SIGTERM.')
97    except ImportError:
98        pass
99
100    # Ignore SIGTTOU's generated by output from forked children.
101    signal.signal(signal.SIGTTOU, signal.SIG_IGN)
102
103    # If we received a SIGALARM, let's be loud about it.
104    signal.signal(signal.SIGALRM, log_alarm)
105
106    # Server side tests that call shell scripts often depend on $USER being set
107    # but depending on how you launch your autotest scheduler it may not be set.
108    os.environ['USER'] = getpass.getuser()
109
110    if parser.options.machines:
111        machines = parser.options.machines.replace(',', ' ').strip().split()
112    else:
113        machines = []
114    machines_file = parser.options.machines_file
115    label = parser.options.label
116    group_name = parser.options.group_name
117    user = parser.options.user
118    client = parser.options.client
119    server = parser.options.server
120    install_before = parser.options.install_before
121    install_after = parser.options.install_after
122    verify = parser.options.verify
123    repair = parser.options.repair
124    cleanup = parser.options.cleanup
125    provision = parser.options.provision
126    reset = parser.options.reset
127    job_labels = parser.options.job_labels
128    no_tee = parser.options.no_tee
129    parse_job = parser.options.parse_job
130    execution_tag = parser.options.execution_tag
131    if not execution_tag:
132        execution_tag = parse_job
133    host_protection = parser.options.host_protection
134    ssh_user = parser.options.ssh_user
135    ssh_port = parser.options.ssh_port
136    ssh_pass = parser.options.ssh_pass
137    collect_crashinfo = parser.options.collect_crashinfo
138    control_filename = parser.options.control_filename
139    test_retry = parser.options.test_retry
140    verify_job_repo_url = parser.options.verify_job_repo_url
141    skip_crash_collection = parser.options.skip_crash_collection
142    ssh_verbosity = int(parser.options.ssh_verbosity)
143    ssh_options = parser.options.ssh_options
144
145    # can't be both a client and a server side test
146    if client and server:
147        parser.parser.error("Can not specify a test as both server and client!")
148
149    if provision and client:
150        parser.parser.error("Cannot specify provisioning and client!")
151
152    is_special_task = (verify or repair or cleanup or collect_crashinfo or
153                       provision or reset)
154    if len(parser.args) < 1 and not is_special_task:
155        parser.parser.error("Missing argument: control file")
156
157    if ssh_verbosity > 0:
158        # ssh_verbosity is an integer between 0 and 3, inclusive
159        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
160    else:
161        ssh_verbosity_flag = ''
162
163    # We have a control file unless it's just a verify/repair/cleanup job
164    if len(parser.args) > 0:
165        control = parser.args[0]
166    else:
167        control = None
168
169    if machines_file:
170        machines = []
171        for m in open(machines_file, 'r').readlines():
172            # remove comments, spaces
173            m = re.sub('#.*', '', m).strip()
174            if m:
175                machines.append(m)
176        print "Read list of machines from file: %s" % machines_file
177        print ','.join(machines)
178
179    if machines:
180        for machine in machines:
181            if not machine or re.search('\s', machine):
182                parser.parser.error("Invalid machine: %s" % str(machine))
183        machines = list(set(machines))
184        machines.sort()
185
186    if group_name and len(machines) < 2:
187        parser.parser.error("-G %r may only be supplied with more than one machine."
188               % group_name)
189
190    kwargs = {'group_name': group_name, 'tag': execution_tag,
191              'disable_sysinfo': parser.options.disable_sysinfo}
192    if control_filename:
193        kwargs['control_filename'] = control_filename
194    job = server_job.server_job(control, parser.args[1:], results, label,
195                                user, machines, client, parse_job,
196                                ssh_user, ssh_port, ssh_pass,
197                                ssh_verbosity_flag, ssh_options,
198                                test_retry, **kwargs)
199    job.logging.start_logging()
200    job.init_parser()
201
202    # perform checks
203    job.precheck()
204
205    # run the job
206    exit_code = 0
207    try:
208        try:
209            if repair:
210                job.repair(host_protection, job_labels)
211            elif verify:
212                job.verify(job_labels)
213            elif provision:
214                job.provision(job_labels)
215            elif reset:
216                job.reset(job_labels)
217            elif cleanup:
218                job.cleanup(job_labels)
219            else:
220                job.run(install_before, install_after,
221                        verify_job_repo_url=verify_job_repo_url,
222                        only_collect_crashinfo=collect_crashinfo,
223                        skip_crash_collection=skip_crash_collection,
224                        job_labels=job_labels)
225        finally:
226            while job.hosts:
227                host = job.hosts.pop()
228                host.close()
229    except:
230        exit_code = 1
231        traceback.print_exc()
232
233    if pid_file_manager:
234        pid_file_manager.num_tests_failed = job.num_tests_failed
235        pid_file_manager.close_file(exit_code)
236    job.cleanup_parser()
237
238    sys.exit(exit_code)
239
240
241def _get_job_id_or_task_id(result_dir, machine, is_special_task):
242    """Extract job id or special task id from result_dir
243
244    @param result_dir: path to the result dir.
245    @param machine: hostname of the machine.
246    @param is_special_task: True/False, whether it is a special task.
247
248    @returns: integer representing the job id or task id.
249    """
250    if not result_dir:
251        return
252    result_dir = os.path.abspath(result_dir)
253    if is_special_task:
254        # special task result dir is like
255        # /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
256        pattern = '.*/hosts/%s/(\d+)-[^/]+' % machine
257    else:
258        # non-special task result dir is like
259        # /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
260        pattern ='.*/(\d+)-[^/]+/%s' % machine
261    m = re.match(pattern, result_dir)
262    return int(m.group(1)) if m else None
263
264
265def record_autoserv(options, duration_secs):
266    """Record autoserv end-to-end time in metadata db.
267
268    @param options: parser options.
269    @param duration_secs: How long autoserv has taken, in secs.
270    """
271    # Get machine hostname
272    machines = options.machines.replace(
273            ',', ' ').strip().split() if options.machines else []
274    num_machines = len(machines)
275    if num_machines > 1:
276        # Skip the case where atomic group is used.
277        return
278    elif num_machines == 0:
279        machines.append('hostless')
280
281    # Determine the status that will be reported.
282    s = job_overhead.STATUS
283    task_mapping = {
284            'reset': s.RESETTING, 'verify': s.VERIFYING,
285            'provision': s.PROVISIONING, 'repair': s.REPAIRING,
286            'cleanup': s.CLEANING, 'collect_crashinfo': s.GATHERING}
287    # option_dict will be like {'reset': True, 'repair': False, ...}
288    option_dict = ast.literal_eval(str(options))
289    match = filter(lambda task: option_dict.get(task) == True, task_mapping)
290    status = task_mapping[match[0]] if match else s.RUNNING
291    is_special_task = status not in [s.RUNNING, s.GATHERING]
292    job_or_task_id = _get_job_id_or_task_id(
293            options.results, machines[0], is_special_task)
294    job_overhead.record_state_duration(
295            job_or_task_id, machines[0], status, duration_secs,
296            is_special_task=is_special_task)
297
298
299def main():
300    start_time = datetime.datetime.now()
301    # White list of tests with run time measurement enabled.
302    measure_run_time_tests_names = global_config.global_config.get_config_value(
303                        'AUTOSERV', 'measure_run_time_tests', type=str)
304    if measure_run_time_tests_names:
305        measure_run_time_tests = [t.strip() for t in
306                                  measure_run_time_tests_names.split(',')]
307    else:
308        measure_run_time_tests = []
309    # grab the parser
310    parser = autoserv_parser.autoserv_parser
311    parser.parse_args()
312
313    if len(sys.argv) == 1:
314        parser.parser.print_help()
315        sys.exit(1)
316
317    if parser.options.no_logging:
318        results = None
319    else:
320        results = parser.options.results
321        if not results:
322            results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S')
323        results  = os.path.abspath(results)
324        resultdir_exists = False
325        for filename in ('control.srv', 'status.log', '.autoserv_execute'):
326            if os.path.exists(os.path.join(results, filename)):
327                resultdir_exists = True
328        if not parser.options.use_existing_results and resultdir_exists:
329            error = "Error: results directory already exists: %s\n" % results
330            sys.stderr.write(error)
331            sys.exit(1)
332
333        # Now that we certified that there's no leftover results dir from
334        # previous jobs, lets create the result dir since the logging system
335        # needs to create the log file in there.
336        if not os.path.isdir(results):
337            os.makedirs(results)
338
339    logging_manager.configure_logging(
340            server_logging_config.ServerLoggingConfig(), results_dir=results,
341            use_console=not parser.options.no_tee,
342            verbose=parser.options.verbose,
343            no_console_prefix=parser.options.no_console_prefix)
344    if results:
345        logging.info("Results placed in %s" % results)
346
347        # wait until now to perform this check, so it get properly logged
348        if parser.options.use_existing_results and not resultdir_exists:
349            logging.error("No existing results directory found: %s", results)
350            sys.exit(1)
351
352    logging.debug('autoserv command was: %s', ' '.join(sys.argv))
353
354    if parser.options.write_pidfile:
355        pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label,
356                                                  results)
357        pid_file_manager.open_file()
358    else:
359        pid_file_manager = None
360
361    autotest.BaseAutotest.set_install_in_tmpdir(
362        parser.options.install_in_tmpdir)
363
364    timer = None
365    try:
366        # Take the first argument as control file name, get the test name from
367        # the control file. If the test name exists in the list of tests with
368        # run time measurement enabled, start a timer to begin measurement.
369        if (len(parser.args) > 0 and parser.args[0] != '' and
370            parser.options.machines):
371            try:
372                test_name = control_data.parse_control(parser.args[0],
373                                                       raise_warnings=True).name
374            except control_data.ControlVariableException:
375                logging.debug('Failed to retrieve test name from control file.')
376                test_name = None
377            if test_name in measure_run_time_tests:
378                machines = parser.options.machines.replace(',', ' '
379                                                           ).strip().split()
380                try:
381                    afe = frontend.AFE()
382                    board = server_utils.get_board_from_afe(machines[0], afe)
383                    timer = stats.Timer('autoserv_run_time.%s.%s' %
384                                        (board, test_name))
385                    timer.start()
386                except (urllib2.HTTPError, urllib2.URLError):
387                    # Ignore error if RPC failed to get board
388                    pass
389    except control_data.ControlVariableException as e:
390        logging.error(str(e))
391    exit_code = 0
392    # TODO(beeps): Extend this to cover different failure modes.
393    # Testing exceptions are matched against labels sent to autoserv. Eg,
394    # to allow only the hostless job to run, specify
395    # testing_exceptions: test_suite in the shadow_config. To allow both
396    # the hostless job and dummy_Pass to run, specify
397    # testing_exceptions: test_suite,dummy_Pass. You can figure out
398    # what label autoserv is invoked with by looking through the logs of a test
399    # for the autoserv command's -l option.
400    testing_exceptions = global_config.global_config.get_config_value(
401            'AUTOSERV', 'testing_exceptions', type=list, default=[])
402    test_mode = global_config.global_config.get_config_value(
403            'AUTOSERV', 'testing_mode', type=bool, default=False)
404    test_mode = (results_mocker and test_mode and not
405                 any([ex in parser.options.label
406                      for ex in testing_exceptions]))
407    is_task = (parser.options.verify or parser.options.repair or
408               parser.options.provision or parser.options.reset or
409               parser.options.cleanup or parser.options.collect_crashinfo)
410    try:
411        try:
412            if test_mode:
413                # The parser doesn't run on tasks anyway, so we can just return
414                # happy signals without faking results.
415                if not is_task:
416                    machine = parser.options.results.split('/')[-1]
417
418                    # TODO(beeps): The proper way to do this would be to
419                    # refactor job creation so we can invoke job.record
420                    # directly. To do that one needs to pipe the test_name
421                    # through run_autoserv and bail just before invoking
422                    # the server job. See the comment in
423                    # puppylab/results_mocker for more context.
424                    results_mocker.ResultsMocker(
425                            test_name if test_name else 'unknown-test',
426                            parser.options.results, machine
427                            ).mock_results()
428                return
429            else:
430                run_autoserv(pid_file_manager, results, parser)
431        except SystemExit as e:
432            exit_code = e.code
433            if exit_code:
434                logging.exception(e)
435        except Exception as e:
436            # If we don't know what happened, we'll classify it as
437            # an 'abort' and return 1.
438            logging.exception(e)
439            exit_code = 1
440    finally:
441        if pid_file_manager:
442            pid_file_manager.close_file(exit_code)
443        if timer:
444            timer.stop()
445        # Record the autoserv duration time. Must be called
446        # just before the system exits to ensure accuracy.
447        duration_secs = (datetime.datetime.now() - start_time).total_seconds()
448        record_autoserv(parser.options, duration_secs)
449    sys.exit(exit_code)
450
451
452if __name__ == '__main__':
453    main()
454