autoserv.py revision ca76bcccd6b8029bb0c3fcf72b1c4649e7fad9b1
1#!/usr/bin/python -u
2# Copyright 2007-2008 Martin J. Bligh <mbligh@google.com>, Google Inc.
3# Released under the GPL v2
4
5"""
6Run a control file through the server side engine
7"""
8
9import sys, os, re, traceback, signal, time, logging, getpass
10
11import common
12
13from autotest_lib.client.common_lib import control_data
14from autotest_lib.client.common_lib import global_config
15require_atfork = global_config.global_config.get_config_value(
16        'AUTOSERV', 'require_atfork_module', type=bool, default=True)
17
18
# Best-effort installation of the atfork hooks.  When the atfork package
# cannot be imported, autoserv only aborts if the AUTOSERV /
# require_atfork_module setting is true; this lookup treats a missing
# setting as False.
try:
    import atfork
    atfork.monkeypatch_os_fork_functions()
    import atfork.stdlib_fixer
    # Fix the Python standard library for threading+fork safety with its
    # internal locks.  http://code.google.com/p/python-atfork/
    import warnings
    warnings.filterwarnings('ignore', 'logging module already imported')
    atfork.stdlib_fixer.fix_logging_module()
except ImportError as e:
    # global_config is already imported at the top of this file, so it does
    # not need to be imported again here.
    if global_config.global_config.get_config_value(
            'AUTOSERV', 'require_atfork_module', type=bool, default=False):
        # Send both the hint and the underlying import error to stderr so
        # they end up in the same stream.
        sys.stderr.write('Please run utils/build_externals.py\n')
        sys.stderr.write('%s\n' % (e,))
        sys.exit(1)
35
36from autotest_lib.server import frontend
37from autotest_lib.server import server_logging_config
38from autotest_lib.server import server_job, utils, autoserv_parser, autotest
39from autotest_lib.server import utils as server_utils
40
41from autotest_lib.client.common_lib import pidfile, logging_manager
42from autotest_lib.site_utils.graphite import stats
43
def log_alarm(signum, frame):
    """Signal handler that logs the alarm and terminates the process.

    @param signum: signal number passed in by the signal module (unused).
    @param frame: interrupted stack frame passed in by the signal module
            (unused).

    @raises SystemExit: always, with exit code 1.
    """
    # The handler exits rather than resuming the interrupted code, so the
    # message must not claim that the signal is being ignored.
    logging.error("Received SIGALRM. Exiting.")
    sys.exit(1)
47
def run_autoserv(pid_file_manager, results, parser):
    """Prepare the process, build a server_job from the options and run it.

    Redirects stdin to /dev/null, moves the process into its own process
    group, installs the SIGTERM / SIGTTOU / SIGALRM handlers, validates the
    parsed command line and then dispatches to the requested job action
    (repair, verify, provision, reset or a regular run).

    @param pid_file_manager: object whose close_file() is called with the
            exit code once the job action has finished (and from the SIGTERM
            handler); may be None/falsy, in which case it is never touched.
    @param results: results location handed straight to server_job.
    @param parser: parsed autoserv parser exposing `options`, `args` and
            `parser` (the latter is used to report command line errors).

    @raises SystemExit: on normal completion; the code is 0 when the job
            action returned and 1 when it raised.
    """
    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group
    os.setpgrp()

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')
        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        # Kill the whole process group created above, including this process.
        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        pass

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being set
    # but depending on how you launch your autotest scheduler it may not be set.
    os.environ['USER'] = getpass.getuser()

    if parser.options.machines:
        machines = parser.options.machines.replace(',', ' ').strip().split()
    else:
        machines = []
    machines_file = parser.options.machines_file
    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    install_before = parser.options.install_before
    install_after = parser.options.install_after
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    parse_job = parser.options.parse_job
    # The execution tag falls back to the parse job tag when not given.
    execution_tag = parser.options.execution_tag
    if not execution_tag:
        execution_tag = parse_job
    host_protection = parser.options.host_protection
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    test_retry = parser.options.test_retry
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error("Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    # Special tasks do not need a control file argument.
    is_special_task = (verify or repair or cleanup or collect_crashinfo or
                       provision or reset)
    if len(parser.args) < 1 and not is_special_task:
        parser.parser.error("Missing argument: control file")

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    # We have a control file unless it's just a verify/repair/cleanup job
    if len(parser.args) > 0:
        control = parser.args[0]
    else:
        control = None

    if machines_file:
        # A machines file replaces any machines given on the command line.
        machines = []
        # Use a context manager so the file handle is closed promptly instead
        # of being left for the garbage collector.
        with open(machines_file, 'r') as machines_fd:
            for line in machines_fd:
                # remove comments, spaces
                m = re.sub('#.*', '', line).strip()
                if m:
                    machines.append(m)
        print("Read list of machines from file: %s" % machines_file)
        print(','.join(machines))

    if machines:
        for machine in machines:
            if not machine or re.search(r'\s', machine):
                parser.parser.error("Invalid machine: %s" % str(machine))
        # De-duplicate and sort the machine names.
        machines = list(set(machines))
        machines.sort()

    if group_name and len(machines) < 2:
        parser.parser.error("-G %r may only be supplied with more than one machine."
               % group_name)

    kwargs = {'group_name': group_name, 'tag': execution_tag,
              'disable_sysinfo': parser.options.disable_sysinfo}
    if control_filename:
        kwargs['control_filename'] = control_filename
    job = server_job.server_job(control, parser.args[1:], results, label,
                                user, machines, client, parse_job,
                                ssh_user, ssh_port, ssh_pass,
                                ssh_verbosity_flag, ssh_options,
                                test_retry, **kwargs)
    job.logging.start_logging()
    job.init_parser()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    try:
        try:
            if repair:
                job.repair(host_protection, job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            else:
                job.run(cleanup, install_before, install_after,
                        verify_job_repo_url=verify_job_repo_url,
                        only_collect_crashinfo=collect_crashinfo,
                        skip_crash_collection=skip_crash_collection,
                        job_labels=job_labels)
        finally:
            # Close every host the job still holds, whether or not the
            # action succeeded.
            while job.hosts:
                host = job.hosts.pop()
                host.close()
    except:
        # Deliberately catch everything: any failure of the job action is
        # reported as exit code 1 after printing the traceback.
        exit_code = 1
        traceback.print_exc()

    if pid_file_manager:
        pid_file_manager.num_tests_failed = job.num_tests_failed
        pid_file_manager.close_file(exit_code)
    job.cleanup_parser()

    sys.exit(exit_code)
218
219
def main():
    """Entry point: parse the command line, set up logging and run autoserv.

    Resolves the results directory (unless logging is disabled), configures
    logging, optionally opens a pidfile, optionally starts a run time timer
    for tests listed in the AUTOSERV / measure_run_time_tests setting, and
    then calls run_autoserv().

    @raises SystemExit: always; with 1 for usage / results directory errors,
            otherwise with the exit code obtained from run_autoserv().
    """
    # White list of tests with run time measurement enabled.
    whitelist_csv = global_config.global_config.get_config_value(
            'AUTOSERV', 'measure_run_time_tests', type=str)
    measure_run_time_tests = []
    if whitelist_csv:
        measure_run_time_tests = [name.strip()
                                  for name in whitelist_csv.split(',')]

    # grab the parser
    parser = autoserv_parser.autoserv_parser
    parser.parse_args()

    # No arguments at all: show usage and bail out.
    if len(sys.argv) == 1:
        parser.parser.print_help()
        sys.exit(1)

    if parser.options.no_logging:
        results = None
    else:
        # Fall back to a timestamped directory name when none was given.
        results = os.path.abspath(
                parser.options.results or
                'results.' + time.strftime('%Y-%m-%d-%H.%M.%S'))
        # Any of these files means a previous job already used the directory.
        leftover_files = ('control.srv', 'status.log', '.autoserv_execute')
        resultdir_exists = any(
                os.path.exists(os.path.join(results, leftover))
                for leftover in leftover_files)
        if resultdir_exists and not parser.options.use_existing_results:
            sys.stderr.write(
                    "Error: results directory already exists: %s\n" % results)
            sys.exit(1)

        # Now that we certified that there's no leftover results dir from
        # previous jobs, lets create the result dir since the logging system
        # needs to create the log file in there.
        if not os.path.isdir(results):
            os.makedirs(results)

    logging_manager.configure_logging(
            server_logging_config.ServerLoggingConfig(), results_dir=results,
            use_console=not parser.options.no_tee,
            verbose=parser.options.verbose,
            no_console_prefix=parser.options.no_console_prefix)
    if results:
        logging.info("Results placed in %s" % results)

        # wait until now to perform this check, so it get properly logged
        if parser.options.use_existing_results and not resultdir_exists:
            logging.error("No existing results directory found: %s", results)
            sys.exit(1)

    logging.debug('autoserv command was: %s', ' '.join(sys.argv))

    pid_file_manager = None
    if parser.options.write_pidfile:
        pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label,
                                                  results)
        pid_file_manager.open_file()

    autotest.BaseAutotest.set_install_in_tmpdir(
        parser.options.install_in_tmpdir)

    timer = None
    try:
        # Take the first argument as control file name, get the test name from
        # the control file. If the test name exists in the list of tests with
        # run time measurement enabled, start a timer to begin measurement.
        has_control_file = len(parser.args) > 0 and parser.args[0] != ''
        if has_control_file and parser.options.machines:
            try:
                control_info = control_data.parse_control(parser.args[0],
                                                          raise_warnings=True)
                test_name = control_info.name
            except control_data.ControlVariableException:
                logging.debug('Failed to retrieve test name from control file.')
                test_name = None
            if test_name in measure_run_time_tests:
                machine_list = parser.options.machines.replace(',', ' ')
                machine_list = machine_list.strip().split()
                afe = frontend.AFE()
                # The board is looked up for the first machine only.
                board = server_utils.get_board_from_afe(machine_list[0], afe)
                timer = stats.Timer('autoserv_run_time.%s.%s' %
                                    (board, test_name))
                timer.start()
    except control_data.ControlVariableException as e:
        logging.error(str(e))

    exit_code = 0
    try:
        run_autoserv(pid_file_manager, results, parser)
    except SystemExit as e:
        # Remember the requested exit code; only log non-zero exits.
        exit_code = e.code
        if exit_code:
            logging.exception(e)
    except Exception as e:
        # If we don't know what happened, we'll classify it as
        # an 'abort' and return 1.
        logging.exception(e)
        exit_code = 1
    finally:
        if pid_file_manager:
            pid_file_manager.close_file(exit_code)
        if timer:
            timer.stop()
    sys.exit(exit_code)
326
327
# Only run autoserv when this file is executed as a script.
if __name__ == '__main__':
    main()
330