autoserv revision d68d51c731ef83a45c173497b59a26b2e6870c1b
#!/usr/bin/python -u
# Copyright 2007-2008 Martin J. Bligh <mbligh@google.com>, Google Inc.
# Released under the GPL v2

"""
Run a control file through the server side engine
"""

import ast
import datetime
import getpass
import logging
import os
import re
import signal
import socket
import sys
import traceback
import time
import urllib2

import common

from autotest_lib.client.common_lib import control_data
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib.cros.graphite import autotest_es
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
try:
    from autotest_lib.puppylab import results_mocker
except ImportError:
    results_mocker = None

require_atfork = global_config.global_config.get_config_value(
        'AUTOSERV', 'require_atfork_module', type=bool, default=True)


# Number of seconds to wait before returning if testing mode is enabled
TESTING_MODE_SLEEP_SECS = 1

try:
    import atfork
    atfork.monkeypatch_os_fork_functions()
    import atfork.stdlib_fixer
    # Fix the Python standard library for threading+fork safety with its
    # internal locks.  http://code.google.com/p/python-atfork/
    import warnings
    warnings.filterwarnings('ignore', 'logging module already imported')
    atfork.stdlib_fixer.fix_logging_module()
except ImportError, e:
    from autotest_lib.client.common_lib import global_config
    if global_config.global_config.get_config_value(
            'AUTOSERV', 'require_atfork_module', type=bool, default=False):
        print >>sys.stderr, 'Please run utils/build_externals.py'
        print e
        sys.exit(1)

from autotest_lib.server import frontend
from autotest_lib.server import server_logging_config
from autotest_lib.server import server_job, utils, autoserv_parser, autotest
from autotest_lib.server import utils as server_utils
from autotest_lib.site_utils import job_directories
from autotest_lib.site_utils import job_overhead
from autotest_lib.site_utils import lxc
from autotest_lib.client.common_lib import pidfile, logging_manager

# Control segment to stage server-side package.
STAGE_SERVER_SIDE_PACKAGE_CONTROL_FILE = server_job._control_segment_path(
        'stage_server_side_package')

def log_alarm(signum, frame):
    logging.error("Received SIGALRM. Exiting.")
    sys.exit(1)


def _get_machines(parser):
    """Get a list of machine names from command line arg -m or a file.

    @param parser: Parser for the command line arguments.

    @return: A list of machine names from command line arg -m or the
             machines file specified in the command line arg -M.
    """
    if parser.options.machines:
        machines = parser.options.machines.replace(',', ' ').strip().split()
    else:
        machines = []
    machines_file = parser.options.machines_file
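    # A machines file (-M) takes precedence over any -m list parsed above.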
    if machines_file:
        machines = []
        for m in open(machines_file, 'r').readlines():
            # remove comments, spaces
            m = re.sub('#.*', '', m).strip()
            if m:
                machines.append(m)
        logging.debug('Read list of machines from file: %s', machines_file)
        logging.debug('Machines: %s', ','.join(machines))

    if machines:
        for machine in machines:
            if not machine or re.search(r'\s', machine):
                parser.parser.error("Invalid machine: %s" % str(machine))
        machines = list(set(machines))
        machines.sort()
    return machines


def _stage_ssp(parser):
    """Stage the server-side package.

    This function calls a control segment to stage the server-side package
    based on the job and the autoserv command line options. The detailed
    implementation may differ for each host type; currently only CrosHost
    defines a stage_server_side_package function.
    The function returns None if no server-side package is available, but it
    may raise an exception if staging failed for reasons other than the
    artifact (the server-side package) not being found.

    @param parser: Command line arguments parser passed in the autoserv process.

    @return: URL of the staged server-side package, or None if no server-side
             package is found for the build.
    """
    namespace = {'machines': _get_machines(parser),
                 'image': parser.options.image}
    script_locals = {}
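    # The control segment reads 'machines' and 'image' from the namespace and
    # is expected to set 'ssp_url' in script_locals.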
    execfile(STAGE_SERVER_SIDE_PACKAGE_CONTROL_FILE, namespace, script_locals)
    return script_locals['ssp_url']


def _run_with_ssp(container_name, job_id, results, parser, ssp_url):
    """Run the server job with server-side packaging.

    @param container_name: Name of the container to run the test.
    @param job_id: ID of the test job.
    @param results: Folder to store results. This can differ from
                    parser.options.results:
                    parser.options.results can be set to None for results to be
                    stored in a temp folder;
                    results can be None if the autoserv run requires no logging.
    @param parser: Command line parser that contains the options.
    @param ssp_url: URL of the staged server-side package.
    """
    bucket = lxc.ContainerBucket()
    control = (parser.args[0] if len(parser.args) > 0 and parser.args[0] != ''
               else None)
    test_container = bucket.setup_test(container_name, job_id, ssp_url, results,
                                       control=control)
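    # Rebuild the current autoserv command line so it can be re-run inside the
    # container, minus --require-ssp so the run in the container does not try
    # to use server-side packaging itself.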
    args = sys.argv[:]
    args.remove('--require-ssp')

    # A dictionary of paths to replace in the command line, mapping each host
    # path (key) to the corresponding path inside the container (value).
    paths_to_replace = {}
    # Replace the control file path with the one in the container.
    if control:
        container_control_filename = os.path.join(
                lxc.CONTROL_TEMP_PATH, os.path.basename(control))
        paths_to_replace[control] = container_control_filename
    # Update the result directory with the one in the container.
    if parser.options.results:
        container_result_dir = os.path.join(lxc.RESULT_DIR_FMT % job_id)
        paths_to_replace[parser.options.results] = container_result_dir
    # Update the parse_job directory with the one in the container. The
    # assumption is that the folder to be parsed is always the results_dir.
    if parser.options.parse_job:
        container_parse_dir = os.path.join(lxc.RESULT_DIR_FMT % job_id)
        paths_to_replace[parser.options.parse_job] = container_parse_dir

    args = [paths_to_replace.get(arg, arg) for arg in args]

    # Pass --use-existing-results: the results directory is already created
    # and mounted in the container, so this flag avoids an exception being
    # raised.
    if '--use-existing-results' not in args:
        args.append('--use-existing-results')

    # Make sure autoserv running in the container uses a different pid file.
    if '--pidfile-label' not in args:
        args.extend(['--pidfile-label', 'container_autoserv'])

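    # Quote any argument that contains spaces so the reconstructed command
    # line survives the shell inside the container.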
    cmd_line = ' '.join(["'%s'" % arg if ' ' in arg else arg for arg in args])
    logging.info('Run command in container: %s', cmd_line)
    success = False
    try:
        test_container.attach_run(cmd_line)
        success = True
    finally:
        counter_key = '%s.%s' % (lxc.STATS_KEY,
                                 'success' if success else 'fail')
        autotest_stats.Counter(counter_key).increment()
        # The metadata is uploaded separately so it can be posted over http.
        metadata = {'drone': socket.gethostname(),
                    'job_id': job_id,
                    'success': success}
        autotest_es.post(use_http=True,
                         type_str=lxc.CONTAINER_RUN_TEST_METADB_TYPE,
                         metadata=metadata)
        test_container.destroy()


def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp):
    """Run server job with given options.

    @param pid_file_manager: PidFileManager used to monitor the autoserv
                             process.
    @param results: Folder to store results.
    @param parser: Parser for the command line arguments.
    @param ssp_url: URL of the server-side package.
    @param use_ssp: Set to True to run with server-side packaging.
    """
    if parser.options.warn_no_ssp:
        # Post a warning in the log.
        logging.warn('Autoserv is required to run with server-side packaging, '
                     'but no drone that supports server-side packaging was '
                     'found. The test will be executed in a drone without '
                     'server-side packaging support.')

    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group
    os.setpgrp()

    # Container name is predefined so the container can be destroyed in
    # handle_sigterm.
    job_or_task_id = job_directories.get_job_id_or_task_id(
            parser.options.results)
    container_name = (lxc.TEST_CONTAINER_NAME_FMT %
                      (job_or_task_id, time.time(), os.getpid()))

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')
        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        if use_ssp:
            logging.debug('Destroy container %s before aborting the autoserv '
                          'process.', container_name)
            try:
                bucket = lxc.ContainerBucket()
                container = bucket.get(container_name)
                if container:
                    container.destroy()
                else:
                    logging.debug('Container %s is not found.', container_name)
            except:
                # Handle any exception so the autoserv process can be aborted.
                logging.error('Failed to destroy container %s. Error: %s',
                              container_name, sys.exc_info())

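        # Kill the whole process group (created by os.setpgrp above) so any
        # forked children are terminated along with this process.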
        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed for debugging in the Lab and is not available
    # to be imported in the chroot as part of VMTest, so try/except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        pass

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we receive a SIGALRM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being
    # set, but depending on how you launch your autotest scheduler it may not
    # be set.
    os.environ['USER'] = getpass.getuser()

    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    install_before = parser.options.install_before
    install_after = parser.options.install_after
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    no_tee = parser.options.no_tee
    parse_job = parser.options.parse_job
    execution_tag = parser.options.execution_tag
    if not execution_tag:
        execution_tag = parse_job
    host_protection = parser.options.host_protection
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    test_retry = parser.options.test_retry
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options
    no_use_packaging = parser.options.no_use_packaging

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error("Cannot specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo or
                       provision or reset)
    if len(parser.args) < 1 and not is_special_task:
        parser.parser.error("Missing argument: control file")

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    # We have a control file unless it's just a verify/repair/cleanup job
    if len(parser.args) > 0:
        control = parser.args[0]
    else:
        control = None

    machines = _get_machines(parser)
    if group_name and len(machines) < 2:
        parser.parser.error('-G %r may only be supplied with more than one '
                            'machine.' % group_name)

    kwargs = {'group_name': group_name, 'tag': execution_tag,
              'disable_sysinfo': parser.options.disable_sysinfo}
    if control_filename:
        kwargs['control_filename'] = control_filename
    job = server_job.server_job(control, parser.args[1:], results, label,
                                user, machines, client, parse_job,
                                ssh_user, ssh_port, ssh_pass,
                                ssh_verbosity_flag, ssh_options,
                                test_retry, **kwargs)

    job.logging.start_logging()
    job.init_parser()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    try:
        try:
            if repair:
                job.repair(host_protection, job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                if use_ssp:
                    try:
                        _run_with_ssp(container_name, job_or_task_id, results,
                                      parser, ssp_url)
                    finally:
                        # Update the ownership of files in result folder.
                        # TODO(dshi): crbug.com/459344 Skip following action
                        # when test container can be unprivileged container.
                        if results:
                            lxc.run('chown -R %s %s' % (os.getuid(), results))
                            lxc.run('chgrp -R %s %s' % (os.getgid(), results))
                else:
                    job.run(install_before, install_after,
                            verify_job_repo_url=verify_job_repo_url,
                            only_collect_crashinfo=collect_crashinfo,
                            skip_crash_collection=skip_crash_collection,
                            job_labels=job_labels,
                            use_packaging=(not no_use_packaging))
        finally:
            while job.hosts:
                host = job.hosts.pop()
                host.close()
    except:
        exit_code = 1
        traceback.print_exc()

    if pid_file_manager:
        pid_file_manager.num_tests_failed = job.num_tests_failed
        pid_file_manager.close_file(exit_code)
    job.cleanup_parser()

    sys.exit(exit_code)


def record_autoserv(options, duration_secs):
    """Record autoserv end-to-end time in metadata db.

    @param options: parser options.
    @param duration_secs: How long autoserv has taken, in secs.
    """
    # Get machine hostname
    machines = options.machines.replace(
            ',', ' ').strip().split() if options.machines else []
    num_machines = len(machines)
    if num_machines > 1:
        # Skip the case where atomic group is used.
        return
    elif num_machines == 0:
        machines.append('hostless')

    # Determine the status that will be reported.
    s = job_overhead.STATUS
    task_mapping = {
            'reset': s.RESETTING, 'verify': s.VERIFYING,
            'provision': s.PROVISIONING, 'repair': s.REPAIRING,
            'cleanup': s.CLEANING, 'collect_crashinfo': s.GATHERING}
    # option_dict will be like {'reset': True, 'repair': False, ...}
    option_dict = ast.literal_eval(str(options))
    match = filter(lambda task: option_dict.get(task) == True, task_mapping)
    status = task_mapping[match[0]] if match else s.RUNNING
    is_special_task = status not in [s.RUNNING, s.GATHERING]
    job_or_task_id = job_directories.get_job_id_or_task_id(options.results)
    job_overhead.record_state_duration(
            job_or_task_id, machines[0], status, duration_secs,
            is_special_task=is_special_task)


def main():
    start_time = datetime.datetime.now()
    # Whitelist of tests with run time measurement enabled.
    measure_run_time_tests_names = global_config.global_config.get_config_value(
                        'AUTOSERV', 'measure_run_time_tests', type=str)
    if measure_run_time_tests_names:
        measure_run_time_tests = [t.strip() for t in
                                  measure_run_time_tests_names.split(',')]
    else:
        measure_run_time_tests = []
    # grab the parser
    parser = autoserv_parser.autoserv_parser
    parser.parse_args()

    if len(sys.argv) == 1:
        parser.parser.print_help()
        sys.exit(1)

    # If the job is required to run with the server-side package, try to stage
    # the server-side package first. If that fails because the autotest server
    # package does not exist, fall back to running the job without server-side
    # packaging. If the warn_no_ssp option is specified, autoserv is running in
    # a drone that does not support SSP, so there is no need to stage the
    # server-side package.
    ssp_url = None
    ssp_url_warning = False
    if (not parser.options.warn_no_ssp and parser.options.require_ssp):
        ssp_url = _stage_ssp(parser)
        # The build does not have an autotest server package. Fall back to not
        # using the server-side package. Logging the warning is postponed until
        # logging is set up.
        ssp_url_warning = not ssp_url

    if parser.options.no_logging:
        results = None
    else:
        results = parser.options.results
        if not results:
            results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S')
        results = os.path.abspath(results)
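        # Refuse to reuse a results directory left over from a previous run
        # unless --use-existing-results was given.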
        resultdir_exists = False
        for filename in ('control.srv', 'status.log', '.autoserv_execute'):
            if os.path.exists(os.path.join(results, filename)):
                resultdir_exists = True
        if not parser.options.use_existing_results and resultdir_exists:
            error = "Error: results directory already exists: %s\n" % results
            sys.stderr.write(error)
            sys.exit(1)

        # Now that we have verified there is no leftover results dir from
        # previous jobs, let's create the result dir, since the logging system
        # needs to create the log file in there.
        if not os.path.isdir(results):
            os.makedirs(results)

    # Server-side packaging will only be used if it is required and the package
    # is available. If warn_no_ssp is specified, autoserv is running in a drone
    # that does not support SSP; a warning will be logged and the job should
    # not run with SSP.
    use_ssp = (not parser.options.warn_no_ssp and parser.options.require_ssp
               and ssp_url)
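    # With SSP, the wrapper autoserv process logs into a 'wrapper'
    # subdirectory so its own logs stay separate from the results written
    # from inside the container.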
    if use_ssp:
        log_dir = os.path.join(results, 'wrapper') if results else None
        if log_dir and not os.path.exists(log_dir):
            os.makedirs(log_dir)
    else:
        log_dir = results
    logging_manager.configure_logging(
            server_logging_config.ServerLoggingConfig(),
            results_dir=log_dir,
            use_console=not parser.options.no_tee,
            verbose=parser.options.verbose,
            no_console_prefix=parser.options.no_console_prefix)

    if ssp_url_warning:
        logging.warn(
                'Autoserv is required to run with server-side packaging. '
                'However, no server-side package can be found based on '
                '`--image`, host attribute job_repo_url or host label of '
                'cros-version. The test will be executed without '
                'server-side packaging support.')

    if results:
        logging.info("Results placed in %s", results)

        # wait until now to perform this check, so it gets properly logged
        if (parser.options.use_existing_results and not resultdir_exists and
            not lxc.is_in_container()):
            logging.error("No existing results directory found: %s", results)
            sys.exit(1)

    logging.debug('autoserv is running in drone %s.', socket.gethostname())
    logging.debug('autoserv command was: %s', ' '.join(sys.argv))

    if parser.options.write_pidfile and results:
        pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label,
                                                  results)
        pid_file_manager.open_file()
    else:
        pid_file_manager = None

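    # Configure whether the client-side autotest should be installed into a
    # temporary directory on the test machine (the install_in_tmpdir option).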
    autotest.BaseAutotest.set_install_in_tmpdir(
        parser.options.install_in_tmpdir)

    timer = None
    try:
        # Take the first argument as control file name, get the test name from
        # the control file. If the test name exists in the list of tests with
        # run time measurement enabled, start a timer to begin measurement.
        if (len(parser.args) > 0 and parser.args[0] != '' and
            parser.options.machines):
            try:
                test_name = control_data.parse_control(parser.args[0],
                                                       raise_warnings=True).name
            except control_data.ControlVariableException:
                logging.debug('Failed to retrieve test name from control file.')
                test_name = None
            if test_name in measure_run_time_tests:
                machines = parser.options.machines.replace(',', ' '
                                                           ).strip().split()
                try:
                    afe = frontend.AFE()
                    board = server_utils.get_board_from_afe(machines[0], afe)
                    timer = autotest_stats.Timer('autoserv_run_time.%s.%s' %
                                                 (board, test_name))
                    timer.start()
                except (urllib2.HTTPError, urllib2.URLError):
                    # Ignore error if RPC failed to get board
                    pass
    except control_data.ControlVariableException as e:
        logging.error(str(e))
    exit_code = 0
    # TODO(beeps): Extend this to cover different failure modes.
    # Testing exceptions are matched against labels sent to autoserv. E.g.,
    # to allow only the hostless job to run, specify
    # testing_exceptions: test_suite in the shadow_config. To allow both
    # the hostless job and dummy_Pass to run, specify
    # testing_exceptions: test_suite,dummy_Pass. You can figure out
    # what label autoserv is invoked with by looking through the logs of a test
    # for the autoserv command's -l option.
    testing_exceptions = global_config.global_config.get_config_value(
            'AUTOSERV', 'testing_exceptions', type=list, default=[])
    test_mode = global_config.global_config.get_config_value(
            'AUTOSERV', 'testing_mode', type=bool, default=False)
    test_mode = (results_mocker and test_mode and not
                 any([ex in parser.options.label
                      for ex in testing_exceptions]))
    is_task = (parser.options.verify or parser.options.repair or
               parser.options.provision or parser.options.reset or
               parser.options.cleanup or parser.options.collect_crashinfo)
    try:
        try:
            if test_mode:
                # The parser doesn't run on tasks anyway, so we can just return
                # happy signals without faking results.
                if not is_task:
                    machine = parser.options.results.split('/')[-1]

                    # TODO(beeps): The proper way to do this would be to
                    # refactor job creation so we can invoke job.record
                    # directly. To do that one needs to pipe the test_name
                    # through run_autoserv and bail just before invoking
                    # the server job. See the comment in
                    # puppylab/results_mocker for more context.
                    results_mocker.ResultsMocker(
                            test_name if test_name else 'unknown-test',
                            parser.options.results, machine
                            ).mock_results()
                return
            else:
                run_autoserv(pid_file_manager, results, parser, ssp_url,
                             use_ssp)
        except SystemExit as e:
            exit_code = e.code
            if exit_code:
                logging.exception(e)
        except Exception as e:
            # If we don't know what happened, we'll classify it as
            # an 'abort' and return 1.
            logging.exception(e)
            exit_code = 1
    finally:
        if pid_file_manager:
            pid_file_manager.close_file(exit_code)
        if timer:
            timer.stop()
        # Record the autoserv duration time. Must be called
        # just before the system exits to ensure accuracy.
        duration_secs = (datetime.datetime.now() - start_time).total_seconds()
        record_autoserv(parser.options, duration_secs)
    sys.exit(exit_code)


if __name__ == '__main__':
    main()