autotest/server/autoserv.py

#!/usr/bin/python -u
# Copyright 2007-2008 Martin J. Bligh <mbligh@google.com>, Google Inc.
# Released under the GPL v2

"""
Run a control file through the server side engine
"""

import ast
import datetime
import getpass
import logging
import os
import re
import signal
import sys
import traceback
import time
import urllib2

import common

from autotest_lib.client.common_lib import control_data
from autotest_lib.client.common_lib import global_config
try:
    from autotest_lib.puppylab import results_mocker
except ImportError:
    results_mocker = None

require_atfork = global_config.global_config.get_config_value(
        'AUTOSERV', 'require_atfork_module', type=bool, default=True)


# Number of seconds to wait before returning if testing mode is enabled
TESTING_MODE_SLEEP_SECS = 1


try:
    import atfork
    atfork.monkeypatch_os_fork_functions()
    import atfork.stdlib_fixer
    # Fix the Python standard library for threading+fork safety with its
    # internal locks.  http://code.google.com/p/python-atfork/
    import warnings
    warnings.filterwarnings('ignore', 'logging module already imported')
    atfork.stdlib_fixer.fix_logging_module()
except ImportError, e:
    from autotest_lib.client.common_lib import global_config
    if global_config.global_config.get_config_value(
            'AUTOSERV', 'require_atfork_module', type=bool, default=False):
        print >>sys.stderr, 'Please run utils/build_externals.py'
        print e
        sys.exit(1)

from autotest_lib.server import frontend
from autotest_lib.server import server_logging_config
from autotest_lib.server import server_job, utils, autoserv_parser, autotest
from autotest_lib.server import utils as server_utils
from autotest_lib.site_utils import job_overhead
from autotest_lib.client.common_lib import pidfile, logging_manager
from autotest_lib.client.common_lib.cros.graphite import stats

def log_alarm(signum, frame):
    logging.error("Received SIGALARM. Ignoring and continuing on.")
    sys.exit(1)

def run_autoserv(pid_file_manager, results, parser):
    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group
    os.setpgrp()

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')
        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not avaliable to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        pass

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being set
    # but depending on how you launch your autotest scheduler it may not be set.
    os.environ['USER'] = getpass.getuser()

    if parser.options.machines:
        machines = parser.options.machines.replace(',', ' ').strip().split()
    else:
        machines = []
    machines_file = parser.options.machines_file
    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    install_before = parser.options.install_before
    install_after = parser.options.install_after
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    no_tee = parser.options.no_tee
    parse_job = parser.options.parse_job
    execution_tag = parser.options.execution_tag
    if not execution_tag:
        execution_tag = parse_job
    host_protection = parser.options.host_protection
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    test_retry = parser.options.test_retry
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error("Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo or
                       provision or reset)
    if len(parser.args) < 1 and not is_special_task:
        parser.parser.error("Missing argument: control file")

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    # We have a control file unless it's just a verify/repair/cleanup job
    if len(parser.args) > 0:
        control = parser.args[0]
    else:
        control = None

    if machines_file:
        machines = []
        for m in open(machines_file, 'r').readlines():
            # remove comments, spaces
            m = re.sub('#.*', '', m).strip()
            if m:
                machines.append(m)
        print "Read list of machines from file: %s" % machines_file
        print ','.join(machines)

    if machines:
        for machine in machines:
            if not machine or re.search('\s', machine):
                parser.parser.error("Invalid machine: %s" % str(machine))
        machines = list(set(machines))
        machines.sort()

    if group_name and len(machines) < 2:
        parser.parser.error("-G %r may only be supplied with more than one machine."
               % group_name)

    kwargs = {'group_name': group_name, 'tag': execution_tag,
              'disable_sysinfo': parser.options.disable_sysinfo}
    if control_filename:
        kwargs['control_filename'] = control_filename
    job = server_job.server_job(control, parser.args[1:], results, label,
                                user, machines, client, parse_job,
                                ssh_user, ssh_port, ssh_pass,
                                ssh_verbosity_flag, ssh_options,
                                test_retry, **kwargs)
    job.logging.start_logging()
    job.init_parser()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    try:
        try:
            if repair:
                job.repair(host_protection, job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                job.run(install_before, install_after,
                        verify_job_repo_url=verify_job_repo_url,
                        only_collect_crashinfo=collect_crashinfo,
                        skip_crash_collection=skip_crash_collection,
                        job_labels=job_labels)
        finally:
            while job.hosts:
                host = job.hosts.pop()
                host.close()
    except:
        exit_code = 1
        traceback.print_exc()

    if pid_file_manager:
        pid_file_manager.num_tests_failed = job.num_tests_failed
        pid_file_manager.close_file(exit_code)
    job.cleanup_parser()

    sys.exit(exit_code)


def _get_job_id_or_task_id(result_dir, machine, is_special_task):
    """Extract job id or special task id from result_dir

    @param result_dir: path to the result dir.
    @param machine: hostname of the machine.
    @param is_special_task: True/False, whether it is a special task.

    @returns: integer representing the job id or task id.
    """
    if not result_dir:
        return
    result_dir = os.path.abspath(result_dir)
    if is_special_task:
        # special task result dir is like
        # /usr/local/autotest/results/hosts/chromeos1-rack5-host6/1343-cleanup
        pattern = '.*/hosts/%s/(\d+)-[^/]+' % machine
    else:
        # non-special task result dir is like
        # /usr/local/autotest/results/2032-chromeos-test/chromeos1-rack5-host6
        pattern ='.*/(\d+)-[^/]+/%s' % machine
    m = re.match(pattern, result_dir)
    return int(m.group(1)) if m else None


def record_autoserv(options, duration_secs):
    """Record autoserv end-to-end time in metadata db.

    @param options: parser options.
    @param duration_secs: How long autoserv has taken, in secs.
    """
    # Get machine hostname
    machines = options.machines.replace(
            ',', ' ').strip().split() if options.machines else []
    num_machines = len(machines)
    if num_machines > 1:
        # Skip the case where atomic group is used.
        return
    elif num_machines == 0:
        machines.append('hostless')

    # Determine the status that will be reported.
    s = job_overhead.STATUS
    task_mapping = {
            'reset': s.RESETTING, 'verify': s.VERIFYING,
            'provision': s.PROVISIONING, 'repair': s.REPAIRING,
            'cleanup': s.CLEANING, 'collect_crashinfo': s.GATHERING}
    # option_dict will be like {'reset': True, 'repair': False, ...}
    option_dict = ast.literal_eval(str(options))
    match = filter(lambda task: option_dict.get(task) == True, task_mapping)
    status = task_mapping[match[0]] if match else s.RUNNING
    is_special_task = status not in [s.RUNNING, s.GATHERING]
    job_or_task_id = _get_job_id_or_task_id(
            options.results, machines[0], is_special_task)
    job_overhead.record_state_duration(
            job_or_task_id, machines[0], status, duration_secs,
            is_special_task=is_special_task)


def main():
    start_time = datetime.datetime.now()
    # White list of tests with run time measurement enabled.
    measure_run_time_tests_names = global_config.global_config.get_config_value(
                        'AUTOSERV', 'measure_run_time_tests', type=str)
    if measure_run_time_tests_names:
        measure_run_time_tests = [t.strip() for t in
                                  measure_run_time_tests_names.split(',')]
    else:
        measure_run_time_tests = []
    # grab the parser
    parser = autoserv_parser.autoserv_parser
    parser.parse_args()

    if len(sys.argv) == 1:
        parser.parser.print_help()
        sys.exit(1)

    if parser.options.no_logging:
        results = None
    else:
        results = parser.options.results
        if not results:
            results = 'results.' + time.strftime('%Y-%m-%d-%H.%M.%S')
        results  = os.path.abspath(results)
        resultdir_exists = False
        for filename in ('control.srv', 'status.log', '.autoserv_execute'):
            if os.path.exists(os.path.join(results, filename)):
                resultdir_exists = True
        if not parser.options.use_existing_results and resultdir_exists:
            error = "Error: results directory already exists: %s\n" % results
            sys.stderr.write(error)
            sys.exit(1)

        # Now that we certified that there's no leftover results dir from
        # previous jobs, lets create the result dir since the logging system
        # needs to create the log file in there.
        if not os.path.isdir(results):
            os.makedirs(results)

    logging_manager.configure_logging(
            server_logging_config.ServerLoggingConfig(), results_dir=results,
            use_console=not parser.options.no_tee,
            verbose=parser.options.verbose,
            no_console_prefix=parser.options.no_console_prefix)
    if results:
        logging.info("Results placed in %s" % results)

        # wait until now to perform this check, so it get properly logged
        if parser.options.use_existing_results and not resultdir_exists:
            logging.error("No existing results directory found: %s", results)
            sys.exit(1)

    logging.debug('autoserv command was: %s', ' '.join(sys.argv))

    if parser.options.write_pidfile:
        pid_file_manager = pidfile.PidFileManager(parser.options.pidfile_label,
                                                  results)
        pid_file_manager.open_file()
    else:
        pid_file_manager = None

    autotest.BaseAutotest.set_install_in_tmpdir(
        parser.options.install_in_tmpdir)

    timer = None
    try:
        # Take the first argument as control file name, get the test name from
        # the control file. If the test name exists in the list of tests with
        # run time measurement enabled, start a timer to begin measurement.
        if (len(parser.args) > 0 and parser.args[0] != '' and
            parser.options.machines):
            try:
                test_name = control_data.parse_control(parser.args[0],
                                                       raise_warnings=True).name
            except control_data.ControlVariableException:
                logging.debug('Failed to retrieve test name from control file.')
                test_name = None
            if test_name in measure_run_time_tests:
                machines = parser.options.machines.replace(',', ' '
                                                           ).strip().split()
                try:
                    afe = frontend.AFE()
                    board = server_utils.get_board_from_afe(machines[0], afe)
                    timer = stats.Timer('autoserv_run_time.%s.%s' %
                                        (board, test_name))
                    timer.start()
                except (urllib2.HTTPError, urllib2.URLError):
                    # Ignore error if RPC failed to get board
                    pass
    except control_data.ControlVariableException as e:
        logging.error(str(e))
    exit_code = 0
    # TODO(beeps): Extend this to cover different failure modes.
    # Testing exceptions are matched against labels sent to autoserv. Eg,
    # to allow only the hostless job to run, specify
    # testing_exceptions: test_suite in the shadow_config. To allow both
    # the hostless job and dummy_Pass to run, specify
    # testing_exceptions: test_suite,dummy_Pass. You can figure out
    # what label autoserv is invoked with by looking through the logs of a test
    # for the autoserv command's -l option.
    testing_exceptions = global_config.global_config.get_config_value(
            'AUTOSERV', 'testing_exceptions', type=list, default=[])
    test_mode = global_config.global_config.get_config_value(
            'AUTOSERV', 'testing_mode', type=bool, default=False)
    test_mode = (results_mocker and test_mode and not
                 any([ex in parser.options.label
                      for ex in testing_exceptions]))
    is_task = (parser.options.verify or parser.options.repair or
               parser.options.provision or parser.options.reset or
               parser.options.cleanup or parser.options.collect_crashinfo)
    try:
        try:
            if test_mode:
                # The parser doesn't run on tasks anyway, so we can just return
                # happy signals without faking results.
                if not is_task:
                    machine = parser.options.results.split('/')[-1]

                    # TODO(beeps): The proper way to do this would be to
                    # refactor job creation so we can invoke job.record
                    # directly. To do that one needs to pipe the test_name
                    # through run_autoserv and bail just before invoking
                    # the server job. See the comment in
                    # puppylab/results_mocker for more context.
                    results_mocker.ResultsMocker(
                            test_name if test_name else 'unknown-test',
                            parser.options.results, machine
                            ).mock_results()
                return
            else:
                run_autoserv(pid_file_manager, results, parser)
        except SystemExit as e:
            exit_code = e.code
            if exit_code:
                logging.exception(e)
        except Exception as e:
            # If we don't know what happened, we'll classify it as
            # an 'abort' and return 1.
            logging.exception(e)
            exit_code = 1
    finally:
        if pid_file_manager:
            pid_file_manager.close_file(exit_code)
        if timer:
            timer.stop()
        # Record the autoserv duration time. Must be called
        # just before the system exits to ensure accuracy.
        duration_secs = (datetime.datetime.now() - start_time).total_seconds()
        record_autoserv(parser.options, duration_secs)
    sys.exit(exit_code)


if __name__ == '__main__':
    main()