1b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# Use of this source code is governed by a BSD-style license that can be
3b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# found in the LICENSE file.
4b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
5b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# This file lets us test the repair supporting code.
6b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# We could not easily unit test it if it was in the repair file as it makes
7b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# a function call that is not protected by a __name__ == ??? guard.
8b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
9cbb6306ae5a7bffecfb64d42d503305709aace2ebeepsimport datetime, getpass, logging, operator, smtplib, urllib2, xmlrpclib
10b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
11b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hoodimport common
12b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
13cbb6306ae5a7bffecfb64d42d503305709aace2ebeepsfrom autotest_lib.client.common_lib import global_config, mail, logging_config
14b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hoodfrom autotest_lib.server import frontend
15cbb6306ae5a7bffecfb64d42d503305709aace2ebeepsfrom autotest_lib.server.cros.dynamic_suite import reporting
16cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps
17cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps
18cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps# Receiver and sender information, if we need to send an email
19cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps_NOTIFY_ADDRESS = global_config.global_config.get_config_value(
20cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    'SCHEDULER', 'notify_email_errors', default='')
21cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps_SENDER_ADDRESS = global_config.global_config.get_config_value(
22cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    'SCHEDULER', "notify_email_from", default=getpass.getuser())
23b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
243da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood# Ignore any jobs that were ran more than this many mins past the max job
253da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood# timeout.
263da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood_CUTOFF_AFTER_TIMEOUT_MINS = 60
27cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps_DEFAULT_TEST_TIMEOUT_MINS = global_config.global_config.get_config_value(
28cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int,
29cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    default=0)
30b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
31b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
32b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hoodclass MachineDeathLogger(logging_config.LoggingConfig):
33b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    """
34b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    Used to log information about a machine going into the Repair Failed state.
35b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
36b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    We use this so that if the default log location ever changes it will also
37b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    change for this logger and to keep this information separate from the
38b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    other logs.
39b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
40b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    """
41b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    file_formatter = logging.Formatter(fmt='%(asctime)s | %(message)s',
42b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood                                       datefmt='%m/%d %H:%M:%S')
43cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    LOGFILE_NAME = 'machine_death.log'
44b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
45b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    def __init__(self):
46b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        super(MachineDeathLogger, self).__init__(False)
47b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        self.logger = logging.getLogger('machine_death')
48b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
49b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        super(MachineDeathLogger, self).configure_logging(use_console=False)
50b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        log_dir = self.get_server_log_dir()
51cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps        self.add_file_handler(self.LOGFILE_NAME, logging.ERROR,
52cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps                              log_dir=log_dir)
53b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
54b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
55b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hooddef _find_problem_test(machine, rpc):
56b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    """
57cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    Find the last job that ran on the machine.
58cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps
59cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    Go as far back as _DEFAULT_TEST_TIMEOUT_MINS + _CUTOFF_AFTER_TIMEOUT_MINS.
60cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    If global_config doesn't have a job_max_runtime_mins_default we will search
61cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    only as far as _CUTOFF_AFTER_TIMEOUT_MINS.
62b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
63b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    @param machine: The hostname (e.g. IP address) of the machine to find the
64b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        last ran job on it.
65b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
66b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    @param rpc: The rpc object to contact the server with.
67b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
68b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    @return the job status dictionary for the job that last ran on the machine
69b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        or None if there is no such job.
70b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    """
71cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps
72b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    # Going through the RPC interface means we cannot use the latest() django
733da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood    # QuerySet function. So we will instead look at the past
743da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood    # job_max_runtime_mins_default plus _CUTOFF_AFTER_TIMEOUT_MINS
753da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood    # and pick the most recent run from there.
76b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    cutoff = (datetime.datetime.today() -
77cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps              datetime.timedelta(minutes=_DEFAULT_TEST_TIMEOUT_MINS) -
783da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood              datetime.timedelta(minutes=_CUTOFF_AFTER_TIMEOUT_MINS))
79b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
80b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    results = rpc.run('get_host_queue_entries', host__hostname=machine,
81b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood                      started_on__gte=str(cutoff))
82b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
83b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    if results:
84b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        return max(results, key=operator.itemgetter('started_on'))
85b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    else:
86b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        return None
87b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
88b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
89b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hooddef flag_problem_test(machine):
90b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    """
91b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    Notify people about the last job that ran on a machine.
92b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
93cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    This method is invoked everytime a machine fails to repair, and attempts
94cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    to identify the last test that ran on the machine. If successfull, it files
95cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    a bug, or sends out an email, or just logs the fact.
96b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
97b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    @param machine: The hostname (e.g. IP address) of the machine to find the
98b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        last job ran on it.
99b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
100b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    """
101b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    rpc = frontend.AFE()
102cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    logger = MachineDeathLogger()
103b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood
104b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood    try:
105b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        problem_test = _find_problem_test(machine, rpc)
106cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    except (urllib2.URLError, xmlrpclib.ProtocolError):
107b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood        logger.logger.error('%s | ERROR: Could not contact RPC server'
108b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood                            % machine)
109cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps        return
110cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps
111cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps    if problem_test:
112c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng        job_id = problem_test['job']['id']
113cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps        job_name = problem_test['job']['name']
114c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng        bug = reporting.MachineKillerBug(job_id=job_id,
115c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng                                         job_name=job_name,
116c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng                                         machine=machine)
117c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng        reporter = reporting.Reporter()
118c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng        bug_id = reporter.report(bug)[0]
119cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps
120cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps        if bug_id is None:
121cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps            try:
122cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps                email_prefix = ('The following test is killing a machine, '
123cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps                                'could not file a bug to report this:\n\n')
124cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps                mail.send(_SENDER_ADDRESS, _NOTIFY_ADDRESS, '',
125c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng                          bug.title(), email_prefix + bug.summary())
126cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps            except smtplib.SMTPDataError:
127cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps                logger.logger.error('%s | %d | %s'
128c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng                                    % (machine, job_id, job_name))
129