1b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 2b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# Use of this source code is governed by a BSD-style license that can be 3b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# found in the LICENSE file. 4b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 5b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# This file lets us test the repair supporting code. 6b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# We could not easily unit test it if it was in the repair file as it makes 7b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood# a function call that is not protected by a __name__ == ??? guard. 8b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 9cbb6306ae5a7bffecfb64d42d503305709aace2ebeepsimport datetime, getpass, logging, operator, smtplib, urllib2, xmlrpclib 10b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 11b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hoodimport common 12b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 13cbb6306ae5a7bffecfb64d42d503305709aace2ebeepsfrom autotest_lib.client.common_lib import global_config, mail, logging_config 14b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hoodfrom autotest_lib.server import frontend 15cbb6306ae5a7bffecfb64d42d503305709aace2ebeepsfrom autotest_lib.server.cros.dynamic_suite import reporting 16cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps 17cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps 18cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps# Receiver and sender information, if we need to send an email 19cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps_NOTIFY_ADDRESS = global_config.global_config.get_config_value( 20cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps 'SCHEDULER', 'notify_email_errors', default='') 21cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps_SENDER_ADDRESS = global_config.global_config.get_config_value( 22cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps 'SCHEDULER', "notify_email_from", default=getpass.getuser()) 23b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 243da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood# Ignore any jobs that were ran more than this many mins past the max job 253da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood# timeout. 263da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood_CUTOFF_AFTER_TIMEOUT_MINS = 60 27cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps_DEFAULT_TEST_TIMEOUT_MINS = global_config.global_config.get_config_value( 28cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps 'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int, 29cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps default=0) 30b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 31b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 32b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hoodclass MachineDeathLogger(logging_config.LoggingConfig): 33b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood """ 34b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood Used to log information about a machine going into the Repair Failed state. 35b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 36b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood We use this so that if the default log location ever changes it will also 37b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood change for this logger and to keep this information separate from the 38b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood other logs. 39b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 40b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood """ 41b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood file_formatter = logging.Formatter(fmt='%(asctime)s | %(message)s', 42b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood datefmt='%m/%d %H:%M:%S') 43cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps LOGFILE_NAME = 'machine_death.log' 44b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 45b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood def __init__(self): 46b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood super(MachineDeathLogger, self).__init__(False) 47b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood self.logger = logging.getLogger('machine_death') 48b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 49b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood super(MachineDeathLogger, self).configure_logging(use_console=False) 50b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood log_dir = self.get_server_log_dir() 51cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps self.add_file_handler(self.LOGFILE_NAME, logging.ERROR, 52cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps log_dir=log_dir) 53b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 54b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 55b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hooddef _find_problem_test(machine, rpc): 56b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood """ 57cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps Find the last job that ran on the machine. 58cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps 59cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps Go as far back as _DEFAULT_TEST_TIMEOUT_MINS + _CUTOFF_AFTER_TIMEOUT_MINS. 60cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps If global_config doesn't have a job_max_runtime_mins_default we will search 61cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps only as far as _CUTOFF_AFTER_TIMEOUT_MINS. 62b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 63b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood @param machine: The hostname (e.g. IP address) of the machine to find the 64b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood last ran job on it. 65b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 66b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood @param rpc: The rpc object to contact the server with. 67b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 68b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood @return the job status dictionary for the job that last ran on the machine 69b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood or None if there is no such job. 70b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood """ 71cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps 72b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood # Going through the RPC interface means we cannot use the latest() django 733da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood # QuerySet function. So we will instead look at the past 743da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood # job_max_runtime_mins_default plus _CUTOFF_AFTER_TIMEOUT_MINS 753da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood # and pick the most recent run from there. 76b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood cutoff = (datetime.datetime.today() - 77cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps datetime.timedelta(minutes=_DEFAULT_TEST_TIMEOUT_MINS) - 783da07f9bc1d3784182d572e0c7f4e5a02a9c8b31Keyar Hood datetime.timedelta(minutes=_CUTOFF_AFTER_TIMEOUT_MINS)) 79b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 80b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood results = rpc.run('get_host_queue_entries', host__hostname=machine, 81b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood started_on__gte=str(cutoff)) 82b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 83b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood if results: 84b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood return max(results, key=operator.itemgetter('started_on')) 85b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood else: 86b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood return None 87b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 88b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 89b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hooddef flag_problem_test(machine): 90b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood """ 91b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood Notify people about the last job that ran on a machine. 92b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 93cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps This method is invoked everytime a machine fails to repair, and attempts 94cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps to identify the last test that ran on the machine. If successfull, it files 95cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps a bug, or sends out an email, or just logs the fact. 96b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 97b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood @param machine: The hostname (e.g. IP address) of the machine to find the 98b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood last job ran on it. 99b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 100b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood """ 101b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood rpc = frontend.AFE() 102cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps logger = MachineDeathLogger() 103b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood 104b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood try: 105b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood problem_test = _find_problem_test(machine, rpc) 106cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps except (urllib2.URLError, xmlrpclib.ProtocolError): 107b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood logger.logger.error('%s | ERROR: Could not contact RPC server' 108b6edeb0ae34f912254cc9bec4837ca965e61b290Keyar Hood % machine) 109cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps return 110cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps 111cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps if problem_test: 112c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng job_id = problem_test['job']['id'] 113cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps job_name = problem_test['job']['name'] 114c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng bug = reporting.MachineKillerBug(job_id=job_id, 115c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng job_name=job_name, 116c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng machine=machine) 117c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng reporter = reporting.Reporter() 118c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng bug_id = reporter.report(bug)[0] 119cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps 120cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps if bug_id is None: 121cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps try: 122cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps email_prefix = ('The following test is killing a machine, ' 123cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps 'could not file a bug to report this:\n\n') 124cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps mail.send(_SENDER_ADDRESS, _NOTIFY_ADDRESS, '', 125c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng bug.title(), email_prefix + bug.summary()) 126cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps except smtplib.SMTPDataError: 127cbb6306ae5a7bffecfb64d42d503305709aace2ebeeps logger.logger.error('%s | %d | %s' 128c8cbae23a1be49f4ef766284d77b1b6d60744d2bFang Deng % (machine, job_id, job_name)) 129