# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# This file lets us test the repair supporting code.
# We could not easily unit test it if it was in the repair file, as that file
# makes a function call that is not protected by a __name__ guard
# (i.e. `if __name__ == '__main__':`).

import datetime, getpass, logging, operator, smtplib, urllib2, xmlrpclib

import common

from autotest_lib.client.common_lib import global_config, mail, logging_config
from autotest_lib.server import frontend
from autotest_lib.server.cros.dynamic_suite import reporting


# Receiver and sender information, if we need to send an email
_NOTIFY_ADDRESS = global_config.global_config.get_config_value(
    'SCHEDULER', 'notify_email_errors', default='')
_SENDER_ADDRESS = global_config.global_config.get_config_value(
    'SCHEDULER', "notify_email_from", default=getpass.getuser())

# Ignore any jobs that were run more than this many mins past the max job
# timeout.
_CUTOFF_AFTER_TIMEOUT_MINS = 60
_DEFAULT_TEST_TIMEOUT_MINS = global_config.global_config.get_config_value(
    'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int,
    default=0)


class MachineDeathLogger(logging_config.LoggingConfig):
    """
    Used to log information about a machine going into the Repair Failed state.

    We use this so that if the default log location ever changes it will also
    change for this logger and to keep this information separate from the
    other logs.

    """
    # Each record is written as "<month/day hour:min:sec> | <message>".
    file_formatter = logging.Formatter(fmt='%(asctime)s | %(message)s',
                                       datefmt='%m/%d %H:%M:%S')
    LOGFILE_NAME = 'machine_death.log'

    def __init__(self):
        """
        Set up the 'machine_death' logger with a file handler in the server
        log directory. Only records of level ERROR and above are written.
        """
        super(MachineDeathLogger, self).__init__(False)
        self.logger = logging.getLogger('machine_death')

        super(MachineDeathLogger, self).configure_logging(use_console=False)
        log_dir = self.get_server_log_dir()
        # NOTE(review): logging.getLogger() returns the same logger object for
        # the same name, so if add_file_handler() attaches to self.logger,
        # creating several MachineDeathLogger instances in one process may
        # register duplicate handlers -- confirm against LoggingConfig.
        self.add_file_handler(self.LOGFILE_NAME, logging.ERROR,
                              log_dir=log_dir)


def _find_problem_test(machine, rpc):
    """
    Find the last job that ran on the machine.

    Go as far back as _DEFAULT_TEST_TIMEOUT_MINS + _CUTOFF_AFTER_TIMEOUT_MINS.
    If global_config doesn't have a job_max_runtime_mins_default we will search
    only as far as _CUTOFF_AFTER_TIMEOUT_MINS.

    @param machine: The hostname (e.g. IP address) of the machine to find the
        last ran job on it.

    @param rpc: The rpc object to contact the server with.

    @return the job status dictionary for the job that last ran on the machine
        or None if there is no such job.
    """

    # Going through the RPC interface means we cannot use the latest() django
    # QuerySet function. So we will instead look at the past
    # job_max_runtime_mins_default plus _CUTOFF_AFTER_TIMEOUT_MINS
    # and pick the most recent run from there.
    search_window = datetime.timedelta(
        minutes=_DEFAULT_TEST_TIMEOUT_MINS + _CUTOFF_AFTER_TIMEOUT_MINS)
    cutoff = datetime.datetime.today() - search_window

    results = rpc.run('get_host_queue_entries', host__hostname=machine,
                      started_on__gte=str(cutoff))

    if not results:
        return None
    # The entry with the greatest 'started_on' value is the most recent run.
    return max(results, key=operator.itemgetter('started_on'))


def flag_problem_test(machine):
    """
    Notify people about the last job that ran on a machine.

    This method is invoked every time a machine fails to repair, and attempts
    to identify the last test that ran on the machine. If successful, it
    tries to file a bug; if no bug could be filed it sends out an email; and
    if the email cannot be sent either it just logs the fact.

    Nothing is reported when no recent job is found for the machine. If the
    RPC server cannot be contacted, that failure is logged instead.

    @param machine: The hostname (e.g. IP address) of the machine to find the
        last job ran on it.

    """
    rpc = frontend.AFE()
    logger = MachineDeathLogger()

    try:
        problem_test = _find_problem_test(machine, rpc)
    except (urllib2.URLError, xmlrpclib.ProtocolError):
        logger.logger.error('%s | ERROR: Could not contact RPC server',
                            machine)
        return

    if not problem_test:
        return

    job_id = problem_test['job']['id']
    job_name = problem_test['job']['name']
    bug = reporting.MachineKillerBug(job_id=job_id,
                                     job_name=job_name,
                                     machine=machine)
    reporter = reporting.Reporter()
    bug_id = reporter.report(bug)[0]

    if bug_id is not None:
        # The bug was filed; no fallback notification is needed.
        return

    email_prefix = ('The following test is killing a machine, '
                    'could not file a bug to report this:\n\n')
    try:
        mail.send(_SENDER_ADDRESS, _NOTIFY_ADDRESS, '',
                  bug.title(), email_prefix + bug.summary())
    except smtplib.SMTPException:
        # Catch the SMTP base class rather than only SMTPDataError so that
        # any SMTP failure (refused sender/recipient, dropped connection, ...)
        # still ends in the last-resort log entry instead of an unhandled
        # exception. '%s' is used for the job id so that the fallback itself
        # cannot fail on a non-integer id.
        logger.logger.error('%s | %s | %s', machine, job_id, job_name)