import os, time, logging, shutil

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.client.cros import constants
from autotest_lib.server import utils


# import any site hooks for the crashdump and crashinfo collection
get_site_crashdumps = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashdumps",
    lambda host, test_start_time: None)
get_site_crashinfo = utils.import_site_function(
    __file__, "autotest_lib.server.site_crashcollect", "get_site_crashinfo",
    lambda host, test_start_time: None)


_timer = autotest_stats.Timer('crash_collection')

@_timer.decorate
def get_crashdumps(host, test_start_time):
    """Run the site-specific crashdump collection hook.

    @param host: The RemoteHost to collect crashdumps from
    @param test_start_time: Passed through unchanged to the site hook
    """
    get_site_crashdumps(host, test_start_time)


@_timer.decorate
def get_crashinfo(host, test_start_time):
    """Collect crashdumps and, if the host is reachable, crash logs.

    Crashdumps are collected first, without waiting for the host. The
    remaining collection only happens when wait_for_machine_to_recover()
    reports that the host is up.

    @param host: The RemoteHost to collect crash information from
    @param test_start_time: Passed through unchanged to the site hooks
    """
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolicating could happen
    # during a postjob task as well, at which time some crashdumps could have
    # already been pulled back from machine. So it doesn't necessarily need
    # to wait for the machine to come up.
    get_crashdumps(host, test_start_time)

    if wait_for_machine_to_recover(host):
        # run any site-specific collection
        get_site_crashinfo(host, test_start_time)

        crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
        collect_messages(host)
        collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
        collect_uncollected_logs(host)

        # Collect everything in /var/log.
        log_path = os.path.join(crashinfo_dir, 'var')
        # The directory may be left over from an earlier collection into the
        # same results directory; os.makedirs would raise in that case and
        # abort the rest of the collection.
        if not os.path.exists(log_path):
            os.makedirs(log_path)
        collect_log_file(host, constants.LOG_DIR, log_path)

        # Collect console-ramoops
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_CONSOLE_RAMOOPS))
        collect_log_file(host, constants.LOG_CONSOLE_RAMOOPS, log_path)
        # Collect i915_error_state, only available on intel systems.
        # i915 contains the Intel graphics state. It might contain useful data
        # when a DUT hangs, times out or crashes.
        log_path = os.path.join(
                crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
        collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                         log_path, use_tmp=True)


# Load default for number of hours to wait before giving up on crash collection.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER', 'crash_collection_hours_to_wait', type=float, default=4.0)


def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Wait for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine comes back up, False otherwise
    """
    current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, current_time)
    # wait_up takes its timeout in seconds.
    if not host.wait_up(timeout=hours_to_wait * 3600):
        autotest_stats.Counter('collect_crashinfo_timeout').increment()
        logging.warning("%s down, unable to collect crash info",
                        host.hostname)
        return False
    else:
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True


def get_crashinfo_dir(host, dir_prefix):
    """Find and if necessary create a directory to store crashinfo in.

    The directory is named "<dir_prefix>.<hostname>" and lives under the
    host job's resultdir when there is one, otherwise under the current
    working directory.

    @param host: The RemoteHost object that crashinfo will be collected from
    @param dir_prefix: Prefix of directory name.

    @returns: The path to an existing directory for writing crashinfo into
    """
    # host.job may be missing or None, and so may its resultdir.
    host_resultdir = getattr(getattr(host, "job", None), "resultdir", None)
    if host_resultdir:
        infodir = host_resultdir
    else:
        infodir = os.path.abspath(os.getcwd())
    infodir = os.path.join(infodir, "%s.%s" % (dir_prefix, host.hostname))
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    return infodir


def collect_log_file(host, log_path, dest_path, use_tmp=False):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Any failure is logged as a warning and not raised.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    @param use_tmp: If True, will first copy the logs to a temporary directory
                    on the host and download logs from there.

    """
    logging.info('Collecting %s...', log_path)
    try:
        source_path = log_path
        tmpdir = None
        try:
            if use_tmp:
                devnull = open('/dev/null', 'w')
                try:
                    tmpdir = host.run('mktemp -d',
                                      stdout_tee=devnull).stdout.strip()
                finally:
                    # Only needed while mktemp runs; do not leak the handle.
                    devnull.close()
                host.run('cp -rp %s %s' % (log_path, tmpdir))
                source_path = os.path.join(tmpdir, os.path.basename(log_path))
            host.get_file(source_path, dest_path, preserve_perm=False)
        finally:
            # Remove the remote temporary directory even when the copy or
            # the download failed, so failed collections leave nothing behind.
            if tmpdir:
                host.run('rm -rf %s' % tmpdir)
    except Exception as e:
        logging.warning('Collection of %s failed: %s', log_path, e)


def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be filename and
    not a directory.

    Any failure is logged as a warning and not raised.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
                    the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    devnull = open("/dev/null", "w")
    try:
        result = host.run(command, stdout_tee=devnull).stdout
        utils.open_write_close(dest_path, result)
    except Exception as e:
        logging.warning("Collection of '%s' failed:\n%s", command, e)
    finally:
        devnull.close()


def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    Only log entries whose hostname matches this host are retrieved. Any
    failure is logged as a warning and not raised.

    @param host: The RemoteHost to collect from
    """
    if host.job:
        try:
            logs = host.job.get_client_logs()
            for hostname, remote_path, local_path in logs:
                if hostname == host.hostname:
                    logging.info("Retrieving logs from %s:%s into %s",
                                 hostname, remote_path, local_path)
                    host.get_file(remote_path + "/", local_path + "/")
        except Exception as e:
            logging.warning("Error while trying to collect stranded "
                            "Autotest client logs: %s", e)


def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    Any failure is logged as a warning and not raised.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # figure out how much of messages.raw to skip
        if os.path.exists(messages_at_start):
            # the first line of the messages at start should match the
            # first line of the current messages; if they don't then messages
            # has been erased or rotated and we just grab all of it
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start != first_line_now:
                size_at_start = 0
            else:
                size_at_start = os.path.getsize(messages_at_start)
        else:
            size_at_start = 0

        # Copy everything after the skipped prefix; the with blocks close
        # both files even if the seek or the copy raises.
        with open(messages_raw) as raw_messages_file:
            with open(messages, "w") as messages_file:
                raw_messages_file.seek(size_at_start)
                shutil.copyfileobj(raw_messages_file, messages_file)

        # get rid of the "raw" versions of messages
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)