1import os, time, logging, shutil
2
3from autotest_lib.client.common_lib import global_config
4from autotest_lib.client.common_lib.cros.graphite import autotest_stats
5from autotest_lib.client.cros import constants
6from autotest_lib.server import utils
7
8
# Resolve the optional site-specific hooks for crashdump and crashinfo
# collection. Both hooks are looked up by name in the same site module; the
# last argument handed to import_site_function is a do-nothing stand-in with
# the same (host, test_start_time) signature (presumably used when the site
# module does not provide the hook -- confirm against import_site_function).
_SITE_CRASHCOLLECT_MODULE = "autotest_lib.server.site_crashcollect"


def _no_site_hook(host, test_start_time):
    """Placeholder hook: accepts the hook arguments and returns None."""
    return None


get_site_crashdumps = utils.import_site_function(
    __file__,
    _SITE_CRASHCOLLECT_MODULE,
    "get_site_crashdumps",
    _no_site_hook)
get_site_crashinfo = utils.import_site_function(
    __file__,
    _SITE_CRASHCOLLECT_MODULE,
    "get_site_crashinfo",
    _no_site_hook)


# Timer whose decorator wraps the crash collection entry points below.
_timer = autotest_stats.Timer('crash_collection')
19
@_timer.decorate
def get_crashdumps(host, test_start_time):
    """Collect crashdumps for a host by delegating to the site hook.

    This is a thin, timed wrapper: all of the work is done by
    get_site_crashdumps, and whatever that hook returns is discarded.

    @param host: The host object handed through to the site hook
    @param test_start_time: Handed through unchanged to the site hook
    """
    get_site_crashdumps(host, test_start_time)
23
24
@_timer.decorate
def get_crashinfo(host, test_start_time):
    """Collect crashdumps and, if the host is reachable, crash information.

    Crashdumps are always collected first. The remaining steps (site hook,
    messages, dmesg, stranded client logs, the log directory, console-ramoops
    and i915_error_state) only run when wait_for_machine_to_recover reports
    that the host is up.

    @param host: The RemoteHost to collect crash information from
    @param test_start_time: Passed through to get_crashdumps and the
        site-specific crashinfo hook
    """
    logging.info("Collecting crash information...")

    # get_crashdumps collects orphaned crashdumps and symbolicates all
    # collected crashdumps. Symbolicating could happen
    # during a postjob task as well, at which time some crashdumps could have
    # already been pulled back from machine. So it doesn't necessarily need
    # to wait for the machine to come up.
    get_crashdumps(host, test_start_time)

    if not wait_for_machine_to_recover(host):
        return

    # run any site-specific collection
    get_site_crashinfo(host, test_start_time)

    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')
    collect_messages(host)
    collect_command(host, "dmesg", os.path.join(crashinfo_dir, "dmesg"))
    collect_uncollected_logs(host)

    # Collect everything in /var/log.
    log_path = os.path.join(crashinfo_dir, 'var')
    # The crashinfo directory may survive from an earlier collection for the
    # same host; an unconditional makedirs would raise OSError here and abort
    # the remaining collection steps.
    if not os.path.isdir(log_path):
        os.makedirs(log_path)
    collect_log_file(host, constants.LOG_DIR, log_path)

    # Collect console-ramoops
    log_path = os.path.join(
            crashinfo_dir, os.path.basename(constants.LOG_CONSOLE_RAMOOPS))
    collect_log_file(host, constants.LOG_CONSOLE_RAMOOPS, log_path)

    # Collect i915_error_state, only available on intel systems.
    # i915 contains the Intel graphics state. It might contain useful data
    # when a DUT hangs, times out or crashes.
    log_path = os.path.join(
            crashinfo_dir, os.path.basename(constants.LOG_I915_ERROR_STATE))
    collect_log_file(host, constants.LOG_I915_ERROR_STATE,
                     log_path, use_tmp=True)
61
62
# Default number of hours to wait for a host before giving up on crash
# collection. Read from the 'crash_collection_hours_to_wait' key of the
# SERVER config section as a float; 4.0 is the default passed to the lookup.
HOURS_TO_WAIT = global_config.global_config.get_config_value(
    'SERVER',
    'crash_collection_hours_to_wait',
    type=float,
    default=4.0,
)
66
67
def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Block until a (possibly down) machine is reachable, or time out.

    If the host is not up right away, host.wait_up is called with a timeout
    of hours_to_wait converted to seconds. A timeout increments the
    'collect_crashinfo_timeout' counter.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine is up or comes back up, False otherwise
    """
    # Captured before probing the host; only reported if we have to wait.
    started_at = time.strftime("%b %d %H:%M:%S", time.localtime())

    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)",
                 hours_to_wait, host.hostname, started_at)
    timeout_seconds = hours_to_wait * 3600
    if host.wait_up(timeout=timeout_seconds):
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True

    autotest_stats.Counter('collect_crashinfo_timeout').increment()
    logging.warning("%s down, unable to collect crash info",
                    host.hostname)
    return False
91
92
def get_crashinfo_dir(host, dir_prefix):
    """Return the crashinfo directory for a host, creating it if missing.

    The directory is named "<dir_prefix>.<hostname>" and lives under the
    host's job resultdir when one is set; otherwise it lives under the
    current working directory.

    @param host: The RemoteHost object that crashinfo will be collected from
    @param dir_prefix: Prefix of directory name.

    @returns: The path to an existing directory for writing crashinfo into
    """
    # host may have no job, and the job may have no (or an empty) resultdir.
    job = getattr(host, "job", None)
    parent_dir = (getattr(job, "resultdir", None)
                  or os.path.abspath(os.getcwd()))

    dir_name = "%s.%s" % (dir_prefix, host.hostname)
    crashinfo_path = os.path.join(parent_dir, dir_name)
    if not os.path.exists(crashinfo_path):
        os.mkdir(crashinfo_path)
    return crashinfo_path
110
111
def collect_log_file(host, log_path, dest_path, use_tmp=False):
    """Collects a log file from the remote machine.

    Log files are collected from the remote machine and written into the
    destination path. If dest_path is a directory, the log file will be named
    using the basename of the remote log path.

    Collection is best-effort: any exception is logged as a warning and not
    propagated to the caller.

    @param host: The RemoteHost to collect logs from
    @param log_path: The remote path to collect the log file from
    @param dest_path: A path (file or directory) to write the copied logs into
    @param use_tmp: If True, will first copy the logs to a temporary directory
                    on the host and download logs from there.

    """
    logging.info('Collecting %s...', log_path)
    try:
        if not use_tmp:
            host.get_file(log_path, dest_path, preserve_perm=False)
            return

        # The tee target is only needed for the mktemp call; close it as soon
        # as that call returns instead of leaking the file handle.
        with open(os.devnull, 'w') as devnull:
            tmpdir = host.run('mktemp -d', stdout_tee=devnull).stdout.strip()
        try:
            host.run('cp -rp %s %s' % (log_path, tmpdir))
            source_path = os.path.join(tmpdir, os.path.basename(log_path))
            host.get_file(source_path, dest_path, preserve_perm=False)
        finally:
            # Remove the remote temporary directory even when the copy or
            # the download failed, so failed collections leave nothing behind.
            host.run('rm -rf %s' % tmpdir)
    except Exception as e:
        logging.warning('Collection of %s failed: %s', log_path, e)
139
140
def collect_command(host, command, dest_path):
    """Collects the result of a command on the remote machine.

    The standard output of the command will be collected and written into the
    destination path. The destination path is assumed to be a filename and
    not a directory.

    Collection is best-effort: any exception raised while running the command
    or writing its output is logged as a warning and not propagated.

    @param host: The RemoteHost to collect from
    @param command: A shell command to run on the remote machine and capture
        the output from.
    @param dest_path: A file path to write the results of the log into
    """
    logging.info("Collecting '%s' ...", command)
    # The context manager closes the tee target on every exit path.
    with open(os.devnull, "w") as devnull:
        try:
            result = host.run(command, stdout_tee=devnull).stdout
            utils.open_write_close(dest_path, result)
        except Exception as e:
            logging.warning("Collection of '%s' failed:\n%s", command, e)
163
164
def collect_uncollected_logs(host):
    """Collects any leftover uncollected logs from the client.

    Only log entries whose hostname matches host.hostname are fetched. Hosts
    without a job are skipped. Collection is best-effort: any exception is
    logged as a warning and not propagated.

    @param host: The RemoteHost to collect from
    """
    # Tolerate hosts that have no job attribute at all, the same way
    # get_crashinfo_dir does, rather than raising AttributeError.
    job = getattr(host, "job", None)
    if not job:
        return

    try:
        for hostname, remote_path, local_path in job.get_client_logs():
            if hostname != host.hostname:
                continue
            logging.info("Retrieving logs from %s:%s into %s",
                         hostname, remote_path, local_path)
            host.get_file(remote_path + "/", local_path + "/")
    except Exception as e:
        logging.warning("Error while trying to collect stranded "
                        "Autotest client logs: %s", e)
181
182
def collect_messages(host):
    """Collects the 'new' contents of /var/log/messages.

    If host.VAR_LOG_MESSAGES_COPY_PATH is on the remote machine, collects
    the contents of /var/log/messages excluding whatever initial contents
    are already present in host.VAR_LOG_MESSAGES_COPY_PATH. If it is not
    present, simply collects the entire contents of /var/log/messages.

    The result is written to "messages" inside the host's crashinfo
    directory. Failures are logged as a warning and not propagated.

    @param host: The RemoteHost to collect from
    """
    crashinfo_dir = get_crashinfo_dir(host, 'crashinfo')

    try:
        # paths to the messages files
        messages = os.path.join(crashinfo_dir, "messages")
        messages_raw = os.path.join(crashinfo_dir, "messages.raw")
        messages_at_start = os.path.join(crashinfo_dir, "messages.at_start")

        # grab the files from the remote host
        collect_log_file(host, host.VAR_LOG_MESSAGES_COPY_PATH,
                         messages_at_start)
        collect_log_file(host, "/var/log/messages", messages_raw)

        # Figure out how much of messages.raw to skip. The first line of the
        # copy taken at start should match the first line of the current
        # messages; if it doesn't, messages has been erased or rotated and
        # we just grab all of it.
        size_at_start = 0
        if os.path.exists(messages_at_start):
            first_line_at_start = utils.read_one_line(messages_at_start)
            first_line_now = utils.read_one_line(messages_raw)
            if first_line_at_start == first_line_now:
                size_at_start = os.path.getsize(messages_at_start)

        # size_at_start is a byte count, so seek and copy in binary mode.
        # The context managers close both files even if the copy fails.
        with open(messages_raw, "rb") as raw_messages_file:
            with open(messages, "wb") as messages_file:
                raw_messages_file.seek(size_at_start)
                shutil.copyfileobj(raw_messages_file, messages_file)

        # Get rid of the "raw" versions of messages. This only happens after
        # a successful copy, so a failed run keeps the raw data on disk.
        os.remove(messages_raw)
        if os.path.exists(messages_at_start):
            os.remove(messages_at_start)
    except Exception as e:
        logging.warning("Error while collecting /var/log/messages: %s", e)
232