autotest/server/site_crashcollect.py

# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import os
import re
from autotest_lib.client.common_lib import utils as client_utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.client.cros import constants
from autotest_lib.server.cros.dynamic_suite.constants import JOB_BUILD_KEY
from autotest_lib.server import utils


CRASH_SERVER_OVERLOAD = 'crash_server_overload'
CRASH_SERVER_FOUND = 'crash_server_found'
SYMBOLICATE_TIMEDOUT = 'symbolicate_timedout'

timer = autotest_stats.Timer('crash_collect')

def generate_minidump_stacktrace(minidump_path):
    """
    Generates a stacktrace for the specified minidump.

    This function expects the debug symbols to reside under:
        /build/<board>/usr/lib/debug

    @param minidump_path: absolute path to minidump to by symbolicated.
    @raise client_utils.error.CmdError if minidump_stackwalk return code != 0.
    """
    symbol_dir = '%s/../../../lib/debug' % utils.get_server_dir()
    logging.info('symbol_dir: %s', symbol_dir)
    client_utils.run('minidump_stackwalk "%s" "%s" > "%s.txt"' %
                     (minidump_path, symbol_dir, minidump_path))


@timer.decorate
def symbolicate_minidump_with_devserver(minidump_path, resultdir):
    """
    Generates a stack trace for the specified minidump by consulting devserver.

    This function assumes the debug symbols have been staged on the devserver.

    @param minidump_path: absolute path to minidump to by symbolicated.
    @param resultdir: server job's result directory.
    @raise DevServerException upon failure, HTTP or otherwise.
    """
    # First, look up what build we tested.  If we can't find this, we can't
    # get the right debug symbols, so we might as well give up right now.
    keyvals = client_utils.read_keyval(resultdir)
    if JOB_BUILD_KEY not in keyvals:
        raise dev_server.DevServerException(
            'Cannot determine build being tested.')

    crashserver_name = dev_server.get_least_loaded_devserver(
            devserver_type=dev_server.CrashServer)
    if not crashserver_name:
        autotest_stats.Counter(CRASH_SERVER_OVERLOAD).increment()
        raise dev_server.DevServerException(
                'No crash server has the capacity to symbolicate the dump.')
    else:
        autotest_stats.Counter(CRASH_SERVER_FOUND).increment()
    devserver = dev_server.CrashServer(crashserver_name)
    trace_text = devserver.symbolicate_dump(
        minidump_path, keyvals[JOB_BUILD_KEY])
    if not trace_text:
        raise dev_server.DevServerException('Unknown error!!')
    with open(minidump_path + '.txt', 'w') as trace_file:
        trace_file.write(trace_text)


def find_and_generate_minidump_stacktraces(host_resultdir):
    """
    Finds all minidump files and generates a stack trace for each.

    Enumerates all files under the test results directory (recursively)
    and generates a stack trace file for the minidumps.  Minidump files are
    identified as files with .dmp extension.  The stack trace filename is
    composed by appending the .txt extension to the minidump filename.

    @param host_resultdir: Directory to walk looking for dmp files.

    @returns The list of generated minidumps.
    """
    minidumps = []
    for dir, subdirs, files in os.walk(host_resultdir):
        for file in files:
            if not file.endswith('.dmp'):
                continue
            minidump = os.path.join(dir, file)

            # First, try to symbolicate locally.
            try:
                generate_minidump_stacktrace(minidump)
                logging.info('Generated stack trace for dump %s', minidump)
                minidumps.append(minidump)
                continue
            except client_utils.error.CmdError as err:
                logging.warning('Failed to generate stack trace locally for '
                             'dump %s (rc=%d):\n%r',
                             minidump, err.result_obj.exit_status, err)

            # If that did not succeed, try to symbolicate using the dev server.
            try:
                logging.info('Generating stack trace for %s', minidump)
                minidumps.append(minidump)
                is_timeout, _ = retry.timeout(
                        symbolicate_minidump_with_devserver,
                        args=(minidump, host_resultdir),
                        timeout_sec=600)
                if is_timeout:
                    logging.warn('Generating stack trace is timed out for dump '
                                 '%s', minidump)
                    autotest_stats.Counter(SYMBOLICATE_TIMEDOUT).increment()
                else:
                    logging.info('Generated stack trace for dump %s', minidump)
                continue
            except dev_server.DevServerException as e:
                logging.warning('Failed to generate stack trace on devserver for '
                             'dump %s:\n%r', minidump, e)
    return minidumps


def fetch_orphaned_crashdumps(host, host_resultdir):
    """
    Copy all of the crashes in the crash directory over to the results folder.

    @param host A host object of the device we're to pull crashes from.
    @param host_resultdir The result directory for this host for this test run.
    @return The list of minidumps that we pulled back from the host.
    """
    minidumps = []
    for file in host.list_files_glob(os.path.join(constants.CRASH_DIR, '*')):
        logging.info('Collecting %s...', file)
        host.get_file(file, host_resultdir, preserve_perm=False)
        minidumps.append(file)
    return minidumps


def get_site_crashdumps(host, test_start_time):
    """
    Copy all of the crashdumps from a host to the results directory.

    @param host The host object from which to pull crashes
    @param test_start_time When the test we just ran started.
    @return A list of all the minidumps
    """
    host_resultdir = getattr(getattr(host, 'job', None), 'resultdir', None)
    infodir = os.path.join(host_resultdir, 'crashinfo.%s' % host.hostname)
    if not os.path.exists(infodir):
        os.mkdir(infodir)

    # TODO(milleral): handle orphans differently. crosbug.com/38202
    try:
        orphans = fetch_orphaned_crashdumps(host, infodir)
    except Exception as e:
        orphans = []
        logging.warning('Collection of orphaned crash dumps failed %s', e)

    minidumps = find_and_generate_minidump_stacktraces(host_resultdir)

    # Record all crashdumps in status.log of the job:
    # - If one server job runs several client jobs we will only record
    # crashdumps in the status.log of the high level server job.
    # - We will record these crashdumps whether or not we successfully
    # symbolicate them.
    if host.job and minidumps or orphans:
        host.job.record('INFO', None, None, 'Start crashcollection record')
        for minidump in minidumps:
            host.job.record('INFO', None, 'New Crash Dump', minidump)
        for orphan in orphans:
            host.job.record('INFO', None, 'Orphaned Crash Dump', orphan)
        host.job.record('INFO', None, None, 'End crashcollection record')

    orphans.extend(minidumps)

    for minidump in orphans:
        report_bug_from_crash(host, minidump)

    return orphans


def find_package_of(host, exec_name):
    """
    Find the package that an executable came from.

    @param host A host object that has the executable.
    @param exec_name Name of or path to executable.
    @return The name of the package that installed the executable.
    """
    # Run "portageq owners" on "host" to determine which package owns
    # "exec_name."  Portageq queue output consists of package names followed
    # tab-prefixed path names.  For example, owners of "python:"
    #
    # sys-devel/gdb-7.7.1-r2
    #         /usr/share/gdb/python
    # chromeos-base/dev-install-0.0.1-r711
    #         /usr/bin/python
    # dev-lang/python-2.7.3-r7
    #         /etc/env.d/python
    #
    # This gets piped into "xargs stat" to annotate each line with
    # information about the path, so we later can consider only packages
    # with executable files.  After annotation the above looks like:
    #
    # stat: cannot stat '@@@ sys-devel/gdb-7.7.1-r2 @@@': ...
    # stat: cannot stat '/usr/share/gdb/python': ...
    # stat: cannot stat '@@@ chromeos-base/dev-install-0.0.1-r711 @@@': ...
    # 755 -rwxr-xr-x /usr/bin/python
    # stat: cannot stat '@@@ dev-lang/python-2.7.3-r7 @@@': ...
    # 755 drwxr-xr-x /etc/env.d/python
    #
    # Package names are surrounded by "@@@" to facilitate parsing.  Lines
    # starting with an octal number were successfully annotated, because
    # the path existed on "host."
    # The above is then parsed to find packages which contain executable files
    # (not directories), in this case "chromeos-base/dev-install-0.0.1-r711."
    #
    # TODO(milleral): portageq can show scary looking error messages
    # in the debug logs via stderr. We only look at stdout, so those
    # get filtered, but it would be good to silence them.
    cmd = ('portageq owners / ' + exec_name +
            r'| sed -e "s/^[^\t].*/@@@ & @@@/" -e "s/^\t//"'
            r'| tr \\n \\0'
            ' | xargs -0 -r stat -L -c "%a %A %n" 2>&1')
    portageq = host.run(cmd, ignore_status=True)

    # Parse into a set of names of packages containing an executable file.
    packages = set()
    pkg = ''
    pkg_re = re.compile('@@@ (.*) @@@')
    path_re = re.compile('^([0-7]{3,}) (.)')
    for line in portageq.stdout.splitlines():
        match = pkg_re.search(line)
        if match:
            pkg = match.group(1)
            continue
        match = path_re.match(line)
        if match:
            isexec = int(match.group(1), 8) & 0o111
            isfile = match.group(2) == '-'
            if pkg and isexec and isfile:
                packages.add(pkg)

    # If exactly one package found it must be the one we want, return it.
    if len(packages) == 1:
        return packages.pop()

    # TODO(milleral): Decide if it really is an error if not exactly one
    # package is found.
    # It is highly questionable as to if this should be left in the
    # production version of this code or not.
    if len(packages) == 0:
        logging.warning('find_package_of() found no packages for "%s"',
                        exec_name)
    else:
        logging.warning('find_package_of() found multiple packages for "%s": '
                        '%s', exec_name, ', '.join(packages))
    return ''


def report_bug_from_crash(host, minidump_path):
    """
    Given a host to query and a minidump, file a bug about the crash.

    @param host A host object that is where the dump came from
    @param minidump_path The path to the dump file that should be reported.
    """
    # TODO(milleral): Once this has actually been tested, remove the
    # try/except. In the meantime, let's make sure nothing dies because of
    # the fact that this code isn't very heavily tested.
    try:
        meta_path = os.path.splitext(minidump_path)[0] + '.meta'
        with open(meta_path, 'r') as f:
            for line in f.readlines():
                parts = line.split('=')
                if parts[0] == 'exec_name':
                    package = find_package_of(host, parts[1].strip())
                    if not package:
                        package = '<unknown package>'
                    logging.info('Would report crash on %s.', package)
                    break
    except Exception as e:
        logging.warning('Crash detection failed with: %s', e)