1#!/usr/bin/env python
2# Copyright 2015 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Cleanup orphaned containers.
7
If an autoserv process dies without getting a chance to run its SIGTERM
handler, the container used to run the test will be orphaned. Orphaned
containers add overhead to the drone. This script cleans up such containers.
11
This module also checks whether the test job associated with a container has
finished. If so, it kills the autoserv process for the test job and destroys
the container. To avoid a race condition, this only applies to jobs that
finished at least 1 hour ago.
16
17"""
18
19import argparse
20import datetime
21import logging
22import os
23import signal
24
25import common
26from autotest_lib.client.common_lib import logging_config
27from autotest_lib.client.common_lib import time_utils
28from autotest_lib.client.common_lib import utils
29from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
30from autotest_lib.site_utils import lxc
31
32
# AFE RPC client used to query the host queue entries of a test job.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# How long a job must have been finished before its container may be treated
# as orphaned. The delay avoids a race in which the scheduler aborts a job
# while autoserv is still busy destroying the container it used.
_FINISHED_JOB_GRACE_PERIOD = datetime.timedelta(hours=1)
# Jobs that finished after this point in time are considered too recent.
# Note that the value is computed once, when the module is loaded.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - _FINISHED_JOB_GRACE_PERIOD
38
def is_container_orphaned(container):
    """Tell whether a test container has been left behind.

    The container is reported as orphaned when either of these holds:
    1. It records the PID of the autoserv process that created it and that
       process is no longer alive.
    2. Every host queue entry of its test job is complete, and none of them
       finished after FINISHED_JOB_CUTOFF_TIME.

    A container without an ID, a job without any host queue entry, and a job
    whose entries cannot be fetched from the AFE are all reported as not
    orphaned.

    @param container: A Container object.

    @return: True if the container is orphaned, False otherwise.

    """
    container_name = container.name
    logging.debug('Checking if container is orphaned: %s', container_name)

    container_id = container.id
    if container_id is None:
        logging.debug('Container %s is not created for test.', container_name)
        return False

    job_id = container_id.job_id
    autoserv_pid = container_id.pid

    # A recorded but dead autoserv process is enough to call it orphaned.
    if autoserv_pid and not utils.pid_is_alive(autoserv_pid):
        logging.debug('Process with PID %s is not alive, '
                      'container %s is orphaned.',
                      autoserv_pid, container_name)
        return True

    try:
        entries = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as e:
        # Be conservative: without job information, keep the container.
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
        return False

    if not entries:
        # No host queue entry exists, i.e., the job has not run yet.
        return False

    for entry in entries:
        finished = not entry.active and entry.complete
        if not finished:
            logging.debug('Test job %s is not completed yet, '
                          'container %s is not orphaned.',
                          job_id, container_name)
            return False

        finished_on = entry.finished_on
        if not finished_on:
            # No finish time recorded; nothing to compare with the cutoff.
            continue
        finish_time = time_utils.time_string_to_datetime(finished_on)
        if finish_time > FINISHED_JOB_CUTOFF_TIME:
            logging.debug('Test job %s was completed less than an hour ago.',
                          job_id)
            return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, container_name)
    return True
88
89
def cleanup(container, options):
    """Kill the owning autoserv process and destroy an orphaned container.

    When options.execute is false, nothing is touched: the intended action is
    only logged (dry run) and False is returned.

    @param container: A Container object to be cleaned up.
    @param options: Options to do cleanup; only `execute` is read here.

    @return: True if the container was destroyed. False on a dry run or when
             any step of the cleanup raised an exception.

    """
    dry_run = not options.execute
    if dry_run:
        logging.info('dryrun: Cleanup container %s', container.name)
        return False

    try:
        # Callers are expected to check is_container_orphaned first, so the
        # container may be assumed to have a valid ID at this point.
        autoserv_pid = container.id.pid

        # Stop autoserv first, if it is still around.
        if autoserv_pid and utils.pid_is_alive(autoserv_pid):
            logging.info('Stopping process %s...', autoserv_pid)
            utils.nuke_pid(int(autoserv_pid), (signal.SIGKILL,))

        # Then remove the container itself.
        logging.info('Destroying container %s...', container.name)
        container.destroy()
    except Exception as e:
        logging.error('Failed to cleanup container %s. Error: %s',
                      container.name, e)
        return False
    return True
120
121
def parse_options():
    """Parse the command line arguments of the script.

    Recognized flags are -v/--verbose, -x/--execute (both booleans defaulting
    to False) and -l/--logfile (a path, defaulting to None).

    @return: Options to run the script.
    """
    verbose_help = 'Print out ALL entries.'
    execute_help = ('Execute the actions to kill autoserv processes and '
                    'destroy containers. Default is False to do dry run')
    logfile_help = 'Path to the log file to save logs.'

    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', default=False,
                        action='store_true', help=verbose_help)
    parser.add_argument('-x', '--execute', default=False,
                        action='store_true', help=execute_help)
    # TODO(dshi): Consider to adopt the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    parser.add_argument('-l', '--logfile', default=None, type=str,
                        help=logfile_help)
    options = parser.parse_args()
    return options
143
144
def main(options):
    """Find and clean up every orphaned container in the container bucket.

    Each container in the bucket is checked with is_container_orphaned; the
    orphaned ones are handed to cleanup. A summary of how many containers were
    and were not cleaned up is logged at the end.

    @param options: Options to run the script. `logfile` and `verbose` are
                    read here; the object is also passed on to cleanup.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        # Send the log to the requested file as well; --verbose lowers the
        # level of that file handler to DEBUG.
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    success_count = 0
    failure_count = 0
    for container in bucket.get_all().values():
        if not is_container_orphaned(container):
            continue
        if cleanup(container, options):
            success_count += 1
        else:
            failure_count += 1
    logging.info('Cleanup finished.')
    # Report the tallies, which were previously computed but never used.
    # cleanup returns False on a dry run, so in that mode every orphaned
    # container is counted as not cleaned up.
    logging.info('Orphaned containers cleaned up: %d, not cleaned up: %d.',
                 success_count, failure_count)
168
169
if __name__ == '__main__':
    # Script entry point: parse the command line and run the cleanup.
    main(parse_options())
173