1f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi#!/usr/bin/env python
2f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi# Copyright 2015 The Chromium Authors. All rights reserved.
3f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi# Use of this source code is governed by a BSD-style license that can be
4f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi# found in the LICENSE file.
5f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
6f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi"""Cleanup orphaned containers.
7f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
8f4cb4da629e984b472c08788557b971d5e1f9df2Dan ShiIf an autoserv process dies without being able to call handler of SIGTERM, the
9f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shicontainer used to run the test will be orphaned. This adds overhead to the
10f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shidrone. This script is used to clean up such containers.
11f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
This module also checks if the test job associated with a container has
finished. If so, kill the autoserv process for the test job and destroy the
container. To avoid a race condition, this only applies to jobs that finished
at least 1 hour ago.
16f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
17f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi"""
18f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
19f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shiimport argparse
20f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shiimport datetime
21f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shiimport logging
22f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shiimport os
23f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shiimport re
24f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shiimport signal
25f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shiimport socket
26f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
27f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shiimport common
28f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shifrom autotest_lib.client.common_lib import logging_config
29f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shifrom autotest_lib.client.common_lib import time_utils
30f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shifrom autotest_lib.client.common_lib import utils
31f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shifrom autotest_lib.client.common_lib.cros.graphite import autotest_stats
32f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shifrom autotest_lib.server.cros.dynamic_suite import frontend_wrappers
33f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shifrom autotest_lib.site_utils import lxc
34f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
35f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
# Shared AFE RPC client, used to look up the host queue entries of test jobs.
# NOTE(review): presumably timeout_min/delay_sec bound how long and how often a
# failed RPC is retried -- confirm against frontend_wrappers.RetryingAFE.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time to declare that a test job is completed and its container is
# orphaned. This avoids a race condition where the scheduler aborts a job while
# autoserv is still in the process of destroying the container it used.
# The value is computed once, when this module is loaded, so "1 hour ago" is
# relative to script start rather than to each individual check.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - datetime.timedelta(hours=1)
41f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
def get_info(container_name):
    """Get job id and autoserv process id from container name.

    A test container name is expected to start with
    `test_<number>_<number>_<number>`. The first number is taken as the job id
    and the third as the id of the autoserv process; the second number is not
    used here.

    @param container_name: Name of the container.

    @return: A tuple (job_id, pid). job_id is an int and pid is the string of
             digits taken from the name (callers convert it when they need an
             integer). (None, None) is returned if the name does not follow
             the test container naming pattern.

    """
    # Raw string so that `\d` reaches the regex engine untouched instead of
    # being treated as an (invalid) string escape sequence.
    match = re.match(r'test_(\d+)_(\d+)_(\d+)', container_name)
    if not match:
        # Container is not created for test, e.g., the base container.
        return None, None
    job_id = int(match.group(1))
    pid = match.group(3)
    return job_id, pid
57f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
58f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
def is_container_orphaned(container):
    """Check if a container is orphaned.

    A container is orphaned if any of these conditions is True:
    1. The autoserv process that created the container is no longer running.
    2. The test job finished at least 1 hour ago.

    A container whose name does not identify a test job is never reported as
    orphaned. When the host queue entries of the job cannot be retrieved, or
    the job has none, the container is conservatively reported as not
    orphaned.

    @param container: A Container object.

    @return: True if the container is orphaned.

    """
    logging.debug('Checking if container is orphaned: %s', container.name)
    job_id, pid = get_info(container.name)
    if not job_id:
        logging.debug('Container %s is not created for test.', container.name)
        return False

    if pid and not utils.pid_is_alive(pid):
        logging.debug('Process with PID %s is not alive, container %s is '
                      'orphaned.', pid, container.name)
        return True

    try:
        hqes = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as e:
        # Best effort: a failed lookup must not lead to destroying a container
        # that may still be in use.
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
        return False

    if not hqes:
        # The job has not run yet.
        return False
    for hqe in hqes:
        if hqe.active or not hqe.complete:
            logging.debug('Test job %s is not completed yet, container %s is '
                          'not orphaned.', job_id, container.name)
            return False
        # Check the finish time of this entry. It must be read from `hqe`;
        # `hqes` is the whole collection of entries and has no `finished_on`.
        if (hqe.finished_on and
            (time_utils.time_string_to_datetime(hqe.finished_on) >
             FINISHED_JOB_CUTOFF_TIME)):
            logging.debug('Test job %s was completed less than an hour ago.',
                          job_id)
            return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, container.name)
    return True
106f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
107f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
def cleanup(container, options):
    """Kill the owning autoserv process and destroy an orphaned container.

    Nothing is changed unless `options.execute` is set; in dry run mode the
    intended action is only logged.

    @param container: A Container object to be cleaned up.
    @param options: Options to do cleanup.

    @return: True if the container was destroyed. False for a dry run or if
             any step of the cleanup raised an error.

    """
    if options.execute:
        try:
            _job_id, autoserv_pid = get_info(container.name)
            # The autoserv process goes first, then the container it used.
            if autoserv_pid and utils.pid_is_alive(autoserv_pid):
                logging.info('Stopping process %s...', autoserv_pid)
                utils.nuke_pid(int(autoserv_pid), (signal.SIGKILL,))

            logging.info('Destroying container %s...', container.name)
            container.destroy()
        except Exception as err:
            logging.error('Failed to cleanup container %s. Error: %s',
                          container.name, err)
            return False
        else:
            return True

    logging.info('dryrun: Cleanup container %s', container.name)
    return False
137f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
def parse_options():
    """Build the command line parser and parse the script arguments.

    @return: Parsed options with attributes `verbose`, `execute` and
             `logfile`.
    """
    # TODO(dshi): Consider to adopt the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    argument_specs = (
        (('-v', '--verbose'),
         dict(action='store_true',
              default=False,
              help='Print out ALL entries.')),
        (('-x', '--execute'),
         dict(action='store_true',
              default=False,
              help=('Execute the actions to kill autoserv processes '
                    'and destroy containers. Default is False to do '
                    'dry run'))),
        (('-l', '--logfile'),
         dict(type=str,
              default=None,
              help='Path to the log file to save logs.')),
    )

    arg_parser = argparse.ArgumentParser()
    for flags, settings in argument_specs:
        arg_parser.add_argument(*flags, **settings)
    return arg_parser.parse_args()
159f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
160f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
def main(options):
    """Scan the container bucket and clean up every orphaned container.

    A file handler is added to the logging config when a log file is given.
    When running with `options.execute`, the numbers of successful and failed
    cleanups are sent as gauge stats keyed by the host name. In a dry run
    every orphaned container is counted as not cleaned, and no stats are sent.

    @param options: Options to run the script.
    """
    log_config = logging_config.LoggingConfig()
    if options.logfile:
        log_path = os.path.abspath(options.logfile)
        if options.verbose:
            log_level = logging.DEBUG
        else:
            log_level = logging.INFO
        log_config.add_file_handler(file_path=log_path, level=log_level)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)

    cleaned = 0
    not_cleaned = 0
    for candidate in bucket.get_all().values():
        if not is_container_orphaned(candidate):
            continue
        if cleanup(candidate, options):
            cleaned += 1
        else:
            not_cleaned += 1

    if options.execute:
        # Dots in the host name are replaced so they do not split the stat key.
        host_label = socket.gethostname().replace('.', '_')
        key = 'container_cleanup.%s' % host_label
        autotest_stats.Gauge(key).send('success', cleaned)
        autotest_stats.Gauge(key).send('failure', not_cleaned)
    logging.info('Cleanup finished.')
188f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
189f4cb4da629e984b472c08788557b971d5e1f9df2Dan Shi
# Parse the command line and run the cleanup only when this file is executed
# as a script, not when it is imported as a module.
if __name__ == '__main__':
    options = parse_options()
    main(options)
193