#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Cleanup orphaned containers.

If an autoserv process dies without being able to call its SIGTERM handler,
the container used to run the test will be orphaned. This adds overhead to the
drone. This script is used to clean up such containers.

This module also checks if the test job associated with a container has
finished. If so, it kills the autoserv process for the test job and destroys
the container. To avoid a race condition, this only applies to jobs that
finished at least 1 hour ago.

"""

import argparse
import datetime
import logging
import os
import re
import signal
import socket

import common
from autotest_lib.client.common_lib import logging_config
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.site_utils import lxc


AFE = frontend_wrappers.RetryingAFE(timeout_min=0.1, delay_sec=10)
# The cutoff time to declare a test job is completed and container is orphaned.
# This is to avoid a race condition that scheduler aborts a job and autoserv
# is still in the process of destroying the container it used.
FINISHED_JOB_CUTOFF_TIME = datetime.datetime.now() - datetime.timedelta(hours=1)

def get_info(container_name):
    """Get job id and autoserv process id from container name.

    The name is matched against the prefix pattern test_<N>_<N>_<N>; the first
    number is returned as the job id (int) and the third as the process id
    (kept as a string, exactly as it appears in the name).

    @param container_name: Name of the container.

    @return: job id and autoserv process id for the given container name, or
             (None, None) if the name does not match the test pattern.

    """
    match = re.match(r'test_(\d+)_(\d+)_(\d+)', container_name)
    if not match:
        # Container is not created for test, e.g., the base container.
        return None, None
    job_id = int(match.group(1))
    pid = match.group(3)
    return job_id, pid


def is_container_orphaned(container):
    """Check if a container is orphaned.

    A container is orphaned if any of these condition is True:
    1. The autoserv process created the container is no longer running.
    2. The test job is finished at least 1 hour ago.

    @param container: A Container object.

    @return: True if the container is orphaned. False if the container was
             not created for a test, the job's host queue entries cannot be
             retrieved, the job has no entries yet, or any entry is still
             running or finished after FINISHED_JOB_CUTOFF_TIME.

    """
    logging.debug('Checking if container is orphaned: %s', container.name)
    job_id, pid = get_info(container.name)
    if not job_id:
        logging.debug('Container %s is not created for test.', container.name)
        return False

    if pid and not utils.pid_is_alive(pid):
        logging.debug('Process with PID %s is not alive, container %s is '
                      'orphaned.', pid, container.name)
        return True

    try:
        hqes = AFE.get_host_queue_entries(job_id=job_id)
    except Exception as e:
        # Best effort: if the AFE cannot be reached, leave the container
        # alone rather than risk destroying one that is still in use.
        logging.error('Failed to get hqe for job %s. Error: %s.', job_id, e)
        return False

    if not hqes:
        # The job has not run yet.
        return False
    for hqe in hqes:
        if hqe.active or not hqe.complete:
            logging.debug('Test job %s is not completed yet, container %s is '
                          'not orphaned.', job_id, container.name)
            return False
        # Read finished_on from the individual entry; the list of entries
        # itself has no such attribute.
        if (hqe.finished_on and
            (time_utils.time_string_to_datetime(hqe.finished_on) >
             FINISHED_JOB_CUTOFF_TIME)):
            logging.debug('Test job %s was completed less than an hour ago.',
                          job_id)
            return False

    logging.debug('Test job %s was completed, container %s is orphaned.',
                  job_id, container.name)
    return True


def cleanup(container, options):
    """Cleanup orphaned container.

    In dry-run mode (options.execute is False) nothing is killed or destroyed;
    the intended action is only logged and False is returned.

    @param container: A Container object to be cleaned up.
    @param options: Options to do cleanup.

    @return: True if cleanup is successful. False otherwise.

    """
    if not options.execute:
        logging.info('dryrun: Cleanup container %s', container.name)
        return False

    try:
        _, pid = get_info(container.name)
        # Kill autoserv process
        if pid and utils.pid_is_alive(pid):
            logging.info('Stopping process %s...', pid)
            utils.nuke_pid(int(pid), (signal.SIGKILL,))

        # Destroy container
        logging.info('Destroying container %s...', container.name)
        container.destroy()
        return True
    except Exception as e:
        logging.error('Failed to cleanup container %s. Error: %s',
                      container.name, e)
        return False


def parse_options():
    """Parse command line inputs.

    @return: Options to run the script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--verbose', action='store_true',
                        default=False,
                        help='Print out ALL entries.')
    parser.add_argument('-x', '--execute', action='store_true',
                        default=False,
                        help=('Execute the actions to kill autoserv processes '
                              'and destroy containers. Default is False to do '
                              'dry run'))
    # TODO(dshi): Consider to adopt the scheduler log model:
    # 1. Create one log per run.
    # 2. Create a symlink to the latest log.
    parser.add_argument('-l', '--logfile', type=str,
                        default=None,
                        help='Path to the log file to save logs.')
    return parser.parse_args()


def main(options):
    """Main script.

    Walks every container in the container bucket, cleans up the ones that
    are orphaned, and (only when options.execute is set) reports the success
    and failure counts as gauges keyed by this host's name.

    @param options: Options to run the script.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    success_count = 0
    failure_count = 0
    for container in bucket.get_all().values():
        if is_container_orphaned(container):
            if cleanup(container, options):
                success_count += 1
            else:
                failure_count += 1
    if options.execute:
        key = 'container_cleanup.%s' % socket.gethostname().replace('.', '_')
        autotest_stats.Gauge(key).send('success', success_count)
        autotest_stats.Gauge(key).send('failure', failure_count)
    logging.info('Cleanup finished.')


if __name__ == '__main__':
    # Script entry point: parse the command line and run the cleanup.
    options = parse_options()
    main(options)