#! /usr/bin/python

# Copyright 2017 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
Swarming bot manager running on servers that hold swarming bots.
This manages running swarming bots and routinely recovers any that die.
"""

import argparse
import logging
import signal
import socket
import sys
import time
import urllib2

import common
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.site_utils.chromeos_proxy import swarming_bots

from chromite.lib import metrics
from chromite.lib import ts_mon_config


# Seconds to wait between consecutive bot checks.
CHECK_INTERVAL = 180

# Set to True by handle_signal() to ask the main loop to stop.
_shut_down = False

# Template for the metric names reported by this module.
metrics_template = 'chromeos/autotest/swarming/bot_manager/%s'


def _parse_args(args):
    """Parse system arguments.

    @param args: A list of command line arguments (without program name).

    @return: The argparse namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser(
            description='Manage the set of swarming bots running on a server')
    parser.add_argument('afe', type=str,
                        help='AFE to get server role and status.')
    # TODO(xixuan): refactor together with swarming_bots.
    parser.add_argument(
            'id_range', type=str,
            help='A range of integer, each bot created will be labeled '
                 'with an id from this range. E.g. "1-200"')
    parser.add_argument(
            'working_dir', type=str,
            help='A working directory where bots will store files '
                 'generated at runtime')
    parser.add_argument(
            '-p', '--swarming_proxy', type=str, dest='swarming_proxy',
            default=swarming_bots.DEFAULT_SWARMING_PROXY,
            help='The URL of the swarming instance to talk to, '
                 'Default to the one specified in global config')
    parser.add_argument(
            '-f', '--log_file', dest='log_file',
            help='Path to the log file.')
    parser.add_argument(
            '-v', '--verbose', dest='verbose', action='store_true',
            help='Verbose mode')

    return parser.parse_args(args)


def handle_signal(signum, frame):
    """Function called when being killed.

    Flags the main loop to stop after the current iteration.

    @param signum: The signal received.
    @param frame: Ignored.
    """
    # Without this declaration the assignment below would only bind a local
    # name and the main loop would never observe the shutdown request.
    global _shut_down

    del signum
    del frame

    _shut_down = True


def is_server_in_prod(server_name, afe):
    """Validate server's role and status.

    Also increments the 'server_in_prod_check' counter with the outcome.

    @param server_name: the server name to be validated.
    @param afe: the afe server to get role & status info in server_db.

    @return: A boolean value, True when the server_name is in prod, False
             otherwise, or if RPC fails.
    """
    logging.info('Validating server: %s', server_name)
    # Keep the AFE hostname in `afe` so that the failure log below names the
    # server instead of printing the wrapper object.
    afe_client = frontend_wrappers.RetryingAFE(timeout_min=5, delay_sec=10,
                                               server=afe)
    is_prod_proxy_server = False
    try:
        if afe_client.run('get_servers', hostname=server_name,
                          status='primary', role='golo_proxy'):
            is_prod_proxy_server = True

    except urllib2.URLError as e:
        logging.warning('RPC get_servers failed on afe %s: %s', afe, e)
    finally:
        # Report the outcome whether the RPC succeeded, failed or raised.
        metrics.Counter(metrics_template % 'server_in_prod_check').increment(
                fields={'success': is_prod_proxy_server})

    return is_prod_proxy_server


@metrics.SecondsTimerDecorator(metrics_template % 'tick')
def tick(afe, bot_manager):
    """One tick for swarming bot manager.

    Bots are only checked when this host is validated as a prod server.

    @param afe: the afe to check server role.
    @param bot_manager: a swarming_bots.BotManager instance.
    """
    if is_server_in_prod(socket.getfqdn(), afe):
        bot_manager.check()


def main(args):
    """Main func.

    Installs SIGINT/SIGTERM handlers and runs tick() every CHECK_INTERVAL
    seconds until a shutdown is requested.

    @param args: A list of system arguments.

    @return: 1 if no swarming proxy is specified, 0 after a clean shutdown.
    """
    args = _parse_args(args)
    swarming_bots.setup_logging(args.verbose, args.log_file)

    if not args.swarming_proxy:
        logging.error(
                'No swarming proxy instance specified. '
                'Specify swarming_proxy in [CROS] in shadow_config, '
                'or use --swarming_proxy')
        return 1

    # Make sure the proxy URL carries the https scheme.
    swarming_proxy = args.swarming_proxy
    if not swarming_proxy.startswith('https://'):
        swarming_proxy = 'https://' + swarming_proxy

    logging.info("Setting signal handler.")
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    # Pass the normalized URL; previously it was computed but never used.
    bot_manager = swarming_bots.BotManager(
            swarming_bots.parse_range(args.id_range),
            args.working_dir,
            swarming_proxy)
    with ts_mon_config.SetupTsMonGlobalState('swarming_bots', indirect=True):
        while not _shut_down:
            tick(args.afe, bot_manager)
            # Skip the wait if a shutdown was requested during the tick.
            if not _shut_down:
                time.sleep(CHECK_INTERVAL)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))