1ca812ff8137186b7a7979a4647b501cda3914b7exixuan#! /usr/bin/python
2ca812ff8137186b7a7979a4647b501cda3914b7exixuan
3ca812ff8137186b7a7979a4647b501cda3914b7exixuan# Copyright 2017 The Chromium OS Authors. All rights reserved.
4ca812ff8137186b7a7979a4647b501cda3914b7exixuan# Use of this source code is governed by a BSD-style license that can be
5ca812ff8137186b7a7979a4647b501cda3914b7exixuan# found in the LICENSE file.
6ca812ff8137186b7a7979a4647b501cda3914b7exixuan
7ca812ff8137186b7a7979a4647b501cda3914b7exixuan"""
8ca812ff8137186b7a7979a4647b501cda3914b7exixuanSwarming bot manager running on servers that hold swarming bots.
9ca812ff8137186b7a7979a4647b501cda3914b7exixuanThis manages running swarming bots and routinely recovers any that die.
10ca812ff8137186b7a7979a4647b501cda3914b7exixuan"""
11ca812ff8137186b7a7979a4647b501cda3914b7exixuan
12ca812ff8137186b7a7979a4647b501cda3914b7exixuanimport argparse
13ca812ff8137186b7a7979a4647b501cda3914b7exixuanimport logging
14ca812ff8137186b7a7979a4647b501cda3914b7exixuanimport signal
15ca812ff8137186b7a7979a4647b501cda3914b7exixuanimport socket
16ca812ff8137186b7a7979a4647b501cda3914b7exixuanimport sys
17ca812ff8137186b7a7979a4647b501cda3914b7exixuanimport time
18ca812ff8137186b7a7979a4647b501cda3914b7exixuanimport urllib2
19ca812ff8137186b7a7979a4647b501cda3914b7exixuan
20ca812ff8137186b7a7979a4647b501cda3914b7exixuanimport common
21ca812ff8137186b7a7979a4647b501cda3914b7exixuanfrom autotest_lib.server.cros.dynamic_suite import frontend_wrappers
22ca812ff8137186b7a7979a4647b501cda3914b7exixuanfrom autotest_lib.site_utils.chromeos_proxy import swarming_bots
23ca812ff8137186b7a7979a4647b501cda3914b7exixuan
24a91735362bc0572e011651dcb0fba66068cfae57xixuanfrom chromite.lib import metrics
25a168ce8e2e899e47456e3fcae79141b5465a2b63xixuanfrom chromite.lib import ts_mon_config
26a91735362bc0572e011651dcb0fba66068cfae57xixuan
27a91735362bc0572e011651dcb0fba66068cfae57xixuan
# Number of seconds the main loop sleeps between consecutive bot checks.
CHECK_INTERVAL = 180

# Shutdown flag polled by the main loop; the loop exits once it is True.
_shut_down = False

# Template for metric paths; '%s' is replaced with the individual metric name.
metrics_template = 'chromeos/autotest/swarming/bot_manager/%s'
34a91735362bc0572e011651dcb0fba66068cfae57xixuan
def _parse_args(args):
    """Build the command-line parser and parse the given arguments.

    @param args: A list of command-line arguments (without the program name).

    @return: The argparse namespace holding afe, id_range, working_dir,
             swarming_proxy, log_file and verbose.
    """
    arg_parser = argparse.ArgumentParser(
            description='Manage the set of swarming bots running on a server')

    # Required positional arguments, registered in command-line order.
    # TODO(xixuan): refactor together with swarming_bots.
    positional_help = [
            ('afe', 'AFE to get server role and status.'),
            ('id_range',
             'A range of integer, each bot created will be labeled '
             'with an id from this range. E.g. "1-200"'),
            ('working_dir',
             'A working directory where bots will store files '
             'generated at runtime'),
    ]
    for arg_name, help_text in positional_help:
        arg_parser.add_argument(arg_name, type=str, help=help_text)

    # Optional flags.
    arg_parser.add_argument(
            '-p', '--swarming_proxy', type=str, dest='swarming_proxy',
            default=swarming_bots.DEFAULT_SWARMING_PROXY,
            help='The URL of the swarming instance to talk to, '
                 'Default to the one specified in global config')
    arg_parser.add_argument(
            '-f', '--log_file', dest='log_file',
            help='Path to the log file.')
    arg_parser.add_argument(
            '-v', '--verbose', dest='verbose', action='store_true',
            help='Verbose mode')

    parsed = arg_parser.parse_args(args)
    return parsed
63ca812ff8137186b7a7979a4647b501cda3914b7exixuan
64ca812ff8137186b7a7979a4647b501cda3914b7exixuan
def handle_signal(signum, frame):
    """Signal handler that requests a shutdown of the manager.

    Sets the module-level _shut_down flag to True. Without the global
    declaration the assignment would only create a local variable and
    the module-level flag would never change.

    @param signum: The signal received. Ignored.
    @param frame: The current stack frame. Ignored.
    """
    global _shut_down

    del signum
    del frame

    _shut_down = True
75ca812ff8137186b7a7979a4647b501cda3914b7exixuan
76ca812ff8137186b7a7979a4647b501cda3914b7exixuan
def is_server_in_prod(server_name, afe):
    """Validate server's role and status.

    Asks the AFE whether server_name is registered as a primary golo_proxy
    server, and records the outcome in the 'server_in_prod_check' counter.

    @param server_name: the server name to be validated.
    @param afe: the afe server to get role & status info in server_db.

    @return: A boolean value, True when the server_name is in prod, False
             otherwise, or if RPC fails.
    """
    logging.info('Validating server: %s', server_name)
    # Keep the client under its own name so that 'afe' still refers to the
    # server name when it is reported in log messages below.
    afe_client = frontend_wrappers.RetryingAFE(timeout_min=5, delay_sec=10,
                                               server=afe)
    is_prod_proxy_server = False
    try:
        is_prod_proxy_server = bool(
                afe_client.run('get_servers', hostname=server_name,
                               status='primary', role='golo_proxy'))
    except urllib2.URLError as e:
        logging.warning('RPC get_servers failed on afe %s: %s', afe, e)
    except Exception:
        # Stay best-effort (report "not in prod"), but leave a traceback in
        # the log instead of discarding the error silently.
        logging.exception('Unexpected error in RPC get_servers on afe %s',
                          afe)
    finally:
        # Record the outcome on every path. There is deliberately no return
        # here: returning from a finally clause would also suppress
        # SystemExit and KeyboardInterrupt.
        metrics.Counter(metrics_template % 'server_in_prod_check').increment(
                fields={'success': is_prod_proxy_server})

    return is_prod_proxy_server
101ca812ff8137186b7a7979a4647b501cda3914b7exixuan
102ca812ff8137186b7a7979a4647b501cda3914b7exixuan
@metrics.SecondsTimerDecorator(metrics_template % 'tick')
def tick(afe, bot_manager):
    """Run a single iteration of the swarming bot manager.

    The bots are checked only when this host passes the prod check against
    the given AFE; otherwise the iteration does nothing.

    @param afe: the afe to check server role.
    @param bot_manager: a swarming_bots.BotManager instance.
    """
    this_host = socket.getfqdn()
    if not is_server_in_prod(this_host, afe):
        return

    bot_manager.check()
112bf336904155b5613a4764f91a9024eec47fce1a7xixuan
113bf336904155b5613a4764f91a9024eec47fce1a7xixuan
def main(args):
    """Main func.

    Parses the arguments, installs the shutdown signal handlers and then
    runs tick() every CHECK_INTERVAL seconds until a shutdown is requested.

    @param args: A list of system arguments.

    @return: 1 if no swarming proxy is specified, 0 once the loop has exited.
    """
    args = _parse_args(args)
    swarming_bots.setup_logging(args.verbose, args.log_file)

    if not args.swarming_proxy:
        logging.error(
                'No swarming proxy instance specified. '
                'Specify swarming_proxy in [CROS] in shadow_config, '
                'or use --swarming_proxy')
        return 1

    # Make sure the proxy URL carries the https scheme.
    swarming_proxy = args.swarming_proxy
    if not swarming_proxy.startswith('https://'):
        swarming_proxy = 'https://' + swarming_proxy

    logging.info("Setting signal handler.")
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    # Hand the normalized URL to the bot manager; the raw argument may lack
    # the scheme.
    bot_manager = swarming_bots.BotManager(
            swarming_bots.parse_range(args.id_range),
            args.working_dir,
            swarming_proxy)
    with ts_mon_config.SetupTsMonGlobalState('swarming_bots', indirect=True):
        # _shut_down is only read here, so no global declaration is needed.
        while not _shut_down:
            tick(args.afe, bot_manager)
            time.sleep(CHECK_INTERVAL)

    return 0
149ca812ff8137186b7a7979a4647b501cda3914b7exixuan
150ca812ff8137186b7a7979a4647b501cda3914b7exixuan
# Script entry point: run main() on the command-line arguments and use its
# return value as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv[1:]))
153