1#!/usr/bin/python
2# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Orchestrate virtual machines to setup a toy instance of the lab for testing.
7
8This module is meant to help create a closed loop development flow for members
9of the lab team which looks something like this:
10                    ______________
11                   |              |
                   |gs vm registry|<+
13                   |______________| |
14                          |         |
15                          v         |
16        New change -> puppylab -> New core_cluster box
17                          |
18         Vagrantfile specifies cluster settings
19         _________________|____________________
20        |                                      |
21        |  puppet provisions core_cluster box  |
22        |______________________________________|
23                |          | ........... |
24                v          v             v
25              master     shard1       shardn
26             |     |     |     |      |     |
27            mysql  afe  tko heartbt   tko heartbt
28             |     |     |     |      |     |
29host ports  8001  8002  8001  8002    8001  8002
30        [host ports liable to autocorrect as needed]
31
32This module can work with any vm hosting service/provider as long as they
33adhere to the vagrant interface. VirtualBox is the only implementation so
34far, though GCE will be an ideal candidate.
35
36Class spec:
37* VagrantProvisioner: Provision boxes per a VagrantFile.
38    * VirtualBoxProvisioner: Generate a Virtualbox VagrantFile.
39* CoreVM: Manage individual core_cluster vms.
40* ClusterManager: Spin up cluster.
41
42Usage: clusterctl --admin-repo /usr/local/autotest/chromeos-internal
43"""
44
45import argparse
46import logging
47import os
48import sys
49
50import common
51from autotest_lib.puppylab import lab_manifest
52from autotest_lib.puppylab import vm_manager
53from autotest_lib.site_utils.lib import infra
54
55
# TODO: Enable multiple shards via command line args.
# Default number of shards brought up alongside the master.
NUM_SHARDS = 1
# Location of the autotest shadow config inside each cluster vm; read over
# `vagrant ssh` to sanity-check per-vm settings.
SHADOW_PATH = '/usr/local/autotest/shadow_config.ini'
59
60
class ConfigError(Exception):
    """Signals that a vm in the cluster has a bad shadow_config setting."""
63
64
class CoreVM(object):
    """Interface to create and manage a core_cluster vm image.

    A core_cluster vm image has base packages shared by all server roles.
    """
    _core_vm_name = 'chromeos_lab_core_cluster'
    _core_image_source = 'gs://vms/%s.box' % _core_vm_name
    _core_image_name = '%s.box' % _core_vm_name
    _core_image_destination = os.path.join(
            vm_manager.VAGRANT_DIR, _core_image_name)

    # TODO: Preparation of the base box is still a manual process. To build
    # a fresh vm named '_core_image_name' using the CoreClusterTemplate in
    # VAGRANT_DIR:
    # * Copy the CoreClusterTemplate to a Vagrantfile, pointing its
    #   modulepath at your chromeos-admin/puppet directory.
    # * Run `vagrant up` in the directory containing that Vagrantfile.
    # * Once it finishes, run `vagrant package`.
    # This leaves a package.box in the same directory.

    def __init__(self, provisioner):
        self.provisioner = provisioner


    def setup_core_box(self):
        """Setup a core cluster vm.

        Download a core_cluster image if one isn't present on disk and
        register it with vagrant.
        """
        image_missing = not os.path.exists(self._core_image_destination)
        if image_missing:
            fetch_cmd = 'gsutil cp %s %s' % (
                    self._core_image_source, self._core_image_destination)
            infra.execute_command('localhost', fetch_cmd)
        self.provisioner.register_box(
                self._core_image_destination, self._core_vm_name)


    def teardown_core_box(self):
        """Teardown a core cluster vm."""
        # TODO: delete the box file.
        self.provisioner.unregister_box(self._core_vm_name)
108
109
class ClusterManager(object):
    """Interface to spin up a cluster of CoreVMs.

    This class manages all the details between creating a core_cluster image
    and running tests on a full fledged cluster.
    """

    def _register_shards(self, num_shards):
        """Register num_shards worth of shard info.

        This includes the name, port address and board of the new shard. This
        information is piped through to each vm, so the cluster manager is
        actually in control of all the shards in the cluster and can address
        them by name.

        Consider a shard, shard1, assigned to board stumpy:
            * You will be able to ssh into it with 'vagrant ssh stumpyshard'.
            * The afe for the shard will be running on an incrementally
              designated port starting from shards_base_port.
            * The afe port of the shard is piped through to the shadow_config.
              This is required for 2 reasons:
                # `cli/atest shard add` should use this name, because it is
                  the name the shard-client will use to request jobs.
                # the master afe should show links to the shard using this name.

        @param num_shards: The number of shards we wish to add to the cluster.
        """
        self.vagrantfile_shard_args = {}
        self.shard_board_map = {}
        self.vagrant_shard_names = []

        for num in range(1, num_shards+1):
            # The name to use for vagrant ssh
            shard_name = 'shard%s' % num
            # The port for the shard's afe
            shard_port = lab_manifest.shards_base_port + num
            # The hostname to use in the shadow_config of the shard
            shard_hostname = '%s:%s' % (lab_manifest.vm_host_name, shard_port)

            self.vagrantfile_shard_args.update({
                shard_name: shard_name,
                '%s_shadow_config_hostname' % shard_name: shard_hostname,
                '%s_port' % shard_name: shard_port,
            })
            if lab_manifest.shards:
                board = lab_manifest.shards.pop()
                # Assign a board to a shard. Use the shard_hostname as this
                # setting is not meant to be human understandable.
                self.shard_board_map[shard_hostname] = board
                vagrant_shard_name = '%sshard' % board.rsplit(':')[-1]
                # Replace the shard<int>-type-name with board_shard
                self.vagrantfile_shard_args[shard_name] = vagrant_shard_name
                self.vagrant_shard_names.append(vagrant_shard_name)


    def __init__(self, vm_provisioner, vagrant_master_name='master',
                 num_shards=1):
        """Initialize parameters for the cluster.

        @param vm_provisioner: A provisioner object, currently the only one
            supported is VirtualBox.
        @param vagrant_master_name: The name to give the cluster master.
        @param num_shards: The number of shards in the cluster. Each shard
            gets a name allocated based on its number (eg: shard1).
        """
        self.provisioner = vm_provisioner
        self.vm_manager = CoreVM(provisioner=self.provisioner)
        self._register_shards(num_shards)
        self.vagrant_master_name = vagrant_master_name


    def start_cluster(self):
        """Start a cluster."""
        self.vm_manager.setup_core_box()

        # TODO: Add a --rebuild-cluster option.
        needs_destroy = self.provisioner.initialize_vagrant(
                master=self.vagrant_master_name,
                master_port=lab_manifest.master_afe_port,
                **self.vagrantfile_shard_args)
        self.provisioner.provision(needs_destroy)


    def shutdown_cluster(self):
        """Shutdown the current cluster."""
        # TODO: Actually destroy. Halt is useful for debugging.
        self.provisioner.vagrant_cmd('halt')


    def execute_against_vm(self, vm_name, cmd):
        """Execute cmd against vm_name.

        @param cmd: The command to execute.
        @param vm_name: The name of the vm, eg: stumpyshard.

        @return: The output of cmd, with trailing newlines stripped.
        """
        return self.provisioner.vagrant_cmd(
                "ssh %s -- '%s'" % (vm_name, cmd)).rstrip('\n')


    def _get_shadow_config_value(self, vm_name, key):
        """Read the value of a shadow_config setting on a vm.

        @param vm_name: The name of the vm to read the setting from.
        @param key: The name of the shadow_config option.

        @return: The value of the option, as a string.
        """
        cmd = 'grep "^%s:" %s' % (key, SHADOW_PATH)
        shadow_value = self.execute_against_vm(vm_name, cmd)
        return shadow_value.rsplit(':')[-1].lstrip(' ')


    def _check_shadow_config(self, vm, key, expected_value):
        """Sanity check one shadow_config option on a vm.

        @param vm: The name of the vm to check.
        @param key: The shadow_config option to check.
        @param expected_value: The value the option should have.

        @raises ConfigError: If the shadow_config option is misconfigured.
        """
        value = self._get_shadow_config_value(vm, key)
        if value != expected_value:
            raise ConfigError(
                    '%s vm has misconfigured config %s = %s, expected %s' %
                    (vm, key, value, expected_value))
        logging.info('%s has %s = %s', vm, key, value)


    def _upstart_cmd(self, vm, job_name, cmd='status'):
        """Execute an upstart command.

        @param vm: The name of the vm to execute it against.
        @param job_name: The name of the upstart job.
        @param cmd: The upstart command.

        @return: The output of the upstart command, or a message noting the
            service was not found if the command failed on the vm.
        """
        status_cmd = 'sudo %s %s' % (cmd, job_name)
        try:
            return self.execute_against_vm(vm, status_cmd)
        except vm_manager.VagrantCmdError:
            return '%s service not found on %s' % (job_name, vm)


    def check_services(self, action='start'):
        """Get the status of all core services on the vms.

        This method is designed to start services on the master/all
        shards if their shadow configs are as expected. If the shadow
        config option on a vm has an unexpected setting, services
        are not started on it.

        @param action: The action to perform on services. Start will
            start all of them, stop will stop them all.

        @raises ConfigError: If a shadow_config option is unexpected.
        """
        core_services = set(
                ['scheduler', 'host-scheduler',
                 'gs_offloader', 'gs_offloader_s', 'shard-client'])
        # The default-route gateway of the master, as seen from inside the vm;
        # shards reach the master's global db through it.
        gateway = self.execute_against_vm(
                self.vagrant_master_name,
                "netstat -rn | grep \"^0.0.0.0 \" | cut -d \" \" -f10 | head -1"
                ).rstrip('\n')

        for vm in self.vagrant_shard_names + [self.vagrant_master_name]:
            vm_manager.format_msg('Checking services on %s' % vm)
            self._check_shadow_config(vm, 'host', 'localhost')
            global_db = ('localhost' if vm == self.vagrant_master_name
                         else gateway)
            self._check_shadow_config(vm, 'global_db_host', global_db)

            for service in core_services:
                logging.info('Checking %s on %s', service, vm)
                status = self._upstart_cmd(vm, service, action)
                logging.info(status)
276
277
def bringup_cluster(admin_repo, num_shards=NUM_SHARDS, start_safe=False):
    """Start a cluster.

    @param admin_repo: Path to the chromeos-admin repo.
    @param num_shards: Number of shards. You cannot change
        the number of shards on a running cluster, you need
        to destroy the cluster, remove the vagrant file,
        modify the ClusterTemplate to include a new section
        for the additional shard, and rerun clusterctl.
    @param start_safe: Start the cluster in safe mode. This means
        all core services will be stopped.

    @return: 0 on success, 1 if the cluster had to be shut down because a
        vm was misconfigured.

    @raises ValueError: If admin_repo is unset or does not contain the
        puppet module.
    """
    # Fail early with a clear message instead of a TypeError from
    # os.path.join when --admin-repo was not supplied.
    if not admin_repo:
        raise ValueError('Cannot provision without the path to the '
                         'chromeos-admin repo, please use --admin-repo.')
    puppet_path = os.path.join(admin_repo, 'puppet')
    if not os.path.exists(puppet_path):
        raise ValueError('Admin repo %s does not contain puppet module' %
                         admin_repo)
    cluster_manager = ClusterManager(
            vm_provisioner=vm_manager.VirtualBox(puppet_path=puppet_path),
            vagrant_master_name='master', num_shards=num_shards)
    cluster_manager.start_cluster()
    try:
        cluster_manager.check_services(action='stop' if start_safe else 'start')
    except ConfigError as e:
        logging.error(
                'Shutting down cluster: %s', e)
        cluster_manager.shutdown_cluster()
        return 1
    # Return an explicit success status so callers (eg: main -> sys.exit)
    # see a consistent exit code on both paths.
    return 0
305
306
def sync():
    """Sync autotest from the host to all vms in the cluster."""
    vm_manager.format_msg('Syncing Cluster')
    # Both steps stream output so the user can watch rsync/provision progress.
    run_vagrant = vm_manager.VagrantProvisioner.vagrant_cmd
    run_vagrant('rsync', stream_output=True)
    run_vagrant('provision --provision-with shell', stream_output=True)
    vm_manager.format_msg('Please restart services as required')
314
315
316def _parse_args(args):
317    """Parse command line arguments.
318
319    @param args: A list of command line arguments, eg sys.argv[1:]
320
321    @return: A tuple with the parsed args, as returned by parser.parse_args.
322    """
323    if not args:
324        print ('Too few arguments, try clusterctl --help')
325        sys.exit(1)
326
327    description = ('A script to orchestrate a toy test lab. Provided '
328                   'with a path to the internal repo it will download a '
329                   'vm image and spin up a cluster against which you can '
330                   'test core autotest changes without DUTs.')
331    parser = argparse.ArgumentParser(description=description)
332    subparsers = parser.add_subparsers()
333    provision_subparser = subparsers.add_parser(
334            'provision', help='provision a cluster')
335    provision_subparser.required = False
336    provision_subparser.set_defaults(which='provision')
337    provision_subparser.add_argument(
338            '--admin-repo', dest='admin_repo', type=str,
339            help=('Path to the admin repo that has puppet scripts used for '
340                  'provisioning the cluster. If you do not already have it you '
341                  'can git clone the chromeos/chromeos-admin repo.'))
342    provision_subparser.add_argument(
343            '--safe', dest='start_safe', action='store_true',
344            help='If sepcified services will not be started automatically.')
345
346    # TODO: Automate restart of services via a --restart option.
347    update_subparser = subparsers.add_parser('update', help='Update a cluster')
348    update_subparser.required = False
349    update_subparser.set_defaults(which='update')
350    update_subparser.add_argument(
351            '--sync', dest='sync', action='store_true',
352            help='Sync autotest from host to all vms in cluster.')
353    return parser.parse_args(args)
354
355
def main(args):
    """Main function.

    @param args: command line arguments for the script, eg sys.argv[1:].

    @return: An exit status; 0 on success, non-zero on failure.
    """
    args = _parse_args(args)
    if args.which == 'update':
        # The 'update' namespace has no admin_repo/start_safe attributes,
        # so never fall through to bringup_cluster for it (doing so used
        # to crash with an AttributeError).
        if args.sync:
            sync()
            return 0
        logging.error('Nothing to do, try clusterctl update --sync.')
        return 1
    # Propagate bringup_cluster's status so sys.exit reports failures.
    return bringup_cluster(
            admin_repo=args.admin_repo, start_safe=args.start_safe)
367
368
# Script entry point: strip the program name and exit with main's status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
371