#!/usr/bin/python
#pylint: disable-msg=C0111

# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Source: shard_client.py revision 5949b4af7a872aeb58e7ad29090812d648725ed5

"""
Autotest shard client

The shard client can be run as standalone service. It periodically polls the
master in a heartbeat, retrieves new jobs and hosts and inserts them into the
local database.

A shard is set up (by a human) and pointed to the global AFE (cautotest).
On the shard, this script periodically makes so called heartbeat requests to the
global AFE, which will then complete the following actions:

1. Find the previously created (with atest) record for the shard. Shards are
   identified by their hostnames, specified in the shadow_config.
2. Take the records that were sent in the heartbeat and insert them into the
   global database.
   - This is to set the status of jobs to completed in the master database after
     they were run by a slave. This is necessary so one can just look at the
     master's afe to see the statuses of all jobs. Otherwise one would have to
     check the tko tables or the individual slave AFEs.
3. Find labels that have been assigned to this shard.
4. Assign hosts that:
   - have the specified label
   - aren't leased
   - have an id which is not in the known_host_ids which were sent in the
     heartbeat request.
5. Assign jobs that:
   - depend on the specified label
   - haven't been assigned before
   - aren't started yet
   - aren't completed yet
   - have an id which is not in the known_job_ids which were sent in the
     heartbeat request.
6. Serialize the chosen jobs and hosts.
   - Find objects that the Host/Job objects depend on: Labels, AclGroups, Users,
     and many more. Details about this can be found around
     model_logic.serialize()
7. Send these objects to the slave.


On the client side, this will happen:
1. Deserialize the objects sent from the master and persist them to the local
   database.
2. monitor_db on the shard will pick up these jobs and schedule them on the
   available hosts (which were retrieved from a heartbeat).
3. Once a job is finished, its shard_id is set to NULL
4. The shard_client will pick up all jobs where shard_id=NULL and will
   send them to the master in the request of the next heartbeat.
   - The master will persist them as described earlier.
   - the shard_id will be set back to the shard's id, so the record won't be
     uploaded again.
   The heartbeat request will also contain the ids of incomplete jobs and the
   ids of all valid hosts. This is used to not send objects repeatedly. For
   more information on this and alternatives considered
   see site_rpc_interface.shard_heartbeat.
"""

import argparse
import datetime
import logging
import os
import signal
import socket
import time

# NOTE: `common` must be imported before any autotest_lib module, and
# setup_django_environment before the AFE models; do not reorder.
import common
from autotest_lib.frontend import setup_django_environment
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib.cros.graphite import stats
from autotest_lib.frontend.afe import models, rpc_utils
from autotest_lib.scheduler import email_manager
from autotest_lib.scheduler import scheduler_lib
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers


# Name of the RPC on the global AFE that implements the heartbeat.
HEARTBEAT_AFE_ENDPOINT = 'shard_heartbeat'

# Retry settings handed to frontend_wrappers.RetryingAFE.
RPC_TIMEOUT_MIN = 5
RPC_DELAY_SEC = 5

# Prefix under which all statistics of this shard client are reported.
STATS_KEY = 'shard_client.%s' % socket.gethostname()
timer = stats.Timer(STATS_KEY)

# The running ShardClient. Set by main_without_exception_handling(); stays None
# until then so that handle_signal() can tell whether a client exists yet.
_heartbeat_client = None


class ShardClient(object):
    """Performs client side tasks of sharding, i.e. the heartbeat.

    This class contains the logic to do periodic heartbeats to a global AFE,
    to retrieve new jobs from it and to report completed jobs back.
    """

    def __init__(self, global_afe_hostname, shard_hostname, tick_pause_sec):
        """Create a shard client.

        @param global_afe_hostname: Hostname of the global AFE to send the
                                    heartbeat RPCs to.
        @param shard_hostname: Hostname identifying this shard.
        @param tick_pause_sec: Seconds to sleep between two ticks in loop().
        """
        self.afe = frontend_wrappers.RetryingAFE(
                server=global_afe_hostname,
                timeout_min=RPC_TIMEOUT_MIN,
                delay_sec=RPC_DELAY_SEC)
        self.hostname = shard_hostname
        self.tick_pause_sec = tick_pause_sec
        self._shutdown = False
        # Lazily filled cache for the `shard` property.
        self._shard = None


    @timer.decorate
    def process_heartbeat_response(self, heartbeat_response):
        """Save objects returned by a heartbeat to the local database.

        This deserializes hosts and jobs including their dependencies and saves
        them to the local database.

        @param heartbeat_response: A dictionary with keys 'hosts' and 'jobs',
                                   as returned by the `shard_heartbeat` rpc
                                   call.
        """
        hosts_serialized = heartbeat_response['hosts']
        jobs_serialized = heartbeat_response['jobs']

        stats.Gauge(STATS_KEY).send(
                'hosts_received', len(hosts_serialized))
        stats.Gauge(STATS_KEY).send(
                'jobs_received', len(jobs_serialized))

        # Persisting is automatically done inside deserialize
        for host in hosts_serialized:
            models.Host.deserialize(host)
        for job in jobs_serialized:
            models.Job.deserialize(job)


    @property
    def shard(self):
        """Return this shard's own shard object, fetched from the database.

        A shard's object is fetched from the master with the first jobs. It will
        not exist before that time. Once found, the object is cached on the
        instance.

        @returns: The shard object if it already exists, otherwise None
        """
        if self._shard is None:
            try:
                self._shard = models.Shard.smart_get(self.hostname)
            except models.Shard.DoesNotExist:
                # This might happen before any jobs are assigned to this shard.
                # This is okay because then there is nothing to offload anyway.
                pass
        return self._shard


    def _get_jobs_to_upload(self):
        """Return the local jobs that still have to be reported to the master.

        @returns: List of Job objects whose shard field is None.
        """
        # The scheduler sets shard to None upon completion of the job.
        # For more information on the shard field's semantic see
        # models.Job.shard.
        job_ids = list(models.Job.objects.filter(
                shard=None).values_list('pk', flat=True))
        return list(models.Job.objects.filter(pk__in=job_ids))


    def _mark_jobs_as_uploaded(self, jobs):
        """Set the shard field of the given jobs back to this shard.

        @param jobs: Job objects that were sent to the master.
        """
        job_ids = [job.id for job in jobs]
        # self.shard might be None if no jobs were downloaded yet.
        # But then job_ids is empty, so this is harmless.
        # Even if there were jobs we'd in the worst case upload them twice.
        models.Job.objects.filter(pk__in=job_ids).update(shard=self.shard)


    def _get_hqes_for_jobs(self, jobs):
        """Collect the host queue entries belonging to the given jobs.

        @param jobs: Iterable of Job objects.

        @returns: Flat list of all entries in each job's hostqueueentry_set.
        """
        hqes = []
        for job in jobs:
            hqes.extend(job.hostqueueentry_set.all())
        return hqes


    def _get_known_ids(self):
        """Returns lists of host and job ids to send in a heartbeat.

        The host and job ids are ids of objects that are already present on the
        shard and therefore don't need to be sent again.

        For jobs, only incomplete jobs are sent, as the master won't send
        already completed jobs anyway. This helps keeping the list of ids
        considerably small.

        @returns: Tuple of two lists. The first one contains job ids, the
                  second one host ids.
        """
        # Filtering across the hostqueueentry relation yields one row per
        # matching entry; distinct() keeps a job with several incomplete
        # entries from being listed more than once.
        job_ids = list(models.Job.objects.filter(
                hostqueueentry__complete=False).values_list(
                        'id', flat=True).distinct())
        host_ids = list(models.Host.objects.filter(
                invalid=0).values_list('id', flat=True))
        return job_ids, host_ids


    @timer.decorate
    def do_heartbeat(self):
        """Perform a heartbeat: Retrieve new jobs.

        This function executes a `shard_heartbeat` RPC. It retrieves the
        response of this call and processes the response by storing the returned
        objects in the local database.
        """
        logging.info("Performing heartbeat.")

        known_job_ids, known_host_ids = self._get_known_ids()

        jobs = self._get_jobs_to_upload()
        hqes = self._get_hqes_for_jobs(jobs)

        # See site_rpc_interface.shard_heartbeat for explanations on the params.
        response = self.afe.run(
                HEARTBEAT_AFE_ENDPOINT, shard_hostname=self.hostname,
                known_job_ids=known_job_ids,
                known_host_ids=known_host_ids,
                jobs=[job.serialize(include_dependencies=False)
                      for job in jobs],
                hqes=[hqe.serialize(include_dependencies=False)
                      for hqe in hqes])

        # Only mark the jobs once the RPC has returned; if it raises, the jobs
        # keep shard=None and are picked up again by the next heartbeat.
        self._mark_jobs_as_uploaded(jobs)
        self.process_heartbeat_response(response)
        logging.info("Heartbeat completed.")


    def tick(self):
        """Performs all tasks the shard clients needs to do periodically."""
        self.do_heartbeat()


    def loop(self):
        """Calls tick() until shutdown() is called."""
        while not self._shutdown:
            self.tick()
            if self._shutdown:
                # Shutdown was requested while ticking; don't sit through
                # another pause before exiting.
                break
            time.sleep(self.tick_pause_sec)


    def shutdown(self):
        """Stops the shard client after the current tick."""
        logging.info("Shutdown request received.")
        self._shutdown = True


def handle_signal(signum, frame):
    """Signal handler so we don't crash mid-tick.

    Asks the running shard client to stop after its current tick. If the
    signal arrives before the client has been created there is no tick to
    finish, so the process exits right away.

    @param signum: Number of the received signal.
    @param frame: Current stack frame (unused).
    """
    client = _heartbeat_client
    if client is None:
        raise SystemExit(
                'Received signal %d before the shard client was started.' %
                signum)
    client.shutdown()


def _get_global_afe_hostname():
    """Read the hostname of the global AFE from the global configuration."""
    return global_config.global_config.get_config_value(
            'SHARD', 'global_afe_hostname')


def _get_shard_hostname_and_ensure_running_on_shard():
    """Read the hostname of the local shard from the global configuration.

    Raise an exception if run from elsewhere than a shard.

    @returns: The configured shard hostname.

    @raises error.HeartbeatOnlyAllowedInShardModeException if run from
            elsewhere than from a shard.
    """
    hostname = global_config.global_config.get_config_value(
            'SHARD', 'shard_hostname', default=None)
    if not hostname:
        raise error.HeartbeatOnlyAllowedInShardModeException(
                'To run the shard client, shard_hostname must neither be None nor '
                'empty.')
    return hostname


def _get_tick_pause_sec():
    """Read pause to make between two ticks from the global configuration."""
    return global_config.global_config.get_config_value(
            'SHARD', 'heartbeat_pause_sec', type=float)


def get_shard_client():
    """Instantiate a shard client instance.

    Configuration values will be read from the global configuration.

    @returns A shard client instance.
    """
    global_afe_hostname = _get_global_afe_hostname()
    shard_hostname = _get_shard_hostname_and_ensure_running_on_shard()
    tick_pause_sec = _get_tick_pause_sec()
    return ShardClient(global_afe_hostname, shard_hostname, tick_pause_sec)


def main():
    """Run the shard client, reporting uncaught exceptions before exiting.

    Uncaught exceptions are logged, counted and handed to the email manager,
    then re-raised. Queued emails are sent in any case.
    """
    try:
        # The '.' keeps the counter name separate from the hostname that
        # STATS_KEY ends with.
        stats.Counter(STATS_KEY + '.starts').increment()
        main_without_exception_handling()
    except Exception:
        message = 'Uncaught exception; terminating shard_client.'
        email_manager.manager.log_stacktrace(message)
        logging.exception(message)
        stats.Counter(STATS_KEY + '.uncaught_exceptions').increment()
        raise
    finally:
        email_manager.manager.send_queued_emails()


def main_without_exception_handling():
    """Set up logging and signal handling, then run the heartbeat loop."""
    # No options are defined; parsing still provides --help and rejects
    # unexpected arguments.
    parser = argparse.ArgumentParser(description='Shard client.')
    parser.parse_args()

    scheduler_lib.setup_logging(
            os.environ.get('AUTOTEST_SCHEDULER_LOG_DIR', None),
            None, timestamped_logfile_prefix='shard_client')

    logging.info("Setting signal handler.")
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    logging.info("Starting shard client.")
    global _heartbeat_client
    _heartbeat_client = get_shard_client()
    _heartbeat_client.loop()


if __name__ == '__main__':
    main()