13b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich#!/usr/bin/python
23b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich#pylint: disable-msg=C0111
33b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
43b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
53b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich# Use of this source code is governed by a BSD-style license that can be
63b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich# found in the LICENSE file.
73b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
83b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichimport argparse
98a43715afb478fa8be16187374618be33ff49442MK Ryuimport httplib
103b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichimport logging
113b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichimport os
12dfd1f5279eb70b1936b97e19bf8e8d6bf7210118Aviv Keshetimport random
133b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichimport signal
143b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichimport time
158a43715afb478fa8be16187374618be33ff49442MK Ryuimport urllib2
163b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
173b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichimport common
18eedcb8b81de7a686746b342be4732d25ccbfb955Paul Hobbs
193b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichfrom autotest_lib.frontend import setup_django_environment
208a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshetfrom autotest_lib.frontend.afe.json_rpc import proxy
213b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichfrom autotest_lib.client.common_lib import error
223b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichfrom autotest_lib.client.common_lib import global_config
2322dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanianfrom autotest_lib.frontend.afe import models
243b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichfrom autotest_lib.scheduler import email_manager
255949b4af7a872aeb58e7ad29090812d648725ed5Prashanth Balasubramanianfrom autotest_lib.scheduler import scheduler_lib
268421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelichfrom autotest_lib.server.cros.dynamic_suite import frontend_wrappers
270cb2a3b1d2d86d70da06a3f45be9297139e48207Fang Dengfrom autotest_lib.server import utils as server_utils
2889cca5d6ef8df35f1b294b16bf536a8f3ffb5efbMK Ryufrom chromite.lib import timeout_util
2975be1d3f881ef4f4f9cffe0c38fc3139338d8f84Prashanth Balasubramanianfrom django.db import transaction
303b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
315e2efb71ffebead22aa4f0744ad843ee79814b43Dan Shitry:
325e2efb71ffebead22aa4f0744ad843ee79814b43Dan Shi    from chromite.lib import metrics
335e2efb71ffebead22aa4f0744ad843ee79814b43Dan Shi    from chromite.lib import ts_mon_config
345e2efb71ffebead22aa4f0744ad843ee79814b43Dan Shiexcept ImportError:
355e2efb71ffebead22aa4f0744ad843ee79814b43Dan Shi    metrics = server_utils.metrics_mock
365e2efb71ffebead22aa4f0744ad843ee79814b43Dan Shi    ts_mon_config = server_utils.metrics_mock
375e2efb71ffebead22aa4f0744ad843ee79814b43Dan Shi
385e2efb71ffebead22aa4f0744ad843ee79814b43Dan Shi
393b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich"""
403b27dbc2358aef655e050a92510ff8e9e080bf81Jakob JuelichAutotest shard client
413b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
423b27dbc2358aef655e050a92510ff8e9e080bf81Jakob JuelichThe shard client can be run as standalone service. It periodically polls the
433b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichmaster in a heartbeat, retrieves new jobs and hosts and inserts them into the
443b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichlocal database.
453b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
463b27dbc2358aef655e050a92510ff8e9e080bf81Jakob JuelichA shard is set up (by a human) and pointed to the global AFE (cautotest).
473b27dbc2358aef655e050a92510ff8e9e080bf81Jakob JuelichOn the shard, this script periodically makes so called heartbeat requests to the
483b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichglobal AFE, which will then complete the following actions:
493b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
503b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich1. Find the previously created (with atest) record for the shard. Shards are
513b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   identified by their hostnames, specified in the shadow_config.
523b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich2. Take the records that were sent in the heartbeat and insert them into the
533b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   global database.
543b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   - This is to set the status of jobs to completed in the master database after
553b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich     they were run by a slave. This is necessary so one can just look at the
563b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich     master's afe to see the statuses of all jobs. Otherwise one would have to
573b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich     check the tko tables or the individual slave AFEs.
583b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich3. Find labels that have been assigned to this shard.
591b52574752be108a743d3b33561c34324f8538e7Jakob Juelich4. Assign hosts that:
601b52574752be108a743d3b33561c34324f8538e7Jakob Juelich   - have the specified label
611b52574752be108a743d3b33561c34324f8538e7Jakob Juelich   - aren't leased
621b52574752be108a743d3b33561c34324f8538e7Jakob Juelich   - have an id which is not in the known_host_ids which were sent in the
631b52574752be108a743d3b33561c34324f8538e7Jakob Juelich     heartbeat request.
641b52574752be108a743d3b33561c34324f8538e7Jakob Juelich5. Assign jobs that:
653b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   - depend on the specified label
663b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   - haven't been assigned before
673b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   - aren't started yet
683b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   - aren't completed yet
691b52574752be108a743d3b33561c34324f8538e7Jakob Juelich   - have an id which is not in the jobs_known_ids which were sent in the
701b52574752be108a743d3b33561c34324f8538e7Jakob Juelich     heartbeat request.
711b52574752be108a743d3b33561c34324f8538e7Jakob Juelich6. Serialize the chosen jobs and hosts.
723b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   - Find objects that the Host/Job objects depend on: Labels, AclGroups, Users,
733b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich     and many more. Details about this can be found around
743b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich     model_logic.serialize()
753b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich7. Send these objects to the slave.
763b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
773b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
783b27dbc2358aef655e050a92510ff8e9e080bf81Jakob JuelichOn the client side, this will happen:
793b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich1. Deserialize the objects sent from the master and persist them to the local
803b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   database.
813b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich2. monitor_db on the shard will pick up these jobs and schedule them on the
823b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   available hosts (which were retrieved from a heartbeat).
833b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich3. Once a job is finished, it's shard_id is set to NULL
843b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich4. The shard_client will pick up all jobs where shard_id=NULL and will
853b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   send them to the master in the request of the next heartbeat.
863b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   - The master will persist them as described earlier.
873b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich   - the shard_id will be set back to the shard's id, so the record won't be
883b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich     uploaded again.
891b52574752be108a743d3b33561c34324f8538e7Jakob Juelich   The heartbeat request will also contain the ids of incomplete jobs and the
901b52574752be108a743d3b33561c34324f8538e7Jakob Juelich   ids of all hosts. This is used to not send objects repeatedly. For more
911b52574752be108a743d3b33561c34324f8538e7Jakob Juelich   information on this and alternatives considered
92cdd00f20ac7607ff89a97e83e2483a4c8feddb7bAllen Li   see rpc_interface.shard_heartbeat.
933b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich"""
943b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
953b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
963b27dbc2358aef655e050a92510ff8e9e080bf81Jakob JuelichHEARTBEAT_AFE_ENDPOINT = 'shard_heartbeat'
97f960d89a7b197fe3b3bd28546c6c89c2331b9f14Jakob Juelich
988421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob JuelichRPC_TIMEOUT_MIN = 5
998421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob JuelichRPC_DELAY_SEC = 5
1008421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
10122dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian_heartbeat_client = None
1023b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
1033b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
1043b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichclass ShardClient(object):
1053b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    """Performs client side tasks of sharding, i.e. the heartbeat.
1063b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
1078421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    This class contains the logic to do periodic heartbeats to a global AFE,
1083b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    to retrieve new jobs from it and to report completed jobs back.
1093b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    """
1103b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
1113b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    def __init__(self, global_afe_hostname, shard_hostname, tick_pause_sec):
1128421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        self.afe = frontend_wrappers.RetryingAFE(server=global_afe_hostname,
1138421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich                                                 timeout_min=RPC_TIMEOUT_MIN,
1148421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich                                                 delay_sec=RPC_DELAY_SEC)
1153b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        self.hostname = shard_hostname
1163b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        self.tick_pause_sec = tick_pause_sec
1173b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        self._shutdown = False
1188421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        self._shard = None
1193b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
1203b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
121e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu    def _deserialize_many(self, serialized_list, djmodel, message):
122e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        """Deserialize data in JSON format to database.
123e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu
124e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        Deserialize a list of JSON-formatted data to database using Django.
125e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu
126e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        @param serialized_list: A list of JSON-formatted data.
127e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        @param djmodel: Django model type.
128e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        @param message: A string to be used in a logging message.
129e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        """
130e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        for serialized in serialized_list:
131e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu            with transaction.commit_on_success():
132e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu                try:
133e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu                    djmodel.deserialize(serialized)
134e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu                except Exception as e:
135e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu                    logging.error('Deserializing a %s fails: %s, Error: %s',
136e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu                                  message, serialized, e)
13775edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet                    metrics.Counter(
13875edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet                        'chromeos/autotest/shard_client/deserialization_failed'
13975edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet                        ).increment()
140e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu
141e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu
14264418072a5294d7fdbf338e6c573fde26eef1129Prathmesh Prabhu    @metrics.SecondsTimerDecorator(
14364418072a5294d7fdbf338e6c573fde26eef1129Prathmesh Prabhu            'chromeos/autotest/shard_client/heartbeat_response_duration')
1443b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    def process_heartbeat_response(self, heartbeat_response):
1453b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        """Save objects returned by a heartbeat to the local database.
1463b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
1473b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        This deseralizes hosts and jobs including their dependencies and saves
1483b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        them to the local database.
1493b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
1503b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        @param heartbeat_response: A dictionary with keys 'hosts' and 'jobs',
1513b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich                                   as returned by the `shard_heartbeat` rpc
1523b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich                                   call.
1533b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        """
1543b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        hosts_serialized = heartbeat_response['hosts']
1553b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        jobs_serialized = heartbeat_response['jobs']
156f37059917de290d2983a7ca490884a8f694fb23eFang Deng        suite_keyvals_serialized = heartbeat_response['suite_keyvals']
157b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet        incorrect_host_ids = heartbeat_response.get('incorrect_host_ids', [])
1583b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
15975edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet        metrics.Gauge('chromeos/autotest/shard_client/hosts_received'
16075edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet                      ).set(len(hosts_serialized))
16175edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet        metrics.Gauge('chromeos/autotest/shard_client/jobs_received'
16275edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet                      ).set(len(jobs_serialized))
16375edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet        metrics.Gauge('chromeos/autotest/shard_client/suite_keyvals_received'
16475edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet                      ).set(len(suite_keyvals_serialized))
16575edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet
166e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        self._deserialize_many(hosts_serialized, models.Host, 'host')
167e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        self._deserialize_many(jobs_serialized, models.Job, 'job')
168e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        self._deserialize_many(suite_keyvals_serialized, models.JobKeyval,
169e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu                               'jobkeyval')
1703b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
1715cfd96aca6204b6fd193ab2e15a24808756e6198MK Ryu        host_ids = [h['id'] for h in hosts_serialized]
1725cfd96aca6204b6fd193ab2e15a24808756e6198MK Ryu        logging.info('Heartbeat response contains hosts %s', host_ids)
17322dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        job_ids = [j['id'] for j in jobs_serialized]
17422dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        logging.info('Heartbeat response contains jobs %s', job_ids)
175f37059917de290d2983a7ca490884a8f694fb23eFang Deng        parent_jobs_with_keyval = set([kv['job_id']
176f37059917de290d2983a7ca490884a8f694fb23eFang Deng                                       for kv in suite_keyvals_serialized])
177f37059917de290d2983a7ca490884a8f694fb23eFang Deng        logging.info('Heartbeat response contains suite_keyvals_for jobs %s',
178e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu                     list(parent_jobs_with_keyval))
179b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet        if incorrect_host_ids:
180b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet            logging.info('Heartbeat response contains incorrect_host_ids %s '
181b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet                         'which will be deleted.', incorrect_host_ids)
182b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet            self._remove_incorrect_hosts(incorrect_host_ids)
18322dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian
18422dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        # If the master has just sent any jobs that we think have completed,
18522dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        # re-sync them with the master. This is especially useful when a
18622dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        # heartbeat or job is silently dropped, as the next heartbeat will
18722dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        # have a disagreement. Updating the shard_id to NULL will mark these
18822dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        # jobs for upload on the next heartbeat.
189e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        job_models = models.Job.objects.filter(
190e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu                id__in=job_ids, hostqueueentry__complete=True)
191e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        if job_models:
192e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu            job_models.update(shard=None)
193e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu            job_ids_repr = ', '.join([str(job.id) for job in job_models])
194e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu            logging.warn('Following completed jobs are reset shard_id to NULL '
195e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu                         'to be uploaded to master again: %s', job_ids_repr)
19622dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian
1973b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
198b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet    def _remove_incorrect_hosts(self, incorrect_host_ids=None):
199b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet        """Remove from local database any hosts that should not exist.
200b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet
201b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet        Entries of |incorrect_host_ids| that are absent from database will be
202b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet        silently ignored.
203b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet
204b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet        @param incorrect_host_ids: a list of ids (as integers) to remove.
205b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet        """
206b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet        if not incorrect_host_ids:
207b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet            return
208b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet
209b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet        models.Host.objects.filter(id__in=incorrect_host_ids).delete()
210b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet
211b9077b9825143d43a63dc6c89b4f4d0c8facdf4aAviv Keshet
2128421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    @property
2138421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    def shard(self):
2148421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        """Return this shard's own shard object, fetched from the database.
2158421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
2168421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        A shard's object is fetched from the master with the first jobs. It will
2178421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        not exist before that time.
2188421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
2198421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        @returns: The shard object if it already exists, otherwise None
2208421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        """
2218421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        if self._shard is None:
2228421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich            try:
2238421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich                self._shard = models.Shard.smart_get(self.hostname)
2248421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich            except models.Shard.DoesNotExist:
2258421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich                # This might happen before any jobs are assigned to this shard.
2268421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich                # This is okay because then there is nothing to offload anyway.
2278421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich                pass
2288421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        return self._shard
2298421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
2308421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
2318421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    def _get_jobs_to_upload(self):
2328421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        jobs = []
2338421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        # The scheduler sets shard to None upon completion of the job.
2348421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        # For more information on the shard field's semantic see
23522dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        # models.Job.shard. We need to be careful to wait for both the
23622dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        # shard_id and the complete bit here, or we will end up syncing
23722dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        # the job without ever setting the complete bit.
2388421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        job_ids = list(models.Job.objects.filter(
23922dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian            shard=None,
24022dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian            hostqueueentry__complete=True).values_list('pk', flat=True))
2418421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
2428421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        for job_to_upload in models.Job.objects.filter(pk__in=job_ids).all():
2438421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich            jobs.append(job_to_upload)
2448421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        return jobs
2458421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
2468421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
24722dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian    def _mark_jobs_as_uploaded(self, job_ids):
2488421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        # self.shard might be None if no jobs were downloaded yet.
2498421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        # But then job_ids is empty, so this is harmless.
2508421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        # Even if there were jobs we'd in the worst case upload them twice.
2518421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        models.Job.objects.filter(pk__in=job_ids).update(shard=self.shard)
2528421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
2538421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
2548421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    def _get_hqes_for_jobs(self, jobs):
2558421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        hqes = []
2568421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        for job in jobs:
2578421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich            hqes.extend(job.hostqueueentry_set.all())
2588421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        return hqes
2598421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
2608421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
26107a109f19cd7363fb5440c72e870802392b7ce24MK Ryu    def _get_known_jobs_and_hosts(self):
26207a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        """Returns lists of host and job info to send in a heartbeat.
2631b52574752be108a743d3b33561c34324f8538e7Jakob Juelich
2641b52574752be108a743d3b33561c34324f8538e7Jakob Juelich        The host and job ids are ids of objects that are already present on the
2651b52574752be108a743d3b33561c34324f8538e7Jakob Juelich        shard and therefore don't need to be sent again.
2661b52574752be108a743d3b33561c34324f8538e7Jakob Juelich
26707a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        For jobs, only incomplete jobs are sent, as the master won't send
2681b52574752be108a743d3b33561c34324f8538e7Jakob Juelich        already completed jobs anyway. This helps keeping the list of id's
2691b52574752be108a743d3b33561c34324f8538e7Jakob Juelich        considerably small.
2701b52574752be108a743d3b33561c34324f8538e7Jakob Juelich
27107a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        For hosts, host status in addition to host id are sent to master
27207a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        to sync the host status.
27307a109f19cd7363fb5440c72e870802392b7ce24MK Ryu
27407a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        @returns: Tuple of three lists. The first one contains job ids, the
27507a109f19cd7363fb5440c72e870802392b7ce24MK Ryu                  second one host ids, and the third one host statuses.
2761b52574752be108a743d3b33561c34324f8538e7Jakob Juelich        """
2771b52574752be108a743d3b33561c34324f8538e7Jakob Juelich        job_ids = list(models.Job.objects.filter(
27807a109f19cd7363fb5440c72e870802392b7ce24MK Ryu                hostqueueentry__complete=False).values_list('id', flat=True))
27907a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        host_models = models.Host.objects.filter(invalid=0)
28007a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        host_ids = []
28107a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        host_statuses = []
28207a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        for h in host_models:
28307a109f19cd7363fb5440c72e870802392b7ce24MK Ryu            host_ids.append(h.id)
28407a109f19cd7363fb5440c72e870802392b7ce24MK Ryu            host_statuses.append(h.status)
28507a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        return job_ids, host_ids, host_statuses
2861b52574752be108a743d3b33561c34324f8538e7Jakob Juelich
2871b52574752be108a743d3b33561c34324f8538e7Jakob Juelich
28822dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian    def _heartbeat_packet(self):
28922dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        """Construct the heartbeat packet.
29022dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian
291cdd00f20ac7607ff89a97e83e2483a4c8feddb7bAllen Li        See rpc_interface for a more detailed description of the heartbeat.
29222dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian
29322dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        @return: A heartbeat packet.
29422dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        """
29507a109f19cd7363fb5440c72e870802392b7ce24MK Ryu        known_job_ids, known_host_ids, known_host_statuses = (
29607a109f19cd7363fb5440c72e870802392b7ce24MK Ryu                self._get_known_jobs_and_hosts())
29722dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        logging.info('Known jobs: %s', known_job_ids)
29822dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian
29922dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        job_objs = self._get_jobs_to_upload()
30022dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        hqes = [hqe.serialize(include_dependencies=False)
30122dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian                for hqe in self._get_hqes_for_jobs(job_objs)]
30222dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        jobs = [job.serialize(include_dependencies=False) for job in job_objs]
30322dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        logging.info('Uploading jobs %s', [j['id'] for j in jobs])
30422dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian
30522dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        return {'shard_hostname': self.hostname,
30622dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian                'known_job_ids': known_job_ids,
30722dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian                'known_host_ids': known_host_ids,
30807a109f19cd7363fb5440c72e870802392b7ce24MK Ryu                'known_host_statuses': known_host_statuses,
30922dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian                'jobs': jobs, 'hqes': hqes}
31022dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian
31122dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian
3128a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet    def _heartbeat_failure(self, log_message, failure_type_str=''):
3138a43715afb478fa8be16187374618be33ff49442MK Ryu        logging.error("Heartbeat failed. %s", log_message)
31475edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet        metrics.Counter('chromeos/autotest/shard_client/heartbeat_failure'
3158a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet                        ).increment(fields={'failure_type': failure_type_str})
3168a43715afb478fa8be16187374618be33ff49442MK Ryu
3178a43715afb478fa8be16187374618be33ff49442MK Ryu
31864418072a5294d7fdbf338e6c573fde26eef1129Prathmesh Prabhu    @metrics.SecondsTimerDecorator(
31964418072a5294d7fdbf338e6c573fde26eef1129Prathmesh Prabhu            'chromeos/autotest/shard_client/do_heatbeat_duration')
3203b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    def do_heartbeat(self):
3213b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        """Perform a heartbeat: Retreive new jobs.
3223b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3233b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        This function executes a `shard_heartbeat` RPC. It retrieves the
3243b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        response of this call and processes the response by storing the returned
3253b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        objects in the local database.
3263b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        """
32764418072a5294d7fdbf338e6c573fde26eef1129Prathmesh Prabhu        heartbeat_metrics_prefix  = 'chromeos/autotest/shard_client/heartbeat/'
32864418072a5294d7fdbf338e6c573fde26eef1129Prathmesh Prabhu
3293b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        logging.info("Performing heartbeat.")
33022dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        packet = self._heartbeat_packet()
33164418072a5294d7fdbf338e6c573fde26eef1129Prathmesh Prabhu        metrics.Gauge(heartbeat_metrics_prefix + 'request_size').set(
33264418072a5294d7fdbf338e6c573fde26eef1129Prathmesh Prabhu            len(str(packet)))
3338a43715afb478fa8be16187374618be33ff49442MK Ryu
3348a43715afb478fa8be16187374618be33ff49442MK Ryu        try:
3358a43715afb478fa8be16187374618be33ff49442MK Ryu            response = self.afe.run(HEARTBEAT_AFE_ENDPOINT, **packet)
33689cca5d6ef8df35f1b294b16bf536a8f3ffb5efbMK Ryu        except urllib2.HTTPError as e:
3378a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet            self._heartbeat_failure('HTTPError %d: %s' % (e.code, e.reason),
3388a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet                                    'HTTPError')
3398a43715afb478fa8be16187374618be33ff49442MK Ryu            return
34089cca5d6ef8df35f1b294b16bf536a8f3ffb5efbMK Ryu        except urllib2.URLError as e:
3418a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet            self._heartbeat_failure('URLError: %s' % e.reason,
3428a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet                                    'URLError')
3438a43715afb478fa8be16187374618be33ff49442MK Ryu            return
34489cca5d6ef8df35f1b294b16bf536a8f3ffb5efbMK Ryu        except httplib.HTTPException as e:
3458a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet            self._heartbeat_failure('HTTPException: %s' % e,
3468a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet                                    'HTTPException')
3478a43715afb478fa8be16187374618be33ff49442MK Ryu            return
34889cca5d6ef8df35f1b294b16bf536a8f3ffb5efbMK Ryu        except timeout_util.TimeoutError as e:
3498a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet            self._heartbeat_failure('TimeoutError: %s' % e,
3508a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet                                    'TimeoutError')
3518a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet            return
3528a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet        except proxy.JSONRPCException as e:
3538a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet            self._heartbeat_failure('JSONRPCException: %s' % e,
3548a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet                                    'JSONRPCException')
35589cca5d6ef8df35f1b294b16bf536a8f3ffb5efbMK Ryu            return
3568a43715afb478fa8be16187374618be33ff49442MK Ryu
35764418072a5294d7fdbf338e6c573fde26eef1129Prathmesh Prabhu        metrics.Gauge(heartbeat_metrics_prefix + 'response_size').set(
35864418072a5294d7fdbf338e6c573fde26eef1129Prathmesh Prabhu            len(str(response)))
35922dd226625255110c079e979113dcda1f4fa5ea8Prashanth Balasubramanian        self._mark_jobs_as_uploaded([job['id'] for job in packet['jobs']])
3603b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        self.process_heartbeat_response(response)
3613b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        logging.info("Heartbeat completed.")
3623b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3633b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3643b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    def tick(self):
3653b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        """Performs all tasks the shard clients needs to do periodically."""
3663b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        self.do_heartbeat()
36775edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet        metrics.Counter('chromeos/autotest/shard_client/tick').increment()
3683b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3693b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3703b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    def loop(self):
3713b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        """Calls tick() until shutdown() is called."""
3723b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        while not self._shutdown:
3733b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich            self.tick()
374dfd1f5279eb70b1936b97e19bf8e8d6bf7210118Aviv Keshet            # Sleep with +/- 10% fuzzing to avoid phaselock of shards.
375dfd1f5279eb70b1936b97e19bf8e8d6bf7210118Aviv Keshet            tick_fuzz = self.tick_pause_sec * 0.2 * (random.random() - 0.5)
376dfd1f5279eb70b1936b97e19bf8e8d6bf7210118Aviv Keshet            time.sleep(self.tick_pause_sec + tick_fuzz)
3773b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3783b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3793b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    def shutdown(self):
3803b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        """Stops the shard client after the current tick."""
3813b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        logging.info("Shutdown request received.")
3823b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        self._shutdown = True
3833b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3843b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3853b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichdef handle_signal(signum, frame):
3863b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    """Sigint handler so we don't crash mid-tick."""
3873b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    _heartbeat_client.shutdown()
3883b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3893b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
3908421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelichdef _get_shard_hostname_and_ensure_running_on_shard():
3918421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    """Read the hostname the local shard from the global configuration.
3928421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
3938421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    Raise an exception if run from elsewhere than a shard.
3948421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich
3958421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    @raises error.HeartbeatOnlyAllowedInShardModeException if run from
3968421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich            elsewhere than from a shard.
3978421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    """
3988421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    hostname = global_config.global_config.get_config_value(
3998421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        'SHARD', 'shard_hostname', default=None)
4008421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    if not hostname:
4018421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich        raise error.HeartbeatOnlyAllowedInShardModeException(
4028421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich            'To run the shard client, shard_hostname must neither be None nor '
4038421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich            'empty.')
4048421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    return hostname
4053b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4063b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4073b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichdef _get_tick_pause_sec():
4083b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    """Read pause to make between two ticks from the global configuration."""
4093b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    return global_config.global_config.get_config_value(
4103b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        'SHARD', 'heartbeat_pause_sec', type=float)
4113b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4123b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4133b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichdef get_shard_client():
4143b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    """Instantiate a shard client instance.
4153b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4163b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    Configuration values will be read from the global configuration.
4173b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4183b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    @returns A shard client instance.
4193b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    """
4200cb2a3b1d2d86d70da06a3f45be9297139e48207Fang Deng    global_afe_hostname = server_utils.get_global_afe_hostname()
4218421d5905ab0aed8689c2eea6be8d9c4042ce618Jakob Juelich    shard_hostname = _get_shard_hostname_and_ensure_running_on_shard()
4223b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    tick_pause_sec = _get_tick_pause_sec()
4233b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    return ShardClient(global_afe_hostname, shard_hostname, tick_pause_sec)
4243b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4253b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4263b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichdef main():
42775edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet    ts_mon_config.SetupTsMonGlobalState('shard_client')
42875edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet
4293b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    try:
43075edb9cf7bce68146348a9779ebb61743b8e09c3Aviv Keshet        metrics.Counter('chromeos/autotest/shard_client/start').increment()
4313b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        main_without_exception_handling()
4323b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    except Exception as e:
4338a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet        metrics.Counter('chromeos/autotest/shard_client/uncaught_exception'
4348a93beb6aecbd0c4aaa1335b14c0e611cbd30bd4Aviv Keshet                        ).increment()
435e72a90b5942d293f67f027eaa45f6b126680ec2aMK Ryu        message = 'Uncaught exception. Terminating shard_client.'
4363b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        email_manager.manager.log_stacktrace(message)
4373b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        logging.exception(message)
4383b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        raise
4393b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    finally:
4403b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich        email_manager.manager.send_queued_emails()
4413b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4423b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4433b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichdef main_without_exception_handling():
4443b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    parser = argparse.ArgumentParser(description='Shard client.')
4453b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    options = parser.parse_args()
4463b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4475949b4af7a872aeb58e7ad29090812d648725ed5Prashanth Balasubramanian    scheduler_lib.setup_logging(
4485949b4af7a872aeb58e7ad29090812d648725ed5Prashanth Balasubramanian            os.environ.get('AUTOTEST_SCHEDULER_LOG_DIR', None),
4495949b4af7a872aeb58e7ad29090812d648725ed5Prashanth Balasubramanian            None, timestamped_logfile_prefix='shard_client')
4503b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4513b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    logging.info("Setting signal handler.")
4523b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    signal.signal(signal.SIGINT, handle_signal)
4533b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    signal.signal(signal.SIGTERM, handle_signal)
4543b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4553b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    logging.info("Starting shard client.")
4563b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    global _heartbeat_client
4573b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    _heartbeat_client = get_shard_client()
4583b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    _heartbeat_client.loop()
4593b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4603b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich
4613b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelichif __name__ == '__main__':
4623b27dbc2358aef655e050a92510ff8e9e080bf81Jakob Juelich    main()
463