1#pylint: disable-msg=C0111
2
3"""
4Prejob tasks.
5
6Prejob tasks _usually_ run before a job and verify the state of a machine.
7Cleanup and repair are exceptions, cleanup can run after a job too, while
8repair will run anytime the host needs a repair, which could be pre or post
9job. Most of the work specific to this module is achieved through the prolog
10and epilog of each task.
11
12All prejob tasks must have a host, though they may not have an HQE. If a
13prejob task has a hqe, it will activate the hqe through its on_pending
14method on successful completion. A row in afe_special_tasks with values:
15    host=C1, unlocked, is_active=0, is_complete=0, type=Verify
16will indicate to the scheduler that it needs to schedule a new special task
17of type=Verify, against the C1 host. While the special task is running
18the scheduler only monitors it through the Agent, and its is_active bit=1.
19Once a special task finishes, we set its is_active=0, is_complete=1 and
20success bits, so the scheduler ignores it.
21HQE.on_pending:
22    Host, HQE -> Pending, Starting
23    This status is acted upon in the scheduler, to assign an AgentTask.
24PreJobTask:
25    epilog:
26        failure:
27            requeue hqe
28            repair the host
29Children PreJobTasks:
30    prolog:
31        set Host, HQE status
32    epilog:
33        success:
34            on_pending
35        failure:
36            repair through PreJobTask
37            set Host, HQE status
38
39Failing a prejob task affects both the Host and the HQE, as follows:
40
41- Host: PreJob failure will result in a Repair job getting queued against
42the host, if we haven't already tried repairing it more than the
43max_repair_limit. When this happens, the host will remain in whatever status
44the prejob task left it in, till the Repair job puts it into 'Repairing'. This
45way the host_scheduler won't pick bad hosts and assign them to jobs.
46
47If we have already tried repairing the host too many times, the PreJobTask
48will flip the host to 'RepairFailed' in its epilog, and it will remain in this
49state till it is recovered and reverified.
50
51- HQE: Is either requeued or failed. Requeuing the HQE involves putting it
52in the Queued state and setting its host_id to None, so it gets a new host
53in the next scheduler tick. Failing the HQE results in either a Parsing
54or Archiving postjob task, and an eventual Failed status for the HQE.
55"""
56
57import logging
58import os
59
60from autotest_lib.client.common_lib import host_protections
61from autotest_lib.frontend.afe import models
62from autotest_lib.scheduler import agent_task, scheduler_config
63from autotest_lib.server import autoserv_utils
64from autotest_lib.server.cros import provision
65
66
class PreJobTask(agent_task.SpecialAgentTask):
    """Shared base class for the special tasks that prepare a host for a job.

    The failure handling common to all prejob tasks lives in epilog(): the
    associated HQE (if any) is requeued or failed, and a Repair special task
    is queued against the host unless the repair limit was already reached.
    """

    def _copy_to_results_repository(self):
        """
        Copy this task's autoserv debug log next to the HQE's results.

        Does nothing when there is no queue entry, or when the queue entry
        has meta_host set.
        """
        hqe = self.queue_entry
        if not hqe or hqe.meta_host:
            return

        hqe.set_execution_subdir()
        # The copied log is named after the task's execution directory.
        task_log_name = os.path.basename(self.task.execution_path())
        debug_log = os.path.join(
                self.task.execution_path(), 'debug', 'autoserv.DEBUG')
        target = os.path.join(hqe.execution_path(), task_log_name)

        self.monitor.try_copy_to_results_repository(
                debug_log, destination_path=target)


    def epilog(self):
        """
        React to the outcome of the prejob task.

        Nothing more is done on success.  A failure on a DO_NOT_VERIFY host
        is turned into a success.  Any other failure:
        1) marks the remaining incomplete special tasks of this HQE/host as
           complete and unsuccessful,
        2) fails the HQE if the provision retry limit is exceeded, else
           requeues it,
        3) flips the host to REPAIR_FAILED and fails the HQE once the repair
           limit is reached, otherwise queues a new Repair special task.
        Steps 1 and 2 and the repair-limit check only apply when there is an
        HQE; without one a Repair task is simply queued against the host.
        """
        super(PreJobTask, self).epilog()

        if self.success:
            return

        if self.host.protection == host_protections.Protection.DO_NOT_VERIFY:
            # Failures are effectively ignored for these hosts.
            self.success = True
            return

        # Model-level HQE attached to the repair task; stays None when this
        # task has no queue entry.
        hqe_model = None
        if self.queue_entry:
            # Requeuing an HQE while pre-job tasks are still pending against
            # this host would leave a queued HQE with special tasks to run
            # against a host, so cancel whatever is left first.
            leftover_tasks = models.SpecialTask.objects.filter(
                    queue_entry__id=self.queue_entry.id,
                    host__id=self.host.id,
                    is_complete=0)
            leftover_tasks.update(is_complete=1, success=0)

            provision_attempts = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.PROVISION,
                    queue_entry_id=self.queue_entry.id).count()
            provision_limit = scheduler_config.config.max_provision_retries
            if provision_attempts <= provision_limit:
                # requeue() has to be decided after the provision retry
                # check, because _actually_fail_queue_entry needs an
                # execution subdir.  Requeuing after hitting the retry limit
                # would also overwrite the PARSING state of the HQE.
                self.queue_entry.requeue()
            else:
                self._actually_fail_queue_entry()
                # Ideally the HQE would also be aborted here, to flag that we
                # are killing it.  That would recursively abort child jobs as
                # well, which should not matter: only suites have children,
                # and they are hostless and therefore never provision.
                # TODO(milleral) http://crbug.com/188217
                # We can't do it yet though: with the abort bit set, the
                # FinalReparseTask sets the HQE status to ABORTED and the
                # status is then not shown in run_suite.  Until that is
                # sorted out, queue_entry.abort() is deliberately not called.

            # Cap the number of repairs triggered by failing prejob tasks
            # (reset, verify, ...).  The cap is per HQE and host.
            repair_attempts = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.REPAIR,
                    queue_entry_id=self.queue_entry.id,
                    host_id=self.queue_entry.host_id).count()
            if repair_attempts >= scheduler_config.config.max_repair_limit:
                self.host.set_status(models.Host.Status.REPAIR_FAILED)
                self._fail_queue_entry()
                return

            hqe_model = models.HostQueueEntry.objects.get(
                    id=self.queue_entry.id)

        host_model = models.Host.objects.get(id=self.host.id)
        models.SpecialTask.objects.create(
                host=host_model,
                task=models.SpecialTask.Task.REPAIR,
                queue_entry=hqe_model,
                requested_by=self.task.requested_by)


    def _should_pending(self):
        """
        Tell whether the host queue entry's on_pending should be called.

        That is the case when all of the following hold:
        1) This task has an associated host queue entry.
        2) This special task completed successfully.
        3) No other special task is still waiting to run for this host and
           host queue entry.

        @returns: True if we should call pending, false if not.

        """
        if not (self.queue_entry and self.success):
            return False

        # Whether a task is the last one is known when it gets created, so
        # this could be tracked in an extra database column; the query below
        # is expected to be cheap enough instead.
        hqe_model = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
        waiting_tasks = models.SpecialTask.objects.filter(
                host__id=self.host.id, is_active=False,
                is_complete=False, queue_entry=hqe_model
        ).exclude(id=self.task.id)
        return waiting_tasks.count() == 0
176
177
class VerifyTask(PreJobTask):
    """Prejob task of type VERIFY.

    Hands '-v', plus label arguments when the task has a queue entry, to the
    base class as extra command arguments.
    """

    TASK_TYPE = models.SpecialTask.Task.VERIFY


    def __init__(self, task):
        """
        @param task: the special task to run; task.queue_entry may be unset.
        """
        extra_args = ['-v']
        if task.queue_entry:
            extra_args += self._generate_autoserv_label_args(task)
        super(VerifyTask, self).__init__(task, extra_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Put the HQE (if any) and the host into the VERIFYING status."""
        super(VerifyTask, self).prolog()

        logging.info("starting verify on %s", self.host.hostname)
        verifying_hqe_status = models.HostQueueEntry.Status.VERIFYING
        if self.queue_entry:
            self.queue_entry.set_status(verifying_hqe_status)
        self.host.set_status(models.Host.Status.VERIFYING)

        # One verify is enough: drop the queued manual reverifies for this
        # host, there is no point in keeping a record of the other requests.
        self.remove_special_tasks(models.SpecialTask.Task.VERIFY,
                                  keep_last_one=True)


    def epilog(self):
        """On success, activate the HQE or mark the host READY."""
        super(VerifyTask, self).epilog()
        if not self.success:
            return

        if self._should_pending():
            self.queue_entry.on_pending()
            return

        self.host.set_status(models.Host.Status.READY)
211
212
class CleanupTask(PreJobTask):
    """Prejob task of type CLEANUP.

    Cleanup may also run after a job.  In that case it runs standalone
    against the host (unrelated to the job), which is why it is not
    considered a PostJobTask.
    """

    TASK_TYPE = models.SpecialTask.Task.CLEANUP


    def __init__(self, task, recover_run_monitor=None):
        """
        @param task: the special task to run; task.queue_entry may be unset.
        @param recover_run_monitor: accepted but not used by this method.
        """
        extra_args = ['--cleanup']
        if task.queue_entry:
            extra_args += self._generate_autoserv_label_args(task)
        super(CleanupTask, self).__init__(task, extra_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Put the host and the HQE (if any) into the CLEANING status."""
        super(CleanupTask, self).prolog()
        logging.info("starting cleanup task for host: %s", self.host.hostname)
        self.host.set_status(models.Host.Status.CLEANING)
        if not self.queue_entry:
            return
        self.queue_entry.set_status(models.HostQueueEntry.Status.CLEANING)


    def _finish_epilog(self):
        """
        Decide what follows a successful cleanup that belongs to an HQE.

        Queues a Verify special task when the job asks for verification and
        the host is not protected with DO_NOT_VERIFY; otherwise activates the
        HQE if no other special task is left to run.  Does nothing without a
        queue entry or when the cleanup failed.
        """
        if not (self.queue_entry and self.success):
            return

        skip_verify_protection = host_protections.Protection.DO_NOT_VERIFY
        verify_needed = (
                self.queue_entry.job.run_verify
                and self.host.protection != skip_verify_protection)
        if not verify_needed:
            if self._should_pending():
                self.queue_entry.on_pending()
            return

        hqe_model = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
        host_model = models.Host.objects.get(id=self.host.id)
        models.SpecialTask.objects.create(
                host=host_model,
                queue_entry=hqe_model,
                task=models.SpecialTask.Task.VERIFY)


    def epilog(self):
        """On success clear the host's dirty flag and mark it READY."""
        super(CleanupTask, self).epilog()

        if self.success:
            cleaned_host = self.host
            cleaned_host.update_field('dirty', 0)
            cleaned_host.set_status(models.Host.Status.READY)

        # Runs unconditionally; it bails out by itself on failure.
        self._finish_epilog()
264
265
class ResetTask(PreJobTask):
    """Task to reset a DUT, including cleanup and verify.

    Reset may also run after a job.  In that case it runs standalone against
    the host (unrelated to the job), which is why it is not considered a
    PostJobTask.
    """

    TASK_TYPE = models.SpecialTask.Task.RESET


    def __init__(self, task, recover_run_monitor=None):
        """
        @param task: the special task to run; task.queue_entry may be unset.
        @param recover_run_monitor: accepted but not used by this method.
        """
        extra_args = ['--reset']
        if task.queue_entry:
            extra_args += self._generate_autoserv_label_args(task)
        super(ResetTask, self).__init__(task, extra_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """
        Put the host and the HQE (if any) into the RESETTING status and drop
        the queued special tasks that this reset makes redundant.
        """
        super(ResetTask, self).prolog()
        logging.info('starting reset task for host: %s',
                     self.host.hostname)
        self.host.set_status(models.Host.Status.RESETTING)
        if self.queue_entry:
            self.queue_entry.set_status(models.HostQueueEntry.Status.RESETTING)

        # Queued cleanups and reverifies for this host are all deleted, and
        # only one reset is needed, so just the last queued one is kept.
        redundant_tasks = (
                (models.SpecialTask.Task.CLEANUP, False),
                (models.SpecialTask.Task.VERIFY, False),
                (models.SpecialTask.Task.RESET, True),
        )
        for task_type, keep_last in redundant_tasks:
            self.remove_special_tasks(task_type, keep_last_one=keep_last)


    def epilog(self):
        """
        On success clear the host's dirty flag, then either activate the HQE
        or mark the host READY.
        """
        super(ResetTask, self).epilog()

        if not self.success:
            return

        self.host.update_field('dirty', 0)

        if self._should_pending():
            self.queue_entry.on_pending()
            return

        self.host.set_status(models.Host.Status.READY)
314
315
class ProvisionTask(PreJobTask):
    """Prejob task of type PROVISION; always tied to a host queue entry."""

    TASK_TYPE = models.SpecialTask.Task.PROVISION

    def __init__(self, task):
        """
        @param task: the special task to run; it must have a queue_entry.
        """
        # A provision task only makes sense for a job/queue entry.
        assert task.queue_entry, "No HQE associated with provision task!"
        # task.queue_entry is the afe model HostQueueEntry, whereas
        # self.queue_entry is the scheduler models HostQueueEntry.  The
        # latter is only built and assigned inside __init__, so it does not
        # exist yet at this point.  The labels needed for the --provision
        # arguments therefore have to come from the afe model.
        job_label_names = set(
                label.name for label in task.queue_entry.job.labels)
        _, provision_labels = provision.filter_labels(job_label_names)
        provision_args = ['--provision']
        provision_args.append('--job-labels')
        provision_args.append(','.join(provision_labels))
        super(ProvisionTask, self).__init__(task, provision_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def _command_line(self):
        """
        Build the command line for this task without passing a queue entry.

        @returns: whatever autoserv_utils._autoserv_command_line returns for
                  this host and the extra provision arguments.
        """
        # Handing queue_entry to _autoserv_command_line would make it append
        # -c for this invocation when the queue_entry is a client side test.
        # That messes with provisioning, so the queue entry is simply left
        # out of the arguments.
        # job_repo_url is not verified either: provisioning tasks are
        # required to stage whatever content we need, and the job itself will
        # force autotest to be staged if it isn't already.
        hostname = self.host.hostname
        return autoserv_utils._autoserv_command_line(
                hostname, self._extra_command_args, in_lab=True)


    def prolog(self):
        """Put the HQE and the host into the PROVISIONING status."""
        super(ProvisionTask, self).prolog()
        # add check for previous provision task and abort if exist.
        logging.info("starting provision task for host: %s", self.host.hostname)
        provisioning_hqe_status = models.HostQueueEntry.Status.PROVISIONING
        self.queue_entry.set_status(provisioning_hqe_status)
        self.host.set_status(models.Host.Status.PROVISIONING)


    def epilog(self):
        """Activate the HQE when provisioning succeeded and nothing is left."""
        super(ProvisionTask, self).epilog()

        # The host status is never set directly here.  After a successful
        # provision it is set as a fallout of the hqe's on_pending.  When
        # on_pending is not called, the reason is one of:
        #   1. This task was not successful; the DUT keeps whatever status
        #      the PreJobTask's epilog left it in:
        #       a. Another repair is queued: this repair job will set the host
        #       status, and it will remain in 'Provisioning' till then.
        #       b. We have hit the max_repair_limit: in which case the host
        #       status is set to 'RepairFailed' in the epilog of PreJobTask.
        #   2. The task was successful, but there are other special tasks:
        #      Those special tasks will set the host status appropriately.
        if not self._should_pending():
            return
        self.queue_entry.on_pending()
374
375
class RepairTask(agent_task.SpecialAgentTask):
    """Special task of type REPAIR; derives directly from SpecialAgentTask."""

    TASK_TYPE = models.SpecialTask.Task.REPAIR


    def __init__(self, task):
        """
        @param task: the special task to run.  Its host's protection level is
                     passed on via --host-protection; if it has a queue
                     entry, that entry is marked failed when the repair
                     fails (see epilog).
        """
        # Convert the protection value to its string form, then normalize
        # that string to the attribute name.
        protection_name = host_protections.Protection.get_attr_name(
                host_protections.Protection.get_string(task.host.protection))

        extra_args = ['-R', '--host-protection', protection_name]
        if task.queue_entry:
            extra_args += self._generate_autoserv_label_args(task)

        super(RepairTask, self).__init__(task, extra_args)

        # The queue entry is deliberately *not* part of the IDs: if the
        # queue entry is aborted, the repair task should be left running.
        self._set_ids(host=self.host)


    def prolog(self):
        """Put the host into the REPAIRING status."""
        super(RepairTask, self).prolog()
        logging.info("repair_task starting")
        self.host.set_status(models.Host.Status.REPAIRING)


    def epilog(self):
        """
        Mark the host READY on success; on failure mark it REPAIR_FAILED and
        fail the queue entry, if there is one.
        """
        super(RepairTask, self).epilog()

        if self.success:
            self.host.set_status(models.Host.Status.READY)
            return

        self.host.set_status(models.Host.Status.REPAIR_FAILED)
        if self.queue_entry:
            self._fail_queue_entry()
415