#pylint: disable=C0111

"""
Prejob tasks.

Prejob tasks _usually_ run before a job and verify the state of a machine.
Cleanup and repair are exceptions: cleanup can also run after a job, while
repair runs whenever the host needs a repair, which could be pre- or
post-job. Most of the work specific to this module is achieved through the
prolog and epilog of each task.

All prejob tasks must have a host, though they may not have an HQE. If a
prejob task has an HQE, it will activate the HQE through its on_pending
method on successful completion. A row in afe_special_tasks with values:
    host=C1, unlocked, is_active=0, is_complete=0, type=Verify
will indicate to the scheduler that it needs to schedule a new special task
of type=Verify against the C1 host. While the special task is running,
the scheduler only monitors it through the Agent, and its is_active bit is 1.
Once a special task finishes, we set its is_active=0, is_complete=1 and
success bits, so the scheduler ignores it.
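
For example (an illustrative sketch of the column values described above),
a Verify task's row moves through these states:
    scheduled: is_active=0, is_complete=0
    running:   is_active=1, is_complete=0
    finished:  is_active=0, is_complete=1, success=0 or 1
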
HQE.on_pending:
    Host, HQE -> Pending, Starting
    This status is acted upon in the scheduler, to assign an AgentTask.
PreJobTask:
    epilog:
        failure:
            requeue hqe
            repair the host
Children PreJobTasks:
    prolog:
        set Host, HQE status
    epilog:
        success:
            on_pending
        failure:
            repair through PreJobTask
            set Host, HQE status

Failing a prejob task affects both the Host and the HQE, as follows:

- Host: a prejob failure will result in a Repair job getting queued against
the host, if we haven't already tried repairing it more than the
max_repair_limit. When this happens, the host will remain in whatever status
the prejob task left it in, until the Repair job puts it into 'Repairing'.
This way the host_scheduler won't pick bad hosts and assign them to jobs.

If we have already tried repairing the host too many times, the PreJobTask
will flip the host to 'RepairFailed' in its epilog, and it will remain in
this state until it is recovered and reverified.

- HQE: is either requeued or failed. Requeuing the HQE involves putting it
in the Queued state and setting its host_id to None, so it gets a new host
in the next scheduler tick. Failing the HQE results in either a Parsing
or Archiving postjob task, and an eventual Failed status for the HQE.
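
A simplified sketch of the failure handling in PreJobTask.epilog (the code
below is authoritative):
    success                 -> record host as working, done
    provision retries hit   -> fail the HQE
    otherwise               -> requeue the HQE
    repair limit hit        -> host to 'RepairFailed', fail the HQE
    otherwise               -> queue a Repair special task against the host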
55"""
56
57import logging
58import re
59
60from autotest_lib.client.common_lib import host_protections
61from autotest_lib.frontend.afe import models
62from autotest_lib.scheduler import agent_task, scheduler_config
63from autotest_lib.server import autoserv_utils
64from autotest_lib.server.cros import provision
65
66
class PreJobTask(agent_task.SpecialAgentTask):
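    """Base class for special tasks that run before a job.

    See the module docstring for the Host/HQE state transitions that the
    epilog below implements on failure.
    """
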
    def epilog(self):
        super(PreJobTask, self).epilog()

        if self.host.protection == host_protections.Protection.DO_NOT_VERIFY:
            # effectively ignore failure for these hosts
            self.success = True

        if self.success:
            self.host.record_working_state(True,
                                           self.task.time_finished)
            return

        if self.queue_entry:
            # If we requeue an HQE, we should cancel any remaining pre-job
            # tasks against this host, otherwise we'll be left in a state
            # where a queued HQE has special tasks to run against a host.
            models.SpecialTask.objects.filter(
                    queue_entry__id=self.queue_entry.id,
                    host__id=self.host.id,
                    is_complete=0).update(is_complete=1, success=0)

            previous_provisions = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.PROVISION,
                    queue_entry_id=self.queue_entry.id).count()
            if (previous_provisions >
                scheduler_config.config.max_provision_retries):
                self._actually_fail_queue_entry()
                # This abort will mark the aborted bit on the HQE itself, to
                # signify that we're killing it.  Technically it also will do
                # the recursive aborting of all child jobs, but that shouldn't
                # matter here, as only suites have children, and those are
                # hostless and thus don't have provisioning.
                # TODO(milleral) http://crbug.com/188217
                # However, we can't actually do this yet, as if we set the
                # abort bit the FinalReparseTask will set the status of the HQE
                # to ABORTED, which then means that we don't show the status in
                # run_suite.  So in the meantime, don't mark the HQE as
                # aborted.
                # queue_entry.abort()
            else:
                # requeue() must come after handling provision retries, since
                # _actually_fail_queue_entry needs an execution subdir.
                # We also don't want to requeue if we hit the provision retry
                # limit, since then we overwrite the PARSING state of the HQE.
                self.queue_entry.requeue()

            # Limit the repairs on a host when a prejob task fails, e.g.
            # reset, verify, etc. The number of repair jobs is limited to the
            # specific HQE and host.
            previous_repairs = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.REPAIR,
                    queue_entry_id=self.queue_entry.id,
                    host_id=self.queue_entry.host_id).count()
            if previous_repairs >= scheduler_config.config.max_repair_limit:
                self.host.set_status(models.Host.Status.REPAIR_FAILED)
                self._fail_queue_entry()
                return

            queue_entry = models.HostQueueEntry.objects.get(
                    id=self.queue_entry.id)
        else:
            queue_entry = None

        models.SpecialTask.objects.create(
                host=models.Host.objects.get(id=self.host.id),
                task=models.SpecialTask.Task.REPAIR,
                queue_entry=queue_entry,
                requested_by=self.task.requested_by)


    def _should_pending(self):
        """
        Decide if we should call the host queue entry's on_pending method.
        We should if:
        1) There exists an associated host queue entry.
        2) The current special task completed successfully.
        3) There do not exist any more special tasks to be run before the
           host queue entry starts.

        @returns: True if we should call on_pending, False if not.

        """
        if not self.queue_entry or not self.success:
            return False

        # We know if this is the last one when we create it, so we could add
        # another column to the database to keep track of this information, but
        # I expect the overhead of querying here to be minimal.
        queue_entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
        queued = models.SpecialTask.objects.filter(
                host__id=self.host.id, is_active=False,
                is_complete=False, queue_entry=queue_entry)
        queued = queued.exclude(id=self.task.id)
        return queued.count() == 0


class VerifyTask(PreJobTask):
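    """Special task to verify a host before a job runs."""
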
    TASK_TYPE = models.SpecialTask.Task.VERIFY


    def __init__(self, task):
        args = ['-v']
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))
        super(VerifyTask, self).__init__(task, args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        super(VerifyTask, self).prolog()

        logging.info("starting verify on %s", self.host.hostname)
        if self.queue_entry:
            self.queue_entry.set_status(models.HostQueueEntry.Status.VERIFYING)
        self.host.set_status(models.Host.Status.VERIFYING)

        # Delete any queued manual reverifies for this host.  One verify will
        # do and there's no need to keep records of other requests.
        self.remove_special_tasks(models.SpecialTask.Task.VERIFY,
                                  keep_last_one=True)


    def epilog(self):
        super(VerifyTask, self).epilog()
        if self.success:
            if self._should_pending():
                self.queue_entry.on_pending()
            else:
                self.host.set_status(models.Host.Status.READY)


class CleanupTask(PreJobTask):
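    """Special task to clean up a host."""
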
    # note this can also run post-job, but when it does, it's running standalone
    # against the host (not related to the job), so it's not considered a
    # PostJobTask

    TASK_TYPE = models.SpecialTask.Task.CLEANUP


    def __init__(self, task, recover_run_monitor=None):
        args = ['--cleanup']
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))
        super(CleanupTask, self).__init__(task, args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        super(CleanupTask, self).prolog()
        logging.info("starting cleanup task for host: %s", self.host.hostname)
        self.host.set_status(models.Host.Status.CLEANING)
        if self.queue_entry:
            self.queue_entry.set_status(models.HostQueueEntry.Status.CLEANING)


    def _finish_epilog(self):
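        """Queue a verify if requested, else move the HQE to pending."""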
        if not self.queue_entry or not self.success:
            return

        do_not_verify_protection = host_protections.Protection.DO_NOT_VERIFY
        should_run_verify = (
                self.queue_entry.job.run_verify
                and self.host.protection != do_not_verify_protection)
        if should_run_verify:
            entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
            models.SpecialTask.objects.create(
                    host=models.Host.objects.get(id=self.host.id),
                    queue_entry=entry,
                    task=models.SpecialTask.Task.VERIFY)
        else:
            if self._should_pending():
                self.queue_entry.on_pending()


    def epilog(self):
        super(CleanupTask, self).epilog()

        if self.success:
            self.host.update_field('dirty', 0)
            self.host.set_status(models.Host.Status.READY)

        self._finish_epilog()


class ResetTask(PreJobTask):
    """Task to reset a DUT, including cleanup and verify."""
    # note this can also run post-job, but when it does, it's running standalone
    # against the host (not related to the job), so it's not considered a
    # PostJobTask

    TASK_TYPE = models.SpecialTask.Task.RESET


    def __init__(self, task, recover_run_monitor=None):
        args = ['--reset']
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))
        super(ResetTask, self).__init__(task, args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        super(ResetTask, self).prolog()
        logging.info('starting reset task for host: %s',
                     self.host.hostname)
        self.host.set_status(models.Host.Status.RESETTING)
        if self.queue_entry:
            self.queue_entry.set_status(models.HostQueueEntry.Status.RESETTING)

        # Delete any queued cleanups for this host.
        self.remove_special_tasks(models.SpecialTask.Task.CLEANUP,
                                  keep_last_one=False)

        # Delete any queued reverifies for this host.
        self.remove_special_tasks(models.SpecialTask.Task.VERIFY,
                                  keep_last_one=False)

        # Only one reset is needed.
        self.remove_special_tasks(models.SpecialTask.Task.RESET,
                                  keep_last_one=True)


    def epilog(self):
        super(ResetTask, self).epilog()

        if self.success:
            self.host.update_field('dirty', 0)

            if self._should_pending():
                self.queue_entry.on_pending()
            else:
                self.host.set_status(models.Host.Status.READY)


# TODO (ayatane): Refactor using server/cros/provision
def _is_cros_version(label):
    """Return whether the label is a cros-version: label."""
    return label.startswith('cros-version:')


# TODO (ayatane): Refactor using server/cros/provision
def _get_cros_version(label):
    """Return cros-version from cros-version label."""
    return label[len('cros-version:'):]


# TODO (ayatane): Refactor into server/cros/provision
class _CrosImage(object):
    """The name of a CrOS image."""

    _name_pattern = re.compile(
        r'^'
        r'(?P<group>[a-z0-9-]+)'
        r'/'
        r'(?P<milestone>LATEST|R[0-9]+)'
        r'-'
        r'(?P<version>[0-9.]+)'
        r'(-(?P<rc>rc[0-9]+))?'
        r'$'
    )

    def __init__(self, name):
        """Initialize instance.

        @param name: Image name string (lumpy-release/R27-3773.0.0)
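
        Example (illustrative):
            >>> _CrosImage('lumpy-release/R27-3773.0.0').milestone
            'R27'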
333        """
334        self._name = name
335        match = self._name_pattern.search(name)
336        if match is None:
337            raise ValueError('Invalid CrOS image name: %r' % name)
338        self.group = match.group('group')
339        self.milestone = match.group('milestone')
340        self.version = match.group('version')
341        self.rc = match.group('rc')
342
343    def __repr__(self):
344        return '{cls}({name!r})'.format(cls=type(self).__name__,
345                                        name=self._name)
346
347    def __str__(self):
348        return self._name
349
350
class ProvisionTask(PreJobTask):
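    """Special task to provision a host with a job's provisionable labels."""
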
    TASK_TYPE = models.SpecialTask.Task.PROVISION

    def __init__(self, task):
        # Provisioning requires that we be associated with a job/queue entry
        assert task.queue_entry, "No HQE associated with provision task!"
        # task.queue_entry is an afe model HostQueueEntry object.
        # self.queue_entry is a scheduler models HostQueueEntry object, but
        # it gets constructed and assigned in __init__, so it's not available
        # yet.  Therefore, we're stuck pulling labels off of the afe model
        # so that we can pass the --provision args into the __init__ call.
        labels = {x.name for x in task.queue_entry.job.labels}
        _, provisionable = provision.Provision.partition(labels)
        extra_command_args = ['--provision',
                              '--job-labels', ','.join(provisionable)]
        super(ProvisionTask, self).__init__(task, extra_command_args)
        self._set_milestone(labels)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def _set_milestone(self, labels):
        """Set build milestone from the labels.

        @param labels: iterable of labels.
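
        Example (illustrative): from the label
        'cros-version:lumpy-release/R27-3773.0.0', the milestone is 'R27'.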
375        """
376        labels = (label
377                  for label in labels
378                  if _is_cros_version(label))
379        for label in labels:
380            try:
381                cros_image = _CrosImage(_get_cros_version(label))
382            except ValueError as e:
383                logging.warning('Could not parse cros-version. Error msg: %s', e)
384                self._milestone = 'N/A'
385            else:
386                self._milestone = cros_image.milestone
387            break
388
389
390    def _command_line(self):
391        # If we give queue_entry to _autoserv_command_line, then it will append
392        # -c for this invocation if the queue_entry is a client side test. We
393        # don't want that, as it messes with provisioning, so we just drop it
394        # from the arguments here.
395        # Note that we also don't verify job_repo_url as provisioining tasks are
396        # required to stage whatever content we need, and the job itself will
397        # force autotest to be staged if it isn't already.
398        return autoserv_utils._autoserv_command_line(self.host.hostname,
399                                                     self._extra_command_args,
400                                                     in_lab=True)
401
402
403    def prolog(self):
404        super(ProvisionTask, self).prolog()
405        # add check for previous provision task and abort if exist.
406        logging.info("starting provision task for host: %s", self.host.hostname)
407        self.queue_entry.set_status(
408                models.HostQueueEntry.Status.PROVISIONING)
409        self.host.set_status(models.Host.Status.PROVISIONING)
410
411
412    def epilog(self):
413        super(ProvisionTask, self).epilog()
414
415        # If we were not successful in provisioning the machine
416        # leave the DUT in whatever status was set in the PreJobTask's
417        # epilog. If this task was successful the host status will get
418        # set appropriately as a fallout of the hqe's on_pending. If
419        # we don't call on_pending, it can only be because:
420        #   1. This task was not successful:
421        #       a. Another repair is queued: this repair job will set the host
422        #       status, and it will remain in 'Provisioning' till then.
423        #       b. We have hit the max_repair_limit: in which case the host
424        #       status is set to 'RepairFailed' in the epilog of PreJobTask.
425        #   2. The task was successful, but there are other special tasks:
426        #      Those special tasks will set the host status appropriately.
427        if self._should_pending():
428            self.queue_entry.on_pending()
429
430
class RepairTask(agent_task.SpecialAgentTask):
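    """Special task to repair a host; can run before or after a job."""
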
    TASK_TYPE = models.SpecialTask.Task.REPAIR


    def __init__(self, task):
        """
        queue_entry: queue entry to mark failed if this repair fails.
        """
        protection = host_protections.Protection.get_string(
                task.host.protection)
        # normalize the protection name
        protection = host_protections.Protection.get_attr_name(protection)

        args = ['-R', '--host-protection', protection]
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))

        super(RepairTask, self).__init__(task, args)

        # *don't* include the queue entry in IDs -- if the queue entry is
        # aborted, we want to leave the repair task running
        self._set_ids(host=self.host)


    def prolog(self):
        super(RepairTask, self).prolog()
        logging.info("repair_task starting")
        self.host.set_status(models.Host.Status.REPAIRING)


    def epilog(self):
        super(RepairTask, self).epilog()

        if self.success:
            self.host.set_status(models.Host.Status.READY)
        else:
            self.host.set_status(models.Host.Status.REPAIR_FAILED)
            if self.queue_entry:
                self._fail_queue_entry()
        self.host.record_working_state(bool(self.success),
                                       self.task.time_finished)