#pylint: disable=C0111

"""
Prejob tasks.

Prejob tasks _usually_ run before a job and verify the state of a machine.
Cleanup and repair are exceptions, cleanup can run after a job too, while
repair will run anytime the host needs a repair, which could be pre or post
job. Most of the work specific to this module is achieved through the prolog
and epilog of each task.

All prejob tasks must have a host, though they may not have an HQE. If a
prejob task has a hqe, it will activate the hqe through its on_pending
method on successful completion. A row in afe_special_tasks with values:
    host=C1, unlocked, is_active=0, is_complete=0, type=Verify
will indicate to the scheduler that it needs to schedule a new special task
of type=Verify, against the C1 host. While the special task is running
the scheduler only monitors it through the Agent, and its is_active bit=1.
Once a special task finishes, we set its is_active=0, is_complete=1 and
success bits, so the scheduler ignores it.
HQE.on_pending:
    Host, HQE -> Pending, Starting
    This status is acted upon in the scheduler, to assign an AgentTask.
PreJobTask:
    epilog:
        failure:
            requeue hqe
            repair the host
Children PreJobTasks:
    prolog:
        set Host, HQE status
    epilog:
        success:
            on_pending
        failure:
            repair through PreJobTask
            set Host, HQE status

Failing a prejob task affects both the Host and the HQE, as follows:

- Host: PreJob failure will result in a Repair job getting queued against
the host, if we haven't already tried repairing it more than the
max_repair_limit. When this happens, the host will remain in whatever status
the prejob task left it in, till the Repair job puts it into 'Repairing'. This
way the host_scheduler won't pick bad hosts and assign them to jobs.

If we have already tried repairing the host too many times, the PreJobTask
will flip the host to 'RepairFailed' in its epilog, and it will remain in this
state till it is recovered and reverified.

- HQE: Is either requeued or failed. Requeuing the HQE involves putting it
in the Queued state and setting its host_id to None, so it gets a new host
in the next scheduler tick. Failing the HQE results in either a Parsing
or Archiving postjob task, and an eventual Failed status for the HQE.
"""

import logging
import re

from autotest_lib.client.common_lib import host_protections
from autotest_lib.frontend.afe import models
from autotest_lib.scheduler import agent_task, scheduler_config
from autotest_lib.server import autoserv_utils
from autotest_lib.server.cros import provision


class PreJobTask(agent_task.SpecialAgentTask):
    """Base class for special tasks that run before a job.

    Provides the shared failure handling: on failure, cancel remaining
    pre-job tasks, requeue or fail the HQE (subject to the provision retry
    limit), and queue a Repair task against the host (subject to the
    repair limit).
    """

    def epilog(self):
        """Handle completion of the task.

        On success, just record the host as working. On failure, clean up
        the HQE's outstanding special tasks, requeue or fail the HQE, and
        schedule a Repair unless the host has exceeded max_repair_limit
        (in which case it is flipped to RepairFailed).
        """
        super(PreJobTask, self).epilog()

        if self.host.protection == host_protections.Protection.DO_NOT_VERIFY:
            # effectively ignore failure for these hosts
            self.success = True

        if self.success:
            self.host.record_working_state(True,
                                           self.task.time_finished)
            return

        if self.queue_entry:
            # If we requeue a HQE, we should cancel any remaining pre-job
            # tasks against this host, otherwise we'll be left in a state
            # where a queued HQE has special tasks to run against a host.
            models.SpecialTask.objects.filter(
                    queue_entry__id=self.queue_entry.id,
                    host__id=self.host.id,
                    is_complete=0).update(is_complete=1, success=0)

            previous_provisions = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.PROVISION,
                    queue_entry_id=self.queue_entry.id).count()
            if (previous_provisions >
                scheduler_config.config.max_provision_retries):
                self._actually_fail_queue_entry()
                # This abort will mark the aborted bit on the HQE itself, to
                # signify that we're killing it.  Technically it also will do
                # the recursive aborting of all child jobs, but that shouldn't
                # matter here, as only suites have children, and those are
                # hostless and thus don't have provisioning.
                # TODO(milleral) http://crbug.com/188217
                # However, we can't actually do this yet, as if we set the
                # abort bit the FinalReparseTask will set the status of the
                # HQE to ABORTED, which then means that we don't show the
                # status in run_suite.  So in the meantime, don't mark the
                # HQE as aborted.
                # queue_entry.abort()
            else:
                # requeue() must come after handling provision retries, since
                # _actually_fail_queue_entry needs an execution subdir.
                # We also don't want to requeue if we hit the provision retry
                # limit, since then we overwrite the PARSING state of the HQE.
                self.queue_entry.requeue()

            # Limit the repair on a host when a prejob task fails, e.g.,
            # reset, verify etc. The number of repair jobs is limited to the
            # specific HQE and host.
            previous_repairs = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.REPAIR,
                    queue_entry_id=self.queue_entry.id,
                    host_id=self.queue_entry.host_id).count()
            if previous_repairs >= scheduler_config.config.max_repair_limit:
                self.host.set_status(models.Host.Status.REPAIR_FAILED)
                self._fail_queue_entry()
                return

            queue_entry = models.HostQueueEntry.objects.get(
                    id=self.queue_entry.id)
        else:
            queue_entry = None

        models.SpecialTask.objects.create(
                host=models.Host.objects.get(id=self.host.id),
                task=models.SpecialTask.Task.REPAIR,
                queue_entry=queue_entry,
                requested_by=self.task.requested_by)


    def _should_pending(self):
        """
        Decide if we should call the host queue entry's on_pending method.
        We should if:
        1) There exists an associated host queue entry.
        2) The current special task completed successfully.
        3) There do not exist any more special tasks to be run before the
           host queue entry starts.

        @returns: True if we should call pending, false if not.

        """
        if not self.queue_entry or not self.success:
            return False

        # We know if this is the last one when we create it, so we could add
        # another column to the database to keep track of this information,
        # but I expect the overhead of querying here to be minimal.
        queue_entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
        queued = models.SpecialTask.objects.filter(
                host__id=self.host.id, is_active=False,
                is_complete=False, queue_entry=queue_entry)
        queued = queued.exclude(id=self.task.id)
        return queued.count() == 0


class VerifyTask(PreJobTask):
    """Task to verify that a host is in a working state before a job."""

    TASK_TYPE = models.SpecialTask.Task.VERIFY


    def __init__(self, task):
        args = ['-v']
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))
        super(VerifyTask, self).__init__(task, args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        super(VerifyTask, self).prolog()

        logging.info("starting verify on %s", self.host.hostname)
        if self.queue_entry:
            self.queue_entry.set_status(models.HostQueueEntry.Status.VERIFYING)
        self.host.set_status(models.Host.Status.VERIFYING)

        # Delete any queued manual reverifies for this host. One verify will
        # do and there's no need to keep records of other requests.
        self.remove_special_tasks(models.SpecialTask.Task.VERIFY,
                                  keep_last_one=True)


    def epilog(self):
        super(VerifyTask, self).epilog()
        if self.success:
            if self._should_pending():
                self.queue_entry.on_pending()
            else:
                self.host.set_status(models.Host.Status.READY)


class CleanupTask(PreJobTask):
    """Task to clean up a host, optionally followed by a verify."""
    # note this can also run post-job, but when it does, it's running
    # standalone against the host (not related to the job), so it's not
    # considered a PostJobTask

    TASK_TYPE = models.SpecialTask.Task.CLEANUP


    def __init__(self, task, recover_run_monitor=None):
        args = ['--cleanup']
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))
        super(CleanupTask, self).__init__(task, args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        super(CleanupTask, self).prolog()
        logging.info("starting cleanup task for host: %s", self.host.hostname)
        self.host.set_status(models.Host.Status.CLEANING)
        if self.queue_entry:
            self.queue_entry.set_status(models.HostQueueEntry.Status.CLEANING)


    def _finish_epilog(self):
        """Queue a follow-up Verify, or go to pending, after a success.

        Only applies when there is an HQE and the cleanup succeeded; the
        job's run_verify flag and the host's protection level decide
        whether a Verify task is created.
        """
        if not self.queue_entry or not self.success:
            return

        do_not_verify_protection = host_protections.Protection.DO_NOT_VERIFY
        should_run_verify = (
                self.queue_entry.job.run_verify
                and self.host.protection != do_not_verify_protection)
        if should_run_verify:
            entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
            models.SpecialTask.objects.create(
                    host=models.Host.objects.get(id=self.host.id),
                    queue_entry=entry,
                    task=models.SpecialTask.Task.VERIFY)
        else:
            if self._should_pending():
                self.queue_entry.on_pending()


    def epilog(self):
        super(CleanupTask, self).epilog()

        if self.success:
            self.host.update_field('dirty', 0)
            self.host.set_status(models.Host.Status.READY)

        self._finish_epilog()


class ResetTask(PreJobTask):
    """Task to reset a DUT, including cleanup and verify."""
    # note this can also run post-job, but when it does, it's running
    # standalone against the host (not related to the job), so it's not
    # considered a PostJobTask

    TASK_TYPE = models.SpecialTask.Task.RESET


    def __init__(self, task, recover_run_monitor=None):
        args = ['--reset']
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))
        super(ResetTask, self).__init__(task, args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        super(ResetTask, self).prolog()
        logging.info('starting reset task for host: %s',
                     self.host.hostname)
        self.host.set_status(models.Host.Status.RESETTING)
        if self.queue_entry:
            self.queue_entry.set_status(models.HostQueueEntry.Status.RESETTING)

        # Delete any queued cleanups for this host.
        self.remove_special_tasks(models.SpecialTask.Task.CLEANUP,
                                  keep_last_one=False)

        # Delete any queued reverifies for this host.
        self.remove_special_tasks(models.SpecialTask.Task.VERIFY,
                                  keep_last_one=False)

        # Only one reset is needed.
        self.remove_special_tasks(models.SpecialTask.Task.RESET,
                                  keep_last_one=True)


    def epilog(self):
        super(ResetTask, self).epilog()

        if self.success:
            self.host.update_field('dirty', 0)

            if self._should_pending():
                self.queue_entry.on_pending()
            else:
                self.host.set_status(models.Host.Status.READY)


# TODO (ayatane): Refactor using server/cros/provision
def _is_cros_version(label):
    """Return whether the label is a cros-version: label."""
    return label.startswith('cros-version:')


# TODO (ayatane): Refactor using server/cros/provision
def _get_cros_version(label):
    """Return cros-version from cros-version label."""
    return label[len('cros-version:'):]


# TODO (ayatane): Refactor into server/cros/provision
class _CrosImage(object):
    """The name of a CrOS image."""

    _name_pattern = re.compile(
            r'^'
            r'(?P<group>[a-z0-9-]+)'
            r'/'
            r'(?P<milestone>LATEST|R[0-9]+)'
            r'-'
            r'(?P<version>[0-9.]+)'
            r'(-(?P<rc>rc[0-9]+))?'
            r'$'
    )

    def __init__(self, name):
        """Initialize instance.

        @param name: Image name string (lumpy-release/R27-3773.0.0)
        """
        self._name = name
        match = self._name_pattern.search(name)
        if match is None:
            raise ValueError('Invalid CrOS image name: %r' % name)
        self.group = match.group('group')
        self.milestone = match.group('milestone')
        self.version = match.group('version')
        self.rc = match.group('rc')

    def __repr__(self):
        return '{cls}({name!r})'.format(cls=type(self).__name__,
                                        name=self._name)

    def __str__(self):
        return self._name


class ProvisionTask(PreJobTask):
    """Task to provision a host with the job's provisionable labels."""

    TASK_TYPE = models.SpecialTask.Task.PROVISION

    def __init__(self, task):
        # Provisioning requires that we be associated with a job/queue entry
        assert task.queue_entry, "No HQE associated with provision task!"
        # task.queue_entry is an afe model HostQueueEntry object.
        # self.queue_entry is a scheduler models HostQueueEntry object, but
        # it gets constructed and assigned in __init__, so it's not available
        # yet. Therefore, we're stuck pulling labels off of the afe model
        # so that we can pass the --provision args into the __init__ call.
        labels = {x.name for x in task.queue_entry.job.labels}
        _, provisionable = provision.Provision.partition(labels)
        extra_command_args = ['--provision',
                              '--job-labels', ','.join(provisionable)]
        super(ProvisionTask, self).__init__(task, extra_command_args)
        self._set_milestone(labels)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def _set_milestone(self, labels):
        """Set build milestone from the labels.

        @param labels: iterable of labels.
        """
        # Default so self._milestone always exists, even if the job carries
        # no cros-version label at all (previously it was left unset in that
        # case, and later reads would raise AttributeError).
        self._milestone = 'N/A'
        labels = (label
                  for label in labels
                  if _is_cros_version(label))
        for label in labels:
            try:
                cros_image = _CrosImage(_get_cros_version(label))
            except ValueError as e:
                logging.warning('Could not parse cros-version. Error msg: %s', e)
                self._milestone = 'N/A'
            else:
                self._milestone = cros_image.milestone
                break


    def _command_line(self):
        # If we give queue_entry to _autoserv_command_line, then it will
        # append -c for this invocation if the queue_entry is a client side
        # test. We don't want that, as it messes with provisioning, so we
        # just drop it from the arguments here.
        # Note that we also don't verify job_repo_url as provisioning tasks
        # are required to stage whatever content we need, and the job itself
        # will force autotest to be staged if it isn't already.
        return autoserv_utils._autoserv_command_line(self.host.hostname,
                                                     self._extra_command_args,
                                                     in_lab=True)


    def prolog(self):
        super(ProvisionTask, self).prolog()
        # TODO: add check for previous provision task and abort if it exists.
        logging.info("starting provision task for host: %s",
                     self.host.hostname)
        self.queue_entry.set_status(
                models.HostQueueEntry.Status.PROVISIONING)
        self.host.set_status(models.Host.Status.PROVISIONING)


    def epilog(self):
        super(ProvisionTask, self).epilog()

        # If we were not successful in provisioning the machine
        # leave the DUT in whatever status was set in the PreJobTask's
        # epilog. If this task was successful the host status will get
        # set appropriately as a fallout of the hqe's on_pending. If
        # we don't call on_pending, it can only be because:
        #   1. This task was not successful:
        #       a. Another repair is queued: this repair job will set the host
        #          status, and it will remain in 'Provisioning' till then.
        #       b. We have hit the max_repair_limit: in which case the host
        #          status is set to 'RepairFailed' in the epilog of
        #          PreJobTask.
        #   2. The task was successful, but there are other special tasks:
        #      Those special tasks will set the host status appropriately.
        if self._should_pending():
            self.queue_entry.on_pending()


class RepairTask(agent_task.SpecialAgentTask):
    """Task to repair a host; may run either pre- or post-job."""

    TASK_TYPE = models.SpecialTask.Task.REPAIR


    def __init__(self, task):
        """\
        queue_entry: queue entry to mark failed if this repair fails.
        """
        protection = host_protections.Protection.get_string(
                task.host.protection)
        # normalize the protection name
        protection = host_protections.Protection.get_attr_name(protection)

        args = ['-R', '--host-protection', protection]
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))

        super(RepairTask, self).__init__(task, args)

        # *don't* include the queue entry in IDs -- if the queue entry is
        # aborted, we want to leave the repair task running
        self._set_ids(host=self.host)


    def prolog(self):
        super(RepairTask, self).prolog()
        logging.info("repair_task starting")
        self.host.set_status(models.Host.Status.REPAIRING)


    def epilog(self):
        super(RepairTask, self).epilog()

        if self.success:
            self.host.set_status(models.Host.Status.READY)
        else:
            self.host.set_status(models.Host.Status.REPAIR_FAILED)
            if self.queue_entry:
                self._fail_queue_entry()
        self.host.record_working_state(bool(self.success),
                                       self.task.time_finished)