#pylint: disable-msg=C0111

"""
Prejob tasks.

Prejob tasks _usually_ run before a job and verify the state of a machine.
Cleanup and repair are exceptions; cleanup can run after a job too, while
repair will run anytime the host needs a repair, which could be pre or post
job. Most of the work specific to this module is achieved through the prolog
and epilog of each task.

All prejob tasks must have a host, though they may not have an HQE. If a
prejob task has an HQE, it will activate the HQE through its on_pending
method on successful completion. A row in afe_special_tasks with values:
    host=C1, unlocked, is_active=0, is_complete=0, type=Verify
will indicate to the scheduler that it needs to schedule a new special task
of type=Verify, against the C1 host. While the special task is running
the scheduler only monitors it through the Agent, and its is_active bit=1.
Once a special task finishes, we set its is_active=0, is_complete=1 and
success bits, so the scheduler ignores it.
HQE.on_pending:
    Host, HQE -> Pending, Starting
    This status is acted upon in the scheduler, to assign an AgentTask.
PreJobTask:
    epilog:
        failure:
            requeue hqe
            repair the host
Children PreJobTasks:
    prolog:
        set Host, HQE status
    epilog:
        success:
            on_pending
        failure:
            repair through PreJobTask
            set Host, HQE status

Failing a prejob task affects both the Host and the HQE, as follows:

- Host: A prejob failure will result in a Repair job getting queued against
the host, if we haven't already tried repairing it more than the
max_repair_limit. When this happens, the host will remain in whatever status
the prejob task left it in, till the Repair job puts it into 'Repairing'. This
way the host_scheduler won't pick bad hosts and assign them to jobs.

If we have already tried repairing the host too many times, the PreJobTask
will flip the host to 'RepairFailed' in its epilog, and it will remain in this
state till it is recovered and reverified.

- HQE: Is either requeued or failed. Requeuing the HQE involves putting it
in the Queued state and setting its host_id to None, so it gets a new host
in the next scheduler tick. Failing the HQE results in either a Parsing
or Archiving postjob task, and an eventual Failed status for the HQE.
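
As a rough illustration of the lifecycle above (a sketch only, using the same
Django ORM calls this module already relies on; the scheduler's real lookup
and bookkeeping live elsewhere, and 'C1' is just the example host from the
row above):

    # A waiting Verify task is simply an idle, unfinished afe_special_tasks
    # row against the host.
    waiting = models.SpecialTask.objects.filter(
            host__hostname='C1', is_active=False, is_complete=False,
            task=models.SpecialTask.Task.VERIFY)

    # While its Agent runs, the row is marked active; once it finishes, the
    # completion bits are flipped so the scheduler ignores it from then on.
    waiting.update(is_active=0, is_complete=1, success=1)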
55""" 56 57import logging 58import os 59 60from autotest_lib.client.common_lib import host_protections 61from autotest_lib.frontend.afe import models 62from autotest_lib.scheduler import agent_task, scheduler_config 63from autotest_lib.server import autoserv_utils 64from autotest_lib.server.cros import provision 65 66 67class PreJobTask(agent_task.SpecialAgentTask): 68 def _copy_to_results_repository(self): 69 if not self.queue_entry or self.queue_entry.meta_host: 70 return 71 72 self.queue_entry.set_execution_subdir() 73 log_name = os.path.basename(self.task.execution_path()) 74 source = os.path.join(self.task.execution_path(), 'debug', 75 'autoserv.DEBUG') 76 destination = os.path.join( 77 self.queue_entry.execution_path(), log_name) 78 79 self.monitor.try_copy_to_results_repository( 80 source, destination_path=destination) 81 82 83 def epilog(self): 84 super(PreJobTask, self).epilog() 85 86 if self.success: 87 return 88 89 if self.host.protection == host_protections.Protection.DO_NOT_VERIFY: 90 # effectively ignore failure for these hosts 91 self.success = True 92 return 93 94 if self.queue_entry: 95 # If we requeue a HQE, we should cancel any remaining pre-job 96 # tasks against this host, otherwise we'll be left in a state 97 # where a queued HQE has special tasks to run against a host. 98 models.SpecialTask.objects.filter( 99 queue_entry__id=self.queue_entry.id, 100 host__id=self.host.id, 101 is_complete=0).update(is_complete=1, success=0) 102 103 previous_provisions = models.SpecialTask.objects.filter( 104 task=models.SpecialTask.Task.PROVISION, 105 queue_entry_id=self.queue_entry.id).count() 106 if (previous_provisions > 107 scheduler_config.config.max_provision_retries): 108 self._actually_fail_queue_entry() 109 # This abort will mark the aborted bit on the HQE itself, to 110 # signify that we're killing it. Technically it also will do 111 # the recursive aborting of all child jobs, but that shouldn't 112 # matter here, as only suites have children, and those are 113 # hostless and thus don't have provisioning. 114 # TODO(milleral) http://crbug.com/188217 115 # However, we can't actually do this yet, as if we set the 116 # abort bit the FinalReparseTask will set the status of the HQE 117 # to ABORTED, which then means that we don't show the status in 118 # run_suite. So in the meantime, don't mark the HQE as 119 # aborted. 120 # queue_entry.abort() 121 else: 122 # requeue() must come after handling provision retries, since 123 # _actually_fail_queue_entry needs an execution subdir. 124 # We also don't want to requeue if we hit the provision retry 125 # limit, since then we overwrite the PARSING state of the HQE. 126 self.queue_entry.requeue() 127 128 # Limit the repair on a host when a prejob task fails, e.g., reset, 129 # verify etc. The number of repair jobs is limited to the specific 130 # HQE and host. 
            previous_repairs = models.SpecialTask.objects.filter(
                    task=models.SpecialTask.Task.REPAIR,
                    queue_entry_id=self.queue_entry.id,
                    host_id=self.queue_entry.host_id).count()
            if previous_repairs >= scheduler_config.config.max_repair_limit:
                self.host.set_status(models.Host.Status.REPAIR_FAILED)
                self._fail_queue_entry()
                return

            queue_entry = models.HostQueueEntry.objects.get(
                    id=self.queue_entry.id)
        else:
            queue_entry = None

        models.SpecialTask.objects.create(
                host=models.Host.objects.get(id=self.host.id),
                task=models.SpecialTask.Task.REPAIR,
                queue_entry=queue_entry,
                requested_by=self.task.requested_by)


    def _should_pending(self):
        """
        Decide if we should call the host queue entry's on_pending method.
        We should if:
        1) There exists an associated host queue entry.
        2) The current special task completed successfully.
        3) There do not exist any more special tasks to be run before the
           host queue entry starts.

        @returns: True if we should call pending, false if not.

        """
        if not self.queue_entry or not self.success:
            return False

        # We know if this is the last one when we create it, so we could add
        # another column to the database to keep track of this information, but
        # I expect the overhead of querying here to be minimal.
        queue_entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
        queued = models.SpecialTask.objects.filter(
                host__id=self.host.id, is_active=False,
                is_complete=False, queue_entry=queue_entry)
        queued = queued.exclude(id=self.task.id)
        return queued.count() == 0


class VerifyTask(PreJobTask):
    TASK_TYPE = models.SpecialTask.Task.VERIFY


    def __init__(self, task):
        args = ['-v']
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))
        super(VerifyTask, self).__init__(task, args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        super(VerifyTask, self).prolog()

        logging.info("starting verify on %s", self.host.hostname)
        if self.queue_entry:
            self.queue_entry.set_status(models.HostQueueEntry.Status.VERIFYING)
        self.host.set_status(models.Host.Status.VERIFYING)

        # Delete any queued manual reverifies for this host. One verify will do
        # and there's no need to keep records of other requests.
        self.remove_special_tasks(models.SpecialTask.Task.VERIFY,
                                  keep_last_one=True)


    def epilog(self):
        super(VerifyTask, self).epilog()
        if self.success:
            if self._should_pending():
                self.queue_entry.on_pending()
            else:
                self.host.set_status(models.Host.Status.READY)


class CleanupTask(PreJobTask):
    # note this can also run post-job, but when it does, it's running
    # standalone against the host (not related to the job), so it's not
    # considered a PostJobTask

    TASK_TYPE = models.SpecialTask.Task.CLEANUP


    def __init__(self, task, recover_run_monitor=None):
        args = ['--cleanup']
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))
        super(CleanupTask, self).__init__(task, args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        super(CleanupTask, self).prolog()
        logging.info("starting cleanup task for host: %s", self.host.hostname)
        self.host.set_status(models.Host.Status.CLEANING)
        if self.queue_entry:
            self.queue_entry.set_status(models.HostQueueEntry.Status.CLEANING)


    def _finish_epilog(self):
        if not self.queue_entry or not self.success:
            return

        do_not_verify_protection = host_protections.Protection.DO_NOT_VERIFY
        should_run_verify = (
                self.queue_entry.job.run_verify
                and self.host.protection != do_not_verify_protection)
        if should_run_verify:
            entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
            models.SpecialTask.objects.create(
                    host=models.Host.objects.get(id=self.host.id),
                    queue_entry=entry,
                    task=models.SpecialTask.Task.VERIFY)
        else:
            if self._should_pending():
                self.queue_entry.on_pending()


    def epilog(self):
        super(CleanupTask, self).epilog()

        if self.success:
            self.host.update_field('dirty', 0)
            self.host.set_status(models.Host.Status.READY)

        self._finish_epilog()


class ResetTask(PreJobTask):
    """Task to reset a DUT, including cleanup and verify."""
    # note this can also run post-job, but when it does, it's running
    # standalone against the host (not related to the job), so it's not
    # considered a PostJobTask

    TASK_TYPE = models.SpecialTask.Task.RESET


    def __init__(self, task, recover_run_monitor=None):
        args = ['--reset']
        if task.queue_entry:
            args.extend(self._generate_autoserv_label_args(task))
        super(ResetTask, self).__init__(task, args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        super(ResetTask, self).prolog()
        logging.info('starting reset task for host: %s',
                     self.host.hostname)
        self.host.set_status(models.Host.Status.RESETTING)
        if self.queue_entry:
            self.queue_entry.set_status(models.HostQueueEntry.Status.RESETTING)

        # Delete any queued cleanups for this host.
        self.remove_special_tasks(models.SpecialTask.Task.CLEANUP,
                                  keep_last_one=False)

        # Delete any queued reverifies for this host.
        self.remove_special_tasks(models.SpecialTask.Task.VERIFY,
                                  keep_last_one=False)

        # Only one reset is needed.
        self.remove_special_tasks(models.SpecialTask.Task.RESET,
                                  keep_last_one=True)


    def epilog(self):
        super(ResetTask, self).epilog()

        if self.success:
            self.host.update_field('dirty', 0)

            if self._should_pending():
                self.queue_entry.on_pending()
            else:
                self.host.set_status(models.Host.Status.READY)


class ProvisionTask(PreJobTask):
    TASK_TYPE = models.SpecialTask.Task.PROVISION

    def __init__(self, task):
        # Provisioning requires that we be associated with a job/queue entry
        assert task.queue_entry, "No HQE associated with provision task!"
        # task.queue_entry is an afe model HostQueueEntry object.
        # self.queue_entry is a scheduler models HostQueueEntry object, but
        # it gets constructed and assigned in __init__, so it's not available
        # yet. Therefore, we're stuck pulling labels off of the afe model
        # so that we can pass the --provision args into the __init__ call.
        labels = {x.name for x in task.queue_entry.job.labels}
        _, provisionable = provision.filter_labels(labels)
        extra_command_args = ['--provision',
                              '--job-labels', ','.join(provisionable)]
        super(ProvisionTask, self).__init__(task, extra_command_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def _command_line(self):
        # If we give queue_entry to _autoserv_command_line, then it will append
        # -c for this invocation if the queue_entry is a client side test. We
        # don't want that, as it messes with provisioning, so we just drop it
        # from the arguments here.
        # Note that we also don't verify job_repo_url as provisioning tasks are
        # required to stage whatever content we need, and the job itself will
        # force autotest to be staged if it isn't already.
        return autoserv_utils._autoserv_command_line(self.host.hostname,
                                                     self._extra_command_args,
                                                     in_lab=True)


    def prolog(self):
        super(ProvisionTask, self).prolog()
        # TODO: check for a previous provision task and abort if one exists.
        logging.info("starting provision task for host: %s",
                     self.host.hostname)
        self.queue_entry.set_status(
                models.HostQueueEntry.Status.PROVISIONING)
        self.host.set_status(models.Host.Status.PROVISIONING)


    def epilog(self):
        super(ProvisionTask, self).epilog()

        # If we were not successful in provisioning the machine, leave the
        # DUT in whatever status the PreJobTask's epilog set. If this task
        # was successful, the host status will get set appropriately as a
        # fallout of the HQE's on_pending. If we don't call on_pending, it
        # can only be because:
        # 1. This task was not successful:
        #    a. Another repair is queued: this repair job will set the host
        #       status, and it will remain in 'Provisioning' till then.
        #    b. We have hit the max_repair_limit: in which case the host
        #       status is set to 'RepairFailed' in the epilog of PreJobTask.
        # 2. The task was successful, but there are other special tasks:
        #    Those special tasks will set the host status appropriately.
        if self._should_pending():
            self.queue_entry.on_pending()


class RepairTask(agent_task.SpecialAgentTask):
    TASK_TYPE = models.SpecialTask.Task.REPAIR


    def __init__(self, task):
        """\
        queue_entry: queue entry to mark failed if this repair fails.
383 """ 384 protection = host_protections.Protection.get_string( 385 task.host.protection) 386 # normalize the protection name 387 protection = host_protections.Protection.get_attr_name(protection) 388 389 args = ['-R', '--host-protection', protection] 390 if task.queue_entry: 391 args.extend(self._generate_autoserv_label_args(task)) 392 393 super(RepairTask, self).__init__(task, args) 394 395 # *don't* include the queue entry in IDs -- if the queue entry is 396 # aborted, we want to leave the repair task running 397 self._set_ids(host=self.host) 398 399 400 def prolog(self): 401 super(RepairTask, self).prolog() 402 logging.info("repair_task starting") 403 self.host.set_status(models.Host.Status.REPAIRING) 404 405 406 def epilog(self): 407 super(RepairTask, self).epilog() 408 409 if self.success: 410 self.host.set_status(models.Host.Status.READY) 411 else: 412 self.host.set_status(models.Host.Status.REPAIR_FAILED) 413 if self.queue_entry: 414 self._fail_queue_entry() 415