1#pylint: disable=C0111
2
3"""
4Prejob tasks.
5
6Prejob tasks _usually_ run before a job and verify the state of a machine.
7Cleanup and repair are exceptions, cleanup can run after a job too, while
8repair will run anytime the host needs a repair, which could be pre or post
9job. Most of the work specific to this module is achieved through the prolog
10and epilog of each task.
11
12All prejob tasks must have a host, though they may not have an HQE. If a
13prejob task has a hqe, it will activate the hqe through its on_pending
14method on successful completion. A row in afe_special_tasks with values:
15    host=C1, unlocked, is_active=0, is_complete=0, type=Verify
16will indicate to the scheduler that it needs to schedule a new special task
17of type=Verify, against the C1 host. While the special task is running
18the scheduler only monitors it through the Agent, and its is_active bit=1.
19Once a special task finishes, we set its is_active=0, is_complete=1 and
20success bits, so the scheduler ignores it.
21HQE.on_pending:
22    Host, HQE -> Pending, Starting
23    This status is acted upon in the scheduler, to assign an AgentTask.
24PreJobTask:
25    epilog:
26        failure:
27            requeue hqe
28            repair the host
29Children PreJobTasks:
30    prolog:
31        set Host, HQE status
32    epilog:
33        success:
34            on_pending
35        failure:
            repair through PreJobTask
37            set Host, HQE status
38
Failing a prejob task affects both the Host and the HQE, as follows:
40
41- Host: PreJob failure will result in a Repair job getting queued against
the host, if we haven't already tried repairing it more than the
43max_repair_limit. When this happens, the host will remain in whatever status
44the prejob task left it in, till the Repair job puts it into 'Repairing'. This
45way the host_scheduler won't pick bad hosts and assign them to jobs.
46
47If we have already tried repairing the host too many times, the PreJobTask
48will flip the host to 'RepairFailed' in its epilog, and it will remain in this
49state till it is recovered and reverified.
50
51- HQE: Is either requeued or failed. Requeuing the HQE involves putting it
52in the Queued state and setting its host_id to None, so it gets a new host
53in the next scheduler tick. Failing the HQE results in either a Parsing
54or Archiving postjob task, and an eventual Failed status for the HQE.
55"""
56
57import logging
58import re
59
60from autotest_lib.client.common_lib import host_protections
61from autotest_lib.frontend.afe import models
62from autotest_lib.scheduler import agent_task
63from autotest_lib.scheduler import drone_manager
64from autotest_lib.scheduler import scheduler_config
65from autotest_lib.server import autoserv_utils
66from autotest_lib.server.cros import provision
67
68
class PreJobTask(agent_task.SpecialAgentTask):
    """Shared failure handling for special tasks that run ahead of a job.

    The epilog defined here requeues or fails the associated HQE (if any)
    and queues a Repair special task against the host when the task failed.
    """

    def epilog(self):
        """Record a successful run, or handle the HQE and queue a repair.

        Hosts protected with DO_NOT_VERIFY are always treated as successful.
        On failure with an HQE, the HQE is failed once the provision retry
        limit is exceeded and requeued otherwise. When the repair limit for
        this HQE/host pair has been reached the host is set to REPAIR_FAILED,
        the HQE is failed and no further repair is queued.
        """
        super(PreJobTask, self).epilog()

        do_not_verify = host_protections.Protection.DO_NOT_VERIFY
        if self.host.protection == do_not_verify:
            # Failures are effectively ignored for these hosts.
            self.success = True

        if self.success:
            self.host.record_working_state(True, self.task.time_finished)
            return

        repair_entry = None
        if self.queue_entry:
            special_tasks = models.SpecialTask.objects

            # Requeuing a HQE means any remaining pre-job tasks against this
            # host have to be cancelled; otherwise a queued HQE would still
            # have special tasks waiting to run against a host.
            unfinished = special_tasks.filter(
                    queue_entry__id=self.queue_entry.id,
                    host__id=self.host.id,
                    is_complete=0)
            unfinished.update(is_complete=1, success=0)

            provision_attempts = special_tasks.filter(
                    task=models.SpecialTask.Task.PROVISION,
                    queue_entry_id=self.queue_entry.id).count()
            retry_limit = scheduler_config.config.max_provision_retries
            if provision_attempts > retry_limit:
                self._actually_fail_queue_entry()
                # An abort here would mark the aborted bit on the HQE itself,
                # to signify that we're killing it.  Technically it would also
                # recursively abort all child jobs, but that shouldn't matter
                # here, as only suites have children, and those are hostless
                # and thus don't have provisioning.
                # TODO(milleral) http://crbug.com/188217
                # However, we can't actually do this yet: if the abort bit is
                # set, the FinalReparseTask will set the status of the HQE to
                # ABORTED, which means the status is not shown in run_suite.
                # So in the meantime, don't mark the HQE as aborted.
                # queue_entry.abort()
            else:
                # requeue() has to come after the provision retry handling,
                # since _actually_fail_queue_entry needs an execution subdir.
                # We also must not requeue once the provision retry limit is
                # hit, since that would overwrite the PARSING state of the
                # HQE.
                self.queue_entry.requeue()

            # Cap the repairs triggered by failing prejob tasks (reset,
            # verify, ...).  The count is specific to this HQE and host.
            repair_attempts = special_tasks.filter(
                    task=models.SpecialTask.Task.REPAIR,
                    queue_entry_id=self.queue_entry.id,
                    host_id=self.queue_entry.host_id).count()
            if repair_attempts >= scheduler_config.config.max_repair_limit:
                self.host.set_status(models.Host.Status.REPAIR_FAILED)
                self._fail_queue_entry()
                return

            repair_entry = models.HostQueueEntry.objects.get(
                    id=self.queue_entry.id)

        repair_host = models.Host.objects.get(id=self.host.id)
        models.SpecialTask.objects.create(
                host=repair_host,
                task=models.SpecialTask.Task.REPAIR,
                queue_entry=repair_entry,
                requested_by=self.task.requested_by)


    def _should_pending(self):
        """
        Tell whether the host queue entry's on_pending method should be called.

        That is the case when all of the following hold:
        1) This task has an associated host queue entry.
        2) This special task completed successfully.
        3) No other special task is still waiting to run for the same host
           and host queue entry.

        @returns: True if on_pending should be called, False otherwise.

        """
        if not (self.queue_entry and self.success):
            return False

        # Whether a task is the last one is known when it is created, so an
        # extra database column could track it; the overhead of querying here
        # is expected to be minimal though.
        afe_entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
        still_waiting = models.SpecialTask.objects.filter(
                host__id=self.host.id,
                is_active=False,
                is_complete=False,
                queue_entry=afe_entry).exclude(id=self.task.id)
        return still_waiting.count() == 0
164
165
class VerifyTask(PreJobTask):
    """Pre-job task of type VERIFY, run with the ``-v`` command argument."""

    TASK_TYPE = models.SpecialTask.Task.VERIFY


    def __init__(self, task):
        """
        @param task: special task to run.  When it has a queue entry, the
                autoserv label arguments generated for it are appended to
                the command arguments.
        """
        command_args = ['-v']
        if task.queue_entry:
            command_args += self._generate_autoserv_label_args(task)
        super(VerifyTask, self).__init__(task, command_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Move the HQE (if any) and the host into the VERIFYING status."""
        super(VerifyTask, self).prolog()

        logging.info("starting verify on %s", self.host.hostname)
        entry = self.queue_entry
        if entry:
            entry.set_status(models.HostQueueEntry.Status.VERIFYING)
        self.host.set_status(models.Host.Status.VERIFYING)

        # Drop queued manual reverifies for this host: a single verify is
        # enough, and the other requests need not be kept on record.
        self.remove_special_tasks(models.SpecialTask.Task.VERIFY,
                                  keep_last_one=True)


    def epilog(self):
        """On success, either activate the HQE or mark the host READY."""
        super(VerifyTask, self).epilog()
        if not self.success:
            return

        if self._should_pending():
            self.queue_entry.on_pending()
            return
        self.host.set_status(models.Host.Status.READY)
199
200
class CleanupTask(PreJobTask):
    """Special task of type CLEANUP, run with the ``--cleanup`` argument.

    Note this can also run post-job, but when it does, it's running
    standalone against the host (not related to the job), so it's not
    considered a PostJobTask.
    """

    TASK_TYPE = models.SpecialTask.Task.CLEANUP


    def __init__(self, task, recover_run_monitor=None):
        """
        @param task: special task to run.  When it has a queue entry, the
                autoserv label arguments generated for it are appended to
                the command arguments.
        @param recover_run_monitor: accepted but not used by this constructor.
        """
        command_args = ['--cleanup']
        if task.queue_entry:
            command_args += self._generate_autoserv_label_args(task)
        super(CleanupTask, self).__init__(task, command_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Move the host and the HQE (if any) into the CLEANING status."""
        super(CleanupTask, self).prolog()
        logging.info("starting cleanup task for host: %s", self.host.hostname)
        self.host.set_status(models.Host.Status.CLEANING)
        entry = self.queue_entry
        if entry:
            entry.set_status(models.HostQueueEntry.Status.CLEANING)


    def _finish_epilog(self):
        """Queue a follow-up verify for the HQE, or activate the HQE.

        Does nothing when there is no HQE or the cleanup failed.  A VERIFY
        special task is created when the job asks for verification and the
        host is not protected with DO_NOT_VERIFY; otherwise on_pending is
        called if no other special task is outstanding.
        """
        if not (self.queue_entry and self.success):
            return

        do_not_verify = host_protections.Protection.DO_NOT_VERIFY
        needs_verify = (self.queue_entry.job.run_verify
                        and self.host.protection != do_not_verify)
        if not needs_verify:
            if self._should_pending():
                self.queue_entry.on_pending()
            return

        afe_entry = models.HostQueueEntry.objects.get(id=self.queue_entry.id)
        afe_host = models.Host.objects.get(id=self.host.id)
        models.SpecialTask.objects.create(
                host=afe_host,
                queue_entry=afe_entry,
                task=models.SpecialTask.Task.VERIFY)


    def epilog(self):
        """On success clear the dirty flag and mark the host READY."""
        super(CleanupTask, self).epilog()

        if self.success:
            self.host.update_field('dirty', 0)
            self.host.set_status(models.Host.Status.READY)

        self._finish_epilog()
252
253
class ResetTask(PreJobTask):
    """Task to reset a DUT, including cleanup and verify.

    Note this can also run post-job, but when it does, it's running
    standalone against the host (not related to the job), so it's not
    considered a PostJobTask.
    """

    TASK_TYPE = models.SpecialTask.Task.RESET


    def __init__(self, task, recover_run_monitor=None):
        """
        @param task: special task to run.  When it has a queue entry, the
                autoserv label arguments generated for it are appended to
                the command arguments.
        @param recover_run_monitor: accepted but not used by this constructor.
        """
        command_args = ['--reset']
        if task.queue_entry:
            command_args += self._generate_autoserv_label_args(task)
        super(ResetTask, self).__init__(task, command_args)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def prolog(self):
        """Set the RESETTING status and drop special tasks a reset covers."""
        super(ResetTask, self).prolog()
        logging.info('starting reset task for host: %s',
                     self.host.hostname)
        self.host.set_status(models.Host.Status.RESETTING)
        entry = self.queue_entry
        if entry:
            entry.set_status(models.HostQueueEntry.Status.RESETTING)

        task_types = models.SpecialTask.Task

        # Queued cleanups for this host are no longer needed.
        self.remove_special_tasks(task_types.CLEANUP, keep_last_one=False)

        # Neither are queued reverifies for this host.
        self.remove_special_tasks(task_types.VERIFY, keep_last_one=False)

        # A single reset is enough.
        self.remove_special_tasks(task_types.RESET, keep_last_one=True)


    def epilog(self):
        """On success clear the dirty flag, then activate the HQE or mark
        the host READY."""
        super(ResetTask, self).epilog()

        if not self.success:
            return

        self.host.update_field('dirty', 0)

        if self._should_pending():
            self.queue_entry.on_pending()
            return
        self.host.set_status(models.Host.Status.READY)
302
303
304# TODO (ayatane): Refactor using server/cros/provision
def _is_cros_version(label):
    """Tell whether a label is a cros-version label.

    @param label: label name string.

    @returns: True if the label starts with 'cros-version:', else False.
    """
    version_prefix = 'cros-version:'
    return label.startswith(version_prefix)
308
309
310# TODO (ayatane): Refactor using server/cros/provision
def _get_cros_version(label):
    """Strip the leading 'cros-version:' from a cros-version label.

    The label is not checked for the prefix; the first
    len('cros-version:') characters are simply dropped.

    @param label: label name string.

    @returns: The remainder of the label after the prefix length.
    """
    prefix_length = len('cros-version:')
    return label[prefix_length:]
314
315
316# TODO (ayatane): Refactor into server/cros/provision
class _CrosImage(object):
    """The name of a CrOS image, split into its components.

    A valid name has the form ``<group>/<milestone>-<version>[-rcN]``, for
    example ``lumpy-release/R27-3773.0.0``.  After construction the parsed
    parts are available as the ``group``, ``milestone``, ``version`` and
    ``rc`` attributes (``rc`` is None when the name has no rc suffix).
    """

    # The pattern is anchored with \Z rather than $: $ also matches just
    # before a trailing newline, which would let a name such as
    # 'lumpy-release/R27-3773.0.0\n' pass validation.
    _name_pattern = re.compile(
        r'^'
        r'(?P<group>[a-z0-9-]+)'
        r'/'
        r'(?P<milestone>LATEST|R[0-9]+)'
        r'-'
        r'(?P<version>[0-9.]+)'
        r'(-(?P<rc>rc[0-9]+))?'
        r'\Z'
    )

    def __init__(self, name):
        """Initialize instance.

        @param name: Image name string (lumpy-release/R27-3773.0.0)

        @raises ValueError: if the name does not match the expected form.
        """
        self._name = name
        match = self._name_pattern.search(name)
        if match is None:
            raise ValueError('Invalid CrOS image name: %r' % name)
        self.group = match.group('group')
        self.milestone = match.group('milestone')
        self.version = match.group('version')
        self.rc = match.group('rc')

    def __repr__(self):
        return '{cls}({name!r})'.format(cls=type(self).__name__,
                                        name=self._name)

    def __str__(self):
        return self._name
351
352
class ProvisionTask(PreJobTask):
    """Pre-job task of type PROVISION; it requires an associated HQE."""

    TASK_TYPE = models.SpecialTask.Task.PROVISION

    def __init__(self, task):
        """
        @param task: special task to run; it must have a queue entry, whose
                job labels determine the --job-labels argument.
        """
        # Provisioning requires that we be associated with a job/queue entry
        assert task.queue_entry, "No HQE associated with provision task!"
        # task.queue_entry is an afe model HostQueueEntry object, whereas
        # self.queue_entry is a scheduler models HostQueueEntry object that
        # only gets constructed and assigned in __init__, so it cannot be
        # used yet.  The labels are therefore pulled off of the afe model so
        # the --provision args can be handed to the __init__ call.
        job_labels = set(label.name for label in task.queue_entry.job.labels)
        _, to_provision = provision.Provision.partition(job_labels)
        joined_labels = ','.join(to_provision)
        super(ProvisionTask, self).__init__(
                task, ['--provision', '--job-labels', joined_labels])
        self._set_milestone(job_labels)
        self._set_ids(host=self.host, queue_entries=[self.queue_entry])


    def _set_milestone(self, labels):
        """Set build milestone from the labels.

        Only the first cros-version label encountered is considered.  If it
        cannot be parsed the milestone is set to 'N/A'.  When there is no
        cros-version label at all, self._milestone is not assigned here.

        @param labels: iterable of labels.
        """
        version_label = next(
                (label for label in labels if _is_cros_version(label)), None)
        if version_label is None:
            return

        try:
            image = _CrosImage(_get_cros_version(version_label))
        except ValueError as e:
            logging.warning('Could not parse cros-version. Error msg: %s', e)
            self._milestone = 'N/A'
            return
        self._milestone = image.milestone


    def _command_line(self):
        """Build the autoserv command line for this provision run.

        @returns: whatever autoserv_utils.autoserv_run_job_command returns.
        """
        # Handing queue_entry to autoserv_run_job_command would make it
        # append -c for this invocation if the queue_entry is a client side
        # test.  That messes with provisioning, so the queue entry is simply
        # left out of the arguments here.
        # Note that job_repo_url is not verified either: provisioning tasks
        # are required to stage whatever content we need, and the job itself
        # will force autotest to be staged if it isn't already.
        autoserv_dir = autoserv_utils.autoserv_directory
        hostname = self.host.hostname
        return autoserv_utils.autoserv_run_job_command(
                autoserv_dir,
                hostname,
                results_directory=drone_manager.WORKING_DIRECTORY,
                extra_args=self._extra_command_args,
                in_lab=True)

    def prolog(self):
        """Move the HQE and the host into the PROVISIONING status."""
        super(ProvisionTask, self).prolog()
        # add check for previous provision task and abort if exist.
        logging.info("starting provision task for host: %s", self.host.hostname)
        provisioning_hqe = models.HostQueueEntry.Status.PROVISIONING
        self.queue_entry.set_status(provisioning_hqe)
        self.host.set_status(models.Host.Status.PROVISIONING)


    def epilog(self):
        """Activate the HQE when nothing else is left to run before it."""
        super(ProvisionTask, self).epilog()

        # A failed provision leaves the DUT in whatever status the
        # PreJobTask epilog put it in.  After a successful one the host
        # status gets set appropriately as a fallout of the hqe's
        # on_pending.  If on_pending is not called, it can only be because:
        #   1. This task was not successful:
        #       a. Another repair is queued: this repair job will set the host
        #       status, and it will remain in 'Provisioning' till then.
        #       b. We have hit the max_repair_limit: in which case the host
        #       status is set to 'RepairFailed' in the epilog of PreJobTask.
        #   2. The task was successful, but there are other special tasks:
        #      Those special tasks will set the host status appropriately.
        if not self._should_pending():
            return
        self.queue_entry.on_pending()
434
435
class RepairTask(agent_task.SpecialAgentTask):
    """Special task of type REPAIR, run with ``-R`` and the host protection."""

    TASK_TYPE = models.SpecialTask.Task.REPAIR


    def __init__(self, task):
        """
        @param task: special task to run.  If it has a queue entry, that
                entry is marked failed when the repair fails, and the
                autoserv label arguments generated for it are appended to
                the command arguments.
        """
        protection_names = host_protections.Protection
        # get_attr_name normalizes the protection name.
        protection = protection_names.get_attr_name(
                protection_names.get_string(task.host.protection))

        command_args = ['-R', '--host-protection', protection]
        if task.queue_entry:
            command_args += self._generate_autoserv_label_args(task)

        super(RepairTask, self).__init__(task, command_args)

        # The queue entry is deliberately *not* part of the IDs: if the
        # queue entry is aborted, the repair task should be left running.
        self._set_ids(host=self.host)


    def prolog(self):
        """Move the host into the REPAIRING status."""
        super(RepairTask, self).prolog()
        logging.info("repair_task starting")
        self.host.set_status(models.Host.Status.REPAIRING)


    def epilog(self):
        """Set the final host status and record the working state.

        A failed repair marks the host REPAIR_FAILED and fails the queue
        entry, if there is one; a successful repair marks the host READY.
        """
        super(RepairTask, self).epilog()

        if not self.success:
            self.host.set_status(models.Host.Status.REPAIR_FAILED)
            if self.queue_entry:
                self._fail_queue_entry()
        else:
            self.host.set_status(models.Host.Status.READY)

        working = bool(self.success)
        self.host.record_working_state(working, self.task.time_finished)
477