# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class.  This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history.  At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa.  The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

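Example usage (an illustrative sketch only; `frontend.AFE` is one way
to obtain an AFE RPC client, and the hostname and time range shown are
hypothetical):

    import time
    from autotest_lib.server import frontend

    afe = frontend.AFE()
    end = int(time.time())
    start = end - 24 * 60 * 60
    history = HostJobHistory.get_host_history(
            afe, 'chromeos1-row1-rack1-host1', start, end)
    for event in history:
        print event.start_time, event.name, event.job_status
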
35"""
36
37import common
38import os
39from autotest_lib.frontend import setup_django_environment
40from django.db import models as django_models
41
42from autotest_lib.client.common_lib import global_config
43from autotest_lib.client.common_lib import utils
44from autotest_lib.client.common_lib import time_utils
45from autotest_lib.frontend.afe import models as afe_models
46from autotest_lib.server import constants
47
48
49# Values used to describe the diagnosis of a DUT.  These values are
50# used to indicate both DUT status after a job or task, and also
51# diagnosis of whether the DUT was working at the end of a given
52# time interval.
53#
54# UNUSED:  Used when there are no events recorded in a given
55#     time interval.
56# UNKNOWN:  For an individual event, indicates that the DUT status
57#     is unchanged from the previous event.  For a time interval,
58#     indicates that the DUT's status can't be determined from the
59#     DUT's history.
60# WORKING:  Indicates that the DUT was working normally after the
61#     event, or at the end of the time interval.
62# BROKEN:  Indicates that the DUT needed manual repair after the
63#     event, or at the end of the time interval.
64#
65UNUSED = 0
66UNKNOWN = 1
67WORKING = 2
68BROKEN = 3
69
70
71status_names = {
72  UNUSED: "UNUSED",
73  UNKNOWN: "UNKNOWN",
74  WORKING: "WORKING",
75  BROKEN: "BROKEN",
76}
77
78
79def parse_time(time_string):
80    """Parse time according to a canonical form.
81
82    The "canonical" form is the form in which date/time
83    values are stored in the database.
84
85    @param time_string Time to be parsed.
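
    For example (assuming the canonical form is the `TIME_FMT` of
    '%Y-%m-%d %H:%M:%S' used elsewhere in this module), a call such
    as `parse_time('2015-06-01 00:00:00')` returns the corresponding
    epoch time as an integer.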
86    """
87    return int(time_utils.to_epoch_time(time_string))
88
89
90class _JobEvent(object):
91    """Information about an event in host history.
92
93    This remembers the relevant data from a single event in host
94    history.  An event is any change in DUT state caused by a job
95    or special task.  The data captured are the start and end times
96    of the event, the URL of logs to the job or task causing the
97    event, and a diagnosis of whether the DUT was working or failed
98    afterwards.
99
100    This class is an adapter around the database model objects
101    describing jobs and special tasks.  This is an abstract
102    superclass, with concrete subclasses for `HostQueueEntry` and
103    `SpecialTask` objects.
104
105    @property start_time  Time the job or task began execution.
106    @property end_time    Time the job or task finished execution.
107    @property id          id of the event in the AFE database.
108    @property name        Name of the event, derived from the AFE database.
109    @property job_status  Short string describing the event's final status.
110    @property logdir      Relative path to the logs for the event's job.
111    @property job_url     URL to the logs for the event's job.
112    @property gs_url      GS URL to the logs for the event's job.
113    @property job_id      id of the AFE job for HQEs.  None otherwise.
114    @property diagnosis   Working status of the DUT after the event.
115    @property is_special  Boolean indicating if the event is a special task.
116
117    """
118
119    get_config_value = global_config.global_config.get_config_value
120    _LOG_URL_PATTERN = ('%s/browse/chromeos-autotest-results/%%s/'
121                        % get_config_value('AUTOTEST_WEB', 'stainless_url',
122                                           default=None))
123
124    @classmethod
125    def get_gs_url(cls, logdir):
126        """Return a GS URL to job results.
127
128        The URL is constructed from a base URL determined by the
129        global config, plus the relative path of the job's log
130        directory.
131
132        @param logdir Relative path of the results log directory.
133
134        @return A URL to the requested results log.
135
136        """
137        return os.path.join(utils.get_offload_gsuri(), logdir)
138
139
140    def __init__(self, start_time, end_time):
141        self.start_time = parse_time(start_time)
142        self.end_time = parse_time(end_time)
143
144
145    def __cmp__(self, other):
146        """Compare two jobs by their start time.
147
148        This is a standard Python `__cmp__` method to allow sorting
149        `_JobEvent` objects by their times.
150
151        @param other The `_JobEvent` object to compare to `self`.
152
153        """
154        return self.start_time - other.start_time
155
156
157    @property
158    def id(self):
159        """Return the id of the event in the AFE database."""
160        raise NotImplementedError()
161
162
163    @property
164    def name(self):
165        """Return the name of the event."""
166        raise NotImplementedError()
167
168
169    @property
170    def job_status(self):
171        """Return a short string describing the event's final status."""
172        raise NotImplementedError()
173
174
175    @property
176    def logdir(self):
177        """Return the relative path for this event's job logs."""
178        raise NotImplementedError()
179
180
181    @property
182    def job_url(self):
183        """Return the URL for this event's job logs."""
184        return self._LOG_URL_PATTERN % self.logdir
185
186
187    @property
188    def gs_url(self):
189        """Return the GS URL for this event's job logs."""
190        return self.get_gs_url(self.logdir)
191
192
193    @property
194    def job_id(self):
195        """Return the id of the AFE job for HQEs.  None otherwise."""
196        raise NotImplementedError()
197
198
199    @property
200    def diagnosis(self):
201        """Return the status of the DUT after this event.
202
203        The diagnosis is interpreted as follows:
204          UNKNOWN - The DUT status was the same before and after
205              the event.
206          WORKING - The DUT appeared to be working after the event.
207          BROKEN - The DUT likely required manual intervention
208              after the event.
209
210        @return A valid diagnosis value.
211
212        """
213        raise NotImplementedError()
214
215
216    @property
217    def is_special(self):
218        """Return if the event is for a special task."""
219        raise NotImplementedError()
220
221
class _SpecialTaskEvent(_JobEvent):
    """`_JobEvent` adapter for special tasks.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_special_tasks` table.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Return special tasks for a host in a given time range.

        Return a list of `_SpecialTaskEvent` objects representing all
        special tasks that ran on the given host in the given time
        range.  The list is ordered as it was returned by the query
        (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        tasks = afe.get_host_special_tasks(
                host_id,
                time_started__gte=query_start,
                time_finished__lte=query_end,
                is_complete=1)
        return [cls(t) for t in tasks]


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Return the task indicating a host's status at a given time.

        The task returned determines the status of the DUT; the
        diagnosis on the task indicates the diagnosis for the DUT at
        the given `end_time`.

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param end_time    Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        query_end = time_utils.epoch_time_to_date_string(end_time)
        task = afe.get_host_status_task(host_id, query_end)
        return cls(task) if task else None


    def __init__(self, afetask):
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def id(self):
        return self._afetask.id


    @property
    def name(self):
        return self._afetask.task


    @property
    def job_status(self):
        if self._afetask.is_aborted:
            return 'ABORTED'
        elif self._afetask.success:
            return 'PASS'
        else:
            return 'FAIL'


    @property
    def logdir(self):
        return ('hosts/%s/%s-%s' %
                (self._afetask.host.hostname, self._afetask.id,
                 self._afetask.task.lower()))


    @property
    def job_id(self):
        return None


    @property
    def diagnosis(self):
        if self._afetask.success:
            return WORKING
        elif self._afetask.task == 'Repair':
            return BROKEN
        else:
            return UNKNOWN


    @property
    def is_special(self):
        return True


class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Return a list of `_TestJobEvent` objects representing all the
        HQEs of all the jobs that ran on the given host in the given
        time range.  The list is ordered as it was returned by the
        query (i.e. unordered).

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        hqelist = afe.get_host_queue_entries_by_insert_time(
                host_id=host_id,
                insert_time_after=query_start,
                insert_time_before=query_end,
                started_on__gte=query_start,
                started_on__lte=query_end,
                complete=1)
        return [cls(hqe) for hqe in hqelist]


    def __init__(self, hqe):
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def id(self):
        return self._hqe.id


    @property
    def name(self):
        return self._hqe.job.name


    @property
    def job_status(self):
        return self._hqe.status


    @property
    def logdir(self):
        return _get_job_logdir(self._hqe.job)


    @property
    def job_id(self):
        return self._hqe.job.id


    @property
    def diagnosis(self):
        return UNKNOWN


    @property
    def is_special(self):
        return False


class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval, as a unix
                          timestamp (epoch time).
                          This field may be `None`.
    @property end_time    End of the requested time interval, as a unix
                          timestamp (epoch time).
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

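    A multi-host query might look like the following sketch (the AFE
    client and the epoch times are assumed to exist already, and the
    pool label value is hypothetical):

        histories = HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, labels=['pool:suites'])
        for history in histories:
            diagnosis, task = history.last_diagnosis()
            print history.hostname, status_names[diagnosis]
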
438    """
439
440    @classmethod
441    def get_host_history(cls, afe, hostname, start_time, end_time):
442        """Create a `HostJobHistory` instance for a single host.
443
444        Simple factory method to construct host history from a
445        hostname.  Simply looks up the host in the AFE database, and
446        passes it to the class constructor.
447
448        @param afe         Autotest frontend
449        @param hostname    Name of the host.
450        @param start_time  Start time for the history's time
451                           interval.
452        @param end_time    End time for the history's time interval.
453
454        @return A new `HostJobHistory` instance.
455
456        """
457        afehost = afe.get_hosts(hostname=hostname)[0]
458        return cls(afe, afehost, start_time, end_time)
459
460
461    @classmethod
462    def get_multiple_histories(cls, afe, start_time, end_time, labels=()):
463        """Create `HostJobHistory` instances for a set of hosts.
464
465        @param afe         Autotest frontend
466        @param start_time  Start time for the history's time
467                           interval.
468        @param end_time    End time for the history's time interval.
469        @param labels      type: [str]. AFE labels to constrain the host query.
470                           This option must be non-empty. An unconstrained
471                           search of the DB is too costly.
472
473        @return A list of new `HostJobHistory` instances.
474
475        """
        assert labels, (
            'Must specify labels for get_multiple_histories. '
            'Unconstrained search of the database is prohibitively costly.')

        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]


    def __init__(self, afe, afehost, start_time, end_time):
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval."""
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return self._history.__iter__()


    def _extract_prefixed_label(self, prefix):
        """Return the suffix of this host's first label starting with
        `prefix`, or `None` if the host has no such label."""
        labels = [l for l in self._host.labels
                    if l.startswith(prefix)]
        return labels[0][len(prefix) : ] if labels else None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_model(self):
        """Return the model name for this history's DUT."""
        prefix = constants.Labels.MODEL_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT.  Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
              `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
              the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

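        For illustration only, a sketch of how a caller might use the
        result (where `history` is a `HostJobHistory` instance):

            diagnosis, task = history.last_diagnosis()
            if diagnosis == BROKEN and task is not None:
                # `task.job_url` points at the logs of the failed
                # Repair that marked the DUT broken.
                print history.hostname, task.job_url
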
624        """
625        self._init_status_task()
626        diagnosis = self._status_diagnosis
627        if (self.start_time is not None and
628                self._status_task is not None and
629                self._status_task.end_time < self.start_time and
630                diagnosis == WORKING):
631            diagnosis = UNUSED
632        return diagnosis, self._status_task
633
634
635def get_diagnosis_interval(host_id, end_time, success):
636    """Return the last diagnosis interval for a given host and time.
637
638    This routine queries the database for the special tasks on a
639    given host before a given time.  From those tasks it selects the
640    last status task before a change in status, and the first status
641    task after the change.  When `success` is true, the change must
642    be from "working" to "broken".  When false, the search is for a
643    change in the opposite direction.
644
645    A "successful status task" is any successful special task.  A
646    "failed status task" is a failed Repair task.  These criteria
647    are based on the definition of "status task" in the module-level
648    docstring, above.
649
650    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.
651
652    @param host_id     Database host id of the desired host.
653    @param end_time    Find the last eligible interval before this time.
654    @param success     Whether the eligible interval should start with a
655                       success or a failure.
656
657    @return A list containing the start time of the earliest job
658            selected, and the end time of the latest job.
659
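    As an illustration (hypothetical task history): if the host's most
    recent status tasks were a successful Verify followed by a failed
    Repair, a call with `success=True` returns the interval from the
    start of the Verify to the end of the failed Repair, i.e. the
    window in which the DUT went from "working" to "broken".
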
660    """
661    base_query = afe_models.SpecialTask.objects.filter(
662            host_id=host_id, is_complete=True)
663    success_query = base_query.filter(success=True)
664    failure_query = base_query.filter(success=False, task='Repair')
665    if success:
666        query0 = success_query
667        query1 = failure_query
668    else:
669        query0 = failure_query
670        query1 = success_query
671    query0 = query0.filter(time_finished__lte=end_time)
672    query0 = query0.order_by('time_started').reverse()
673    if not query0:
674        return []
675    task0 = query0[0]
676    query1 = query1.filter(time_finished__gt=task0.time_finished)
677    task1 = query1.order_by('time_started')[0]
678    return [task0.time_started.strftime(time_utils.TIME_FMT),
679            task1.time_finished.strftime(time_utils.TIME_FMT)]
680
681
682def get_status_task(host_id, end_time):
683    """Get the last status task for a host before a given time.
684
685    This routine returns a Django query for the AFE database to find
686    the last task that finished on the given host before the given
687    time that was either a successful task, or a Repair task.  The
688    query criteria are based on the definition of "status task" in
689    the module-level docstring, above.
690
691    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.
692
693    @param host_id     Database host id of the desired host.
694    @param end_time    End time of the range of interest.
695
696    @return A Django query-set selecting the single special task of
697            interest.
698
699    """
700    # Selects status tasks:  any Repair task, or any successful task.
701    status_tasks = (django_models.Q(task='Repair') |
702                    django_models.Q(success=True))
703    # Our caller needs a Django query set in order to serialize the
704    # result, so we don't resolve the query here; we just return a
705    # slice with at most one element.
706    return afe_models.SpecialTask.objects.filter(
707            status_tasks,
708            host_id=host_id,
709            time_finished__lte=end_time,
710            is_complete=True).order_by('time_started').reverse()[0:1]
711
712
713def _get_job_logdir(job):
714    """Gets the logdir for an AFE job.
715
716    @param job Job object which has id and owner properties.
717
718    @return Relative path of the results log directory.
719    """
720    return '%s-%s' % (job.id, job.owner)
721
722
723def get_job_gs_url(job):
724    """Gets the GS URL for an AFE job.
725
726    @param job Job object which has id and owner properties.
727
728    @return Absolute GS URL to the results log directory.
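
    For example (illustrative values only): for a job with id 12345
    owned by 'chromeos-test', and an offload GS URI of
    'gs://chromeos-autotest-results/', the returned URL would be
    'gs://chromeos-autotest-results/12345-chromeos-test'.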
729    """
730    return _JobEvent.get_gs_url(_get_job_logdir(job))
731