1# Copyright 2015 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""Services relating to DUT status and job history.
6
7The central abstraction of this module is the `HostJobHistory`
8class.  This class provides two related pieces of information
9regarding a single DUT:
10  * A history of tests and special tasks that have run on
11    the DUT in a given time range.
12  * Whether the DUT was "working" or "broken" at a given
13    time.
14
15The "working" or "broken" status of a DUT is determined by
16the DUT's special task history.  At the end of any job or
17task, the status is indicated as follows:
18  * After any successful special task, the DUT is considered
19    "working".
20  * After any failed Repair task, the DUT is considered "broken".
21  * After any other special task or after any regular test job, the
22    DUT's status is considered unchanged.
23
24Definitions for terms used in the code below:
25  * status task - Any special task that determines the DUT's
26    status; that is, any successful task, or any failed Repair.
27  * diagnosis interval - A time interval during which DUT status
28    changed either from "working" to "broken", or vice versa.  The
29    interval starts with the last status task with the old status,
30    and ends after the first status task with the new status.
31
32Diagnosis intervals are interesting because they normally contain
33the logs explaining a failure or repair event.
34
35"""
36
37import common
38import os
39from autotest_lib.frontend import setup_django_environment
40from django.db import models as django_models
41
42from autotest_lib.client.common_lib import global_config
43from autotest_lib.client.common_lib import utils
44from autotest_lib.client.common_lib import time_utils
45from autotest_lib.frontend.afe import models as afe_models
46from autotest_lib.server import constants
47
48
# Diagnosis values for a DUT.  The same values serve two purposes:
# describing the DUT's status after a single job or task, and
# describing whether the DUT was working at the end of a given time
# interval.
#
# UNUSED:  No events were recorded in the time interval.
# UNKNOWN:  For an individual event, the DUT status is unchanged
#     from the previous event.  For a time interval, the DUT's
#     status can't be determined from the DUT's history.
# WORKING:  The DUT was working normally after the event, or at
#     the end of the time interval.
# BROKEN:  The DUT needed manual repair after the event, or at
#     the end of the time interval.
#
UNUSED, UNKNOWN, WORKING, BROKEN = 0, 1, 2, 3
69
70
def parse_time(time_string):
    """Convert a canonical-form time string to an integer epoch time.

    The "canonical" form is the form in which date/time values are
    stored in the database.

    @param time_string Time to be parsed.

    @return The value of `time_utils.to_epoch_time()` for the given
            string, converted to `int`.
    """
    epoch_time = time_utils.to_epoch_time(time_string)
    return int(epoch_time)
80
81
class _JobEvent(object):
    """Information about an event in host history.

    This remembers the relevant data from a single event in host
    history.  An event is any change in DUT state caused by a job
    or special task.  The data captured are the start and end times
    of the event, the URL of logs to the job or task causing the
    event, and a diagnosis of whether the DUT was working or failed
    afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks.  This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    Instances are ordered by `start_time`, so a list of events can
    be sorted directly.

    @property start_time  Time the job or task began execution.
    @property end_time    Time the job or task finished execution.
    @property id          id of the event in the AFE database.
    @property name        Name of the event, derived from the AFE database.
    @property job_status  Short string describing the event's final status.
    @property logdir      Relative path to the logs for the event's job.
    @property job_url     URL to the logs for the event's job.
    @property gs_url      GS URL to the logs for the event's job.
    @property job_id      id of the AFE job for HQEs.  None otherwise.
    @property diagnosis   Working status of the DUT after the event.
    @property is_special  Boolean indicating if the event is a special task.

    """

    get_config_value = global_config.global_config.get_config_value
    # The pattern is built once, when the class is defined.  The
    # doubled `%%s` survives this first substitution as a `%s`
    # placeholder that `job_url` later fills with the log directory.
    _LOG_URL_PATTERN = ('%s/browse/chromeos-autotest-results/%%s/'
                        % get_config_value('AUTOTEST_WEB', 'stainless_url',
                                           default=None))

    @classmethod
    def get_gs_url(cls, logdir):
        """Return a GS URL to job results.

        The URL is constructed by joining the base URI returned by
        `utils.get_offload_gsuri()` with the relative path of the
        job's log directory.

        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return os.path.join(utils.get_offload_gsuri(), logdir)


    def __init__(self, start_time, end_time):
        """Record the start and end times of the event.

        @param start_time  Time the event started, in the canonical
                           form accepted by `parse_time()`.
        @param end_time    Time the event finished, in the same form.

        """
        self.start_time = parse_time(start_time)
        self.end_time = parse_time(end_time)


    def __cmp__(self, other):
        """Compare two jobs by their start time.

        This is a standard Python `__cmp__` method to allow sorting
        `_JobEvent` objects by their times.  Only Python 2 consults
        it; see `__lt__()` for the Python 3 equivalent.

        @param other The `_JobEvent` object to compare to `self`.

        @return A negative, zero, or positive number, as `self`
                started before, together with, or after `other`.

        """
        return self.start_time - other.start_time


    def __lt__(self, other):
        """Return whether this event started before `other`.

        Python 3 ignores `__cmp__`, and `list.sort()` needs only
        `__lt__`, so this method keeps lists of `_JobEvent` objects
        sortable there.  The answer always agrees with `__cmp__()`.

        @param other The `_JobEvent` object to compare to `self`.

        @return True if `self` started strictly earlier than `other`.

        """
        return self.start_time < other.start_time


    @property
    def id(self):
        """Return the id of the event in the AFE database."""
        raise NotImplementedError()


    @property
    def name(self):
        """Return the name of the event."""
        raise NotImplementedError()


    @property
    def job_status(self):
        """Return a short string describing the event's final status."""
        raise NotImplementedError()


    @property
    def logdir(self):
        """Return the relative path for this event's job logs."""
        raise NotImplementedError()


    @property
    def job_url(self):
        """Return the URL for this event's job logs."""
        return self._LOG_URL_PATTERN % self.logdir


    @property
    def gs_url(self):
        """Return the GS URL for this event's job logs."""
        return self.get_gs_url(self.logdir)


    @property
    def job_id(self):
        """Return the id of the AFE job for HQEs.  None otherwise."""
        raise NotImplementedError()


    @property
    def diagnosis(self):
        """Return the status of the DUT after this event.

        The diagnosis is interpreted as follows:
          UNKNOWN - The DUT status was the same before and after
              the event.
          WORKING - The DUT appeared to be working after the event.
          BROKEN - The DUT likely required manual intervention
              after the event.

        @return A valid diagnosis value.

        """
        raise NotImplementedError()


    @property
    def is_special(self):
        """Return if the event is for a special task."""
        raise NotImplementedError()
212
213
class _SpecialTaskEvent(_JobEvent):
    """Adapter presenting a special task as a `_JobEvent`.

    Each instance wraps one row of the `afe_special_tasks` table and
    exposes it through the standard `_JobEvent` interface.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Fetch the special tasks a host ran within a time range.

        Only completed tasks that started no earlier than
        `start_time` and finished no later than `end_time` are
        requested.  The result keeps whatever order the query
        returned; no sorting is done here.

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        time_range = {
            'time_started__gte':
                    time_utils.epoch_time_to_date_string(start_time),
            'time_finished__lte':
                    time_utils.epoch_time_to_date_string(end_time),
        }
        afetasks = afe.get_host_special_tasks(
                host_id, is_complete=1, **time_range)
        events = []
        for afetask in afetasks:
            events.append(cls(afetask))
        return events


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Fetch the task that determines a host's status at a time.

        The diagnosis of the returned task is the diagnosis for the
        DUT as of the given `end_time`.

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param end_time    Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        as_of = time_utils.epoch_time_to_date_string(end_time)
        afetask = afe.get_host_status_task(host_id, as_of)
        if not afetask:
            return None
        return cls(afetask)


    def __init__(self, afetask):
        """Wrap a special task object from the AFE.

        @param afetask  Special task object; its `time_started` and
                        `time_finished` become the event's times.

        """
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def id(self):
        """Return the id of the special task."""
        return self._afetask.id


    @property
    def name(self):
        """Return the task's `task` field as the event name."""
        return self._afetask.task


    @property
    def job_status(self):
        """Return 'ABORTED', 'PASS', or 'FAIL' for the task.

        An aborted task reports 'ABORTED' regardless of its
        success flag.

        """
        if self._afetask.is_aborted:
            return 'ABORTED'
        return 'PASS' if self._afetask.success else 'FAIL'


    @property
    def logdir(self):
        """Return the relative log path, under the task's host."""
        task = self._afetask
        return 'hosts/%s/%s-%s' % (
                task.host.hostname, task.id, task.task.lower())


    @property
    def job_id(self):
        """Return `None`; special tasks have no AFE job."""
        return None


    @property
    def diagnosis(self):
        """Return the DUT status implied by this task.

        Any successful task means WORKING.  Of the unsuccessful
        tasks, only a Repair task means BROKEN; the rest leave the
        status UNKNOWN.

        """
        task = self._afetask
        if task.success:
            return WORKING
        return BROKEN if task.task == 'Repair' else UNKNOWN


    @property
    def is_special(self):
        """Return True; this event is a special task."""
        return True
321
322
class _TestJobEvent(_JobEvent):
    """Adapter presenting a regular test job as a `_JobEvent`.

    Each instance wraps one row of the `afe_host_queue_entries`
    table and exposes it through the standard `_JobEvent` interface.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Fetch the HQEs a host ran within a time range.

        The same time range bounds both the insert time and the
        start time of the requested entries, and only completed
        entries are requested.  The result keeps whatever order the
        query returned; no sorting is done here.

        @param afe         Autotest frontend
        @param host_id     Database host id of the desired host.
        @param start_time  Start time of the range of interest.
        @param end_time    End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        range_start = time_utils.epoch_time_to_date_string(start_time)
        range_end = time_utils.epoch_time_to_date_string(end_time)
        entries = afe.get_host_queue_entries_by_insert_time(
                host_id=host_id,
                insert_time_after=range_start,
                insert_time_before=range_end,
                started_on__gte=range_start,
                started_on__lte=range_end,
                complete=1)
        events = []
        for entry in entries:
            events.append(cls(entry))
        return events


    def __init__(self, hqe):
        """Wrap a host queue entry object from the AFE.

        @param hqe  Host queue entry; its `started_on` and
                    `finished_on` become the event's times.

        """
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def id(self):
        """Return the id of the host queue entry."""
        return self._hqe.id


    @property
    def name(self):
        """Return the name of the entry's job."""
        return self._hqe.job.name


    @property
    def job_status(self):
        """Return the entry's status string."""
        return self._hqe.status


    @property
    def logdir(self):
        """Return the relative log path for the entry's job."""
        job = self._hqe.job
        return _get_job_logdir(job)


    @property
    def job_id(self):
        """Return the id of the entry's AFE job."""
        return self._hqe.job.id


    @property
    def diagnosis(self):
        """Return UNKNOWN; test jobs never change DUT status."""
        return UNKNOWN


    @property
    def is_special(self):
        """Return False; this event is a regular test job."""
        return False
399
400
class HostJobHistory(object):
    """Query and remember a DUT's execution and status history.

    An instance covers a single DUT over a time interval of
    interest.  Database queries are deferred until the data are
    first requested, and the results are then remembered for
    reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval, as a unix
                          timestamp (epoch time).
                          This field may be `None`.
    @property end_time    End of the requested time interval, as a unix
                          timestamp (epoch time).
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

    """

    @classmethod
    def get_host_history(cls, afe, hostname, start_time, end_time):
        """Build a `HostJobHistory` for one host, given its name.

        Factory method that looks the host up in the AFE and hands
        the first match to the class constructor.

        @param afe         Autotest frontend
        @param hostname    Name of the host.
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        matches = afe.get_hosts(hostname=hostname)
        return cls(afe, matches[0], start_time, end_time)


    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time, labels=()):
        """Build `HostJobHistory` instances for a set of hosts.

        @param afe         Autotest frontend
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.
        @param labels      type: [str]. AFE labels to constrain the host query.
                           This option must be non-empty. An unconstrained
                           search of the DB is too costly.

        @return A list of new `HostJobHistory` instances.

        """
        assert labels, (
            'Must specify labels for get_multiple_histories. '
            'Unconstrainted search of the database is prohibitively costly.')

        histories = []
        for afehost in afe.get_hosts(multiple_labels=labels):
            histories.append(cls(afe, afehost, start_time, end_time))
        return histories


    def __init__(self, afe, afehost, start_time, end_time):
        """Remember the host and interval; run no queries yet.

        @param afe         Autotest frontend
        @param afehost     AFE host object for the DUT.
        @param start_time  Start time for the history's time
                           interval.
        @param end_time    End time for the history's time interval.

        """
        self._afe = afe
        self._host = afehost
        self.hostname = afehost.hostname
        self.start_time = start_time
        self.end_time = end_time
        # Query results are filled in lazily, on first use.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Return the events in the given interval, latest first."""
        special_tasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        test_jobs = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        return sorted(special_tasks + test_jobs, reverse=True)


    def __iter__(self):
        """Iterate over the history, querying it on first use."""
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return iter(self._history)


    def _extract_prefixed_label(self, prefix):
        """Return the first host label with `prefix`, minus the prefix.

        @param prefix  Label prefix to search for.

        @return The remainder of the first matching label, or `None`
                if no label on the host starts with `prefix`.

        """
        for label in self._host.labels:
            if label.startswith(prefix):
                return label[len(prefix):]
        return None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_model(self):
        """Return the model name for this history's DUT."""
        return self._extract_prefixed_label(constants.Labels.MODEL_PREFIX)


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        return self._extract_prefixed_label(constants.Labels.BOARD_PREFIX)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        return self._extract_prefixed_label(constants.Labels.POOL_PREFIX)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`.

        Does nothing if the diagnosis was already determined.  When
        no status task is found, the diagnosis is UNKNOWN.

        """
        if self._status_diagnosis is not None:
            return
        task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        self._status_task = task
        self._status_diagnosis = UNKNOWN if task is None else task.diagnosis


    def _init_status_interval(self):
        """Fill in `self._status_interval`.

        Does nothing if the interval was already determined.  The
        interval stays empty when there is no status task, or when
        the AFE reports no diagnosis interval.

        """
        if self._status_interval is not None:
            return
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        as_of = time_utils.epoch_time_to_date_string(self.end_time)
        # The flag asks for an interval starting from a success when
        # the DUT is not currently diagnosed as working.
        bounds = self._afe.get_host_diagnosis_interval(
                self._host.id, as_of,
                self._status_diagnosis != WORKING)
        if bounds:
            self._status_interval = self._get_history(
                    parse_time(bounds[0]),
                    parse_time(bounds[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        The events returned are those of the most recent diagnosis
        interval occurring before this history's end time.  As with
        `self._history`, the list is ordered from most to least
        recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This looks up the most recent status task for the DUT, and
        returns a tuple of `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
              `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
              the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

        """
        self._init_status_task()
        task = self._status_task
        diagnosis = self._status_diagnosis
        # The status task is "stale" if it finished before the
        # interval of interest began.
        stale = (self.start_time is not None and
                 task is not None and
                 task.end_time < self.start_time)
        if stale and diagnosis == WORKING:
            diagnosis = UNUSED
        return diagnosis, task
625
626
def get_diagnosis_interval(host_id, end_time, success):
    """Return the last diagnosis interval for a given host and time.

    This routine queries the database for the special tasks on a
    given host before a given time.  From those tasks it selects the
    last status task before a change in status, and the first status
    task after the change.  When `success` is true, the change must
    be from "working" to "broken".  When false, the search is for a
    change in the opposite direction.

    A "successful status task" is any successful special task.  A
    "failed status task" is a failed Repair task.  These criteria
    are based on the definition of "status task" in the module-level
    docstring, above.

    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.

    @param host_id     Database host id of the desired host.
    @param end_time    Find the last eligible interval before this time.
    @param success     Whether the eligible interval should start with a
                       success or a failure.

    @return A list containing the start time of the earliest job
            selected, and the end time of the latest job.  The list
            is empty if either end of the interval can't be found.

    """
    base_query = afe_models.SpecialTask.objects.filter(
            host_id=host_id, is_complete=True)
    success_query = base_query.filter(success=True)
    failure_query = base_query.filter(success=False, task='Repair')
    if success:
        query0, query1 = success_query, failure_query
    else:
        query0, query1 = failure_query, success_query

    # Slicing with `[:1]` fetches at most one row, and yields an
    # empty list instead of raising when nothing matches.
    query0 = query0.filter(time_finished__lte=end_time)
    last_before = list(query0.order_by('time_started').reverse()[:1])
    if not last_before:
        return []
    task0 = last_before[0]

    query1 = query1.filter(time_finished__gt=task0.time_finished)
    first_after = list(query1.order_by('time_started')[:1])
    if not first_after:
        # No status task with the new status follows `task0`, so
        # there is no interval to report.
        return []
    task1 = first_after[0]

    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]
672
673
def get_status_task(host_id, end_time):
    """Select the last status task for a host before a given time.

    The query built here selects, from the completed special tasks
    that finished on the given host no later than the given time,
    the one with the latest start time that was either a Repair task
    or a successful task.  These criteria are based on the
    definition of "status task" in the module-level docstring,
    above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id     Database host id of the desired host.
    @param end_time    End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # A status task is any Repair task, or any successful task.
    is_status_task = (django_models.Q(task='Repair') |
                      django_models.Q(success=True))
    candidates = afe_models.SpecialTask.objects.filter(
            is_status_task,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True)
    latest_first = candidates.order_by('time_started').reverse()
    # The caller serializes the result, and needs a query set to do
    # so.  The query is therefore left unresolved; the slice holds
    # at most one element.
    return latest_first[0:1]
703
704
def _get_job_logdir(job):
    """Build the relative log directory for an AFE job.

    The directory name is the job's id and owner, joined by a dash.

    @param job Job object which has id and owner properties.

    @return Relative path of the results log directory.
    """
    id_and_owner = (job.id, job.owner)
    return '%s-%s' % id_and_owner
713
714
def get_job_gs_url(job):
    """Build the GS URL for an AFE job's results.

    @param job Job object which has id and owner properties.

    @return Absolute GS URL to the results log directory.
    """
    logdir = _get_job_logdir(job)
    return _JobEvent.get_gs_url(logdir)
723