# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Services relating to DUT status and job history.

The central abstraction of this module is the `HostJobHistory`
class.  This class provides two related pieces of information
regarding a single DUT:
  * A history of tests and special tasks that have run on
    the DUT in a given time range.
  * Whether the DUT was "working" or "broken" at a given
    time.

The "working" or "broken" status of a DUT is determined by
the DUT's special task history.  At the end of any job or
task, the status is indicated as follows:
  * After any successful special task, the DUT is considered
    "working".
  * After any failed Repair task, the DUT is considered "broken".
  * After any other special task or after any regular test job, the
    DUT's status is considered unchanged.

Definitions for terms used in the code below:
  * status task - Any special task that determines the DUT's
    status; that is, any successful task, or any failed Repair.
  * diagnosis interval - A time interval during which DUT status
    changed either from "working" to "broken", or vice versa.  The
    interval starts with the last status task with the old status,
    and ends after the first status task with the new status.

Diagnosis intervals are interesting because they normally contain
the logs explaining a failure or repair event.

"""

import common
import os
from autotest_lib.frontend import setup_django_environment
from django.db import models as django_models

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import site_utils
from autotest_lib.client.common_lib import time_utils
from autotest_lib.frontend.afe import models as afe_models
from autotest_lib.site_utils.suite_scheduler import constants


# Values used to describe the diagnosis of a DUT.  These values are
# used to indicate both DUT status after a job or task, and also
# diagnosis of whether the DUT was working at the end of a given
# time interval.
#
# UNUSED:  Used when there are no events recorded in a given
#     time interval.
# UNKNOWN:  For an individual event, indicates that the DUT status
#     is unchanged from the previous event.  For a time interval,
#     indicates that the DUT's status can't be determined from the
#     DUT's history.
# WORKING:  Indicates that the DUT was working normally after the
#     event, or at the end of the time interval.
# BROKEN:  Indicates that the DUT needed manual repair after the
#     event, or at the end of the time interval.
#
UNUSED = 0
UNKNOWN = 1
WORKING = 2
BROKEN = 3


def parse_time(time_string):
    """Parse time according to a canonical form.

    The "canonical" form is the form in which date/time
    values are stored in the database.

    @param time_string Time to be parsed.

    @return The time as an integer unix timestamp (epoch time).

    """
    return int(time_utils.to_epoch_time(time_string))


class _JobEvent(object):
    """Information about an event in host history.

    This remembers the relevant data from a single event in host
    history.  An event is any change in DUT state caused by a job
    or special task.  The data captured are the start and end times
    of the event, the URL of logs to the job or task causing the
    event, and a diagnosis of whether the DUT was working or failed
    afterwards.

    This class is an adapter around the database model objects
    describing jobs and special tasks.  This is an abstract
    superclass, with concrete subclasses for `HostQueueEntry` and
    `SpecialTask` objects.

    @property start_time  Time the job or task began execution.
    @property end_time    Time the job or task finished execution.
    @property id          id of the event in the AFE database.
    @property name        Name of the event, derived from the AFE database.
    @property job_status  Short string describing the event's final status.
    @property logdir      Relative path to the logs for the event's job.
    @property job_url     URL to the logs for the event's job.
    @property gs_url      GS URL to the logs for the event's job.
    @property job_id      id of the AFE job for HQEs.  None otherwise.
    @property diagnosis   Working status of the DUT after the event.
    @property is_special  Boolean indicating if the event is a special task.

    """

    get_config_value = global_config.global_config.get_config_value
    _LOG_URL_PATTERN = get_config_value('CROS', 'log_url_pattern')

    @classmethod
    def get_log_url(cls, afe_hostname, logdir):
        """Return a URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param afe_hostname Hostname for autotest frontend
        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return cls._LOG_URL_PATTERN % (afe_hostname, logdir)


    @classmethod
    def get_gs_url(cls, logdir):
        """Return a GS URL to job results.

        The URL is constructed from a base URL determined by the
        global config, plus the relative path of the job's log
        directory.

        @param logdir Relative path of the results log directory.

        @return A URL to the requested results log.

        """
        return os.path.join(site_utils.get_offload_gsuri(), logdir)


    def __init__(self, start_time, end_time):
        self.start_time = parse_time(start_time)
        self.end_time = parse_time(end_time)


    def __cmp__(self, other):
        """Compare two jobs by their start time.

        This is a standard Python `__cmp__` method to allow sorting
        `_JobEvent` objects by their times.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time - other.start_time


    def __lt__(self, other):
        """Order two events by their start time.

        `list.sort()` under Python 3 ignores `__cmp__`, so this rich
        comparison is provided as well to keep sorting of history
        events (see `HostJobHistory._get_history()`) working under
        both Python 2 and 3.

        @param other The `_JobEvent` object to compare to `self`.

        """
        return self.start_time < other.start_time


    @property
    def id(self):
        """Return the id of the event in the AFE database."""
        raise NotImplementedError()


    @property
    def name(self):
        """Return the name of the event."""
        raise NotImplementedError()


    @property
    def job_status(self):
        """Return a short string describing the event's final status."""
        raise NotImplementedError()


    @property
    def logdir(self):
        """Return the relative path for this event's job logs."""
        raise NotImplementedError()


    @property
    def job_url(self):
        """Return the URL for this event's job logs."""
        raise NotImplementedError()


    @property
    def gs_url(self):
        """Return the GS URL for this event's job logs."""
        raise NotImplementedError()


    @property
    def job_id(self):
        """Return the id of the AFE job for HQEs.  None otherwise."""
        raise NotImplementedError()


    @property
    def diagnosis(self):
        """Return the status of the DUT after this event.

        The diagnosis is interpreted as follows:
          UNKNOWN - The DUT status was the same before and after
              the event.
          WORKING - The DUT appeared to be working after the event.
          BROKEN - The DUT likely required manual intervention
              after the event.

        @return A valid diagnosis value.

        """
        raise NotImplementedError()


    @property
    def is_special(self):
        """Return if the event is for a special task."""
        raise NotImplementedError()


class _SpecialTaskEvent(_JobEvent):
    """`_JobEvent` adapter for special tasks.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_special_tasks` table.

    """

    @classmethod
    def get_tasks(cls, afe, host_id, start_time, end_time):
        """Return special tasks for a host in a given time range.

        Return a list of `_SpecialTaskEvent` objects representing all
        special tasks that ran on the given host in the given time
        range.  The list is ordered as it was returned by the query
        (i.e. unordered).

        @param afe        Autotest frontend
        @param host_id    Database host id of the desired host.
        @param start_time Start time of the range of interest.
        @param end_time   End time of the range of interest.

        @return A list of `_SpecialTaskEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        tasks = afe.get_host_special_tasks(
                host_id,
                time_started__gte=query_start,
                time_finished__lte=query_end,
                is_complete=1)
        return [cls(afe.server, t) for t in tasks]


    @classmethod
    def get_status_task(cls, afe, host_id, end_time):
        """Return the task indicating a host's status at a given time.

        The task returned determines the status of the DUT; the
        diagnosis on the task indicates the diagnosis for the DUT at
        the given `end_time`.

        @param afe        Autotest frontend
        @param host_id    Database host id of the desired host.
        @param end_time   Find status as of this time.

        @return A `_SpecialTaskEvent` object for the requested task,
                or `None` if no task was found.

        """
        query_end = time_utils.epoch_time_to_date_string(end_time)
        task = afe.get_host_status_task(host_id, query_end)
        return cls(afe.server, task) if task else None


    def __init__(self, afe_hostname, afetask):
        self._afe_hostname = afe_hostname
        self._afetask = afetask
        super(_SpecialTaskEvent, self).__init__(
                afetask.time_started, afetask.time_finished)


    @property
    def id(self):
        return self._afetask.id


    @property
    def name(self):
        return self._afetask.task


    @property
    def job_status(self):
        if self._afetask.is_aborted:
            return 'ABORTED'
        elif self._afetask.success:
            return 'PASS'
        else:
            return 'FAIL'


    @property
    def logdir(self):
        return ('hosts/%s/%s-%s' %
                (self._afetask.host.hostname, self._afetask.id,
                 self._afetask.task.lower()))


    @property
    def job_url(self):
        return _SpecialTaskEvent.get_log_url(self._afe_hostname, self.logdir)


    @property
    def gs_url(self):
        return _SpecialTaskEvent.get_gs_url(self.logdir)


    @property
    def job_id(self):
        return None


    @property
    def diagnosis(self):
        # Status-task rules from the module docstring:  any
        # successful task marks the DUT working; a failed Repair
        # marks it broken; any other failure leaves status unchanged.
        if self._afetask.success:
            return WORKING
        elif self._afetask.task == 'Repair':
            return BROKEN
        else:
            return UNKNOWN


    @property
    def is_special(self):
        return True


class _TestJobEvent(_JobEvent):
    """`_JobEvent` adapter for regular test jobs.

    This class wraps the standard `_JobEvent` interface around a row
    in the `afe_host_queue_entries` table.

    """

    @classmethod
    def get_hqes(cls, afe, host_id, start_time, end_time):
        """Return HQEs for a host in a given time range.

        Return a list of `_TestJobEvent` objects representing all the
        HQEs of all the jobs that ran on the given host in the given
        time range.  The list is ordered as it was returned by the
        query (i.e. unordered).

        @param afe        Autotest frontend
        @param host_id    Database host id of the desired host.
        @param start_time Start time of the range of interest.
        @param end_time   End time of the range of interest.

        @return A list of `_TestJobEvent` objects.

        """
        query_start = time_utils.epoch_time_to_date_string(start_time)
        query_end = time_utils.epoch_time_to_date_string(end_time)
        hqelist = afe.get_host_queue_entries(
                host_id=host_id,
                start_time=query_start,
                end_time=query_end,
                complete=1)
        return [cls(afe.server, hqe) for hqe in hqelist]


    def __init__(self, afe_hostname, hqe):
        self._afe_hostname = afe_hostname
        self._hqe = hqe
        super(_TestJobEvent, self).__init__(
                hqe.started_on, hqe.finished_on)


    @property
    def id(self):
        return self._hqe.id


    @property
    def name(self):
        return self._hqe.job.name


    @property
    def job_status(self):
        return self._hqe.status


    @property
    def logdir(self):
        return _get_job_logdir(self._hqe.job)


    @property
    def job_url(self):
        return _TestJobEvent.get_log_url(self._afe_hostname, self.logdir)


    @property
    def gs_url(self):
        return _TestJobEvent.get_gs_url(self.logdir)


    @property
    def job_id(self):
        return self._hqe.job.id


    @property
    def diagnosis(self):
        # Regular test jobs never change DUT status.
        return UNKNOWN


    @property
    def is_special(self):
        return False


class HostJobHistory(object):
    """Class to query and remember DUT execution and status history.

    This class is responsible for querying the database to determine
    the history of a single DUT in a time interval of interest, and
    for remembering the query results for reporting.

    @property hostname    Host name of the DUT.
    @property start_time  Start of the requested time interval, as a unix
                          timestamp (epoch time).
                          This field may be `None`.
    @property end_time    End of the requested time interval, as a unix
                          timestamp (epoch time).
    @property _afe        Autotest frontend for queries.
    @property _host       Database host object for the DUT.
    @property _history    A list of jobs and special tasks that
                          ran on the DUT in the requested time
                          interval, ordered in reverse, from latest
                          to earliest.

    @property _status_interval   A list of all the jobs and special
                                 tasks that ran on the DUT in the
                                 last diagnosis interval prior to
                                 `end_time`, ordered from latest to
                                 earliest.
    @property _status_diagnosis  The DUT's status as of `end_time`.
    @property _status_task       The DUT's last status task as of
                                 `end_time`.

    """

    @classmethod
    def get_host_history(cls, afe, hostname, start_time, end_time):
        """Create a `HostJobHistory` instance for a single host.

        Simple factory method to construct host history from a
        hostname.  Simply looks up the host in the AFE database, and
        passes it to the class constructor.

        @param afe        Autotest frontend
        @param hostname   Name of the host.
        @param start_time Start time for the history's time
                          interval.
        @param end_time   End time for the history's time interval.

        @return A new `HostJobHistory` instance.

        """
        afehost = afe.get_hosts(hostname=hostname)[0]
        return cls(afe, afehost, start_time, end_time)


    @classmethod
    def get_multiple_histories(cls, afe, start_time, end_time,
                               board=None, pool=None):
        """Create `HostJobHistory` instances for a set of hosts.

        The set of hosts can be specified as "all hosts of a given
        board type", "all hosts in a given pool", or "all hosts
        of a given board and pool".

        @param afe        Autotest frontend
        @param start_time Start time for the history's time
                          interval.
        @param end_time   End time for the history's time interval.
        @param board      All hosts must have this board type; if
                          `None`, all boards are allowed.
        @param pool       All hosts must be in this pool; if
                          `None`, all pools are allowed.

        @return A list of new `HostJobHistory` instances.

        """
        # If `board` or `pool` are both `None`, we could search the
        # entire database, which is more expensive than we want.
        # Our caller currently won't (can't) do this, but assert to
        # be safe.
        assert board is not None or pool is not None
        labels = []
        if board is not None:
            labels.append(constants.Labels.BOARD_PREFIX + board)
        if pool is not None:
            labels.append(constants.Labels.POOL_PREFIX + pool)
        kwargs = {'multiple_labels': labels}
        hosts = afe.get_hosts(**kwargs)
        return [cls(afe, h, start_time, end_time) for h in hosts]


    def __init__(self, afe, afehost, start_time, end_time):
        self._afe = afe
        self.hostname = afehost.hostname
        self.end_time = end_time
        self.start_time = start_time
        self._host = afehost
        # Don't spend time on queries until they're needed.
        self._history = None
        self._status_interval = None
        self._status_diagnosis = None
        self._status_task = None


    def _get_history(self, start_time, end_time):
        """Get the list of events for the given interval."""
        newtasks = _SpecialTaskEvent.get_tasks(
                self._afe, self._host.id, start_time, end_time)
        newhqes = _TestJobEvent.get_hqes(
                self._afe, self._host.id, start_time, end_time)
        newhistory = newtasks + newhqes
        # Latest event first; ordering is defined by `_JobEvent`'s
        # comparison methods (start time).
        newhistory.sort(reverse=True)
        return newhistory


    def __iter__(self):
        if self._history is None:
            self._history = self._get_history(self.start_time,
                                              self.end_time)
        return self._history.__iter__()


    def _extract_prefixed_label(self, prefix):
        """Return the suffix of the first host label with `prefix`.

        @param prefix  Label prefix to search for.

        @return The label text following `prefix`, or `None` if the
                host has no label with that prefix.

        """
        labels = [l for l in self._host.labels
                  if l.startswith(prefix)]
        return labels[0][len(prefix) : ] if labels else None


    @property
    def host(self):
        """Return the AFE host object for this history."""
        return self._host


    @property
    def host_board(self):
        """Return the board name for this history's DUT."""
        prefix = constants.Labels.BOARD_PREFIX
        return self._extract_prefixed_label(prefix)


    @property
    def host_pool(self):
        """Return the pool name for this history's DUT."""
        prefix = constants.Labels.POOL_PREFIX
        return self._extract_prefixed_label(prefix)


    def _init_status_task(self):
        """Fill in `self._status_diagnosis` and `_status_task`."""
        if self._status_diagnosis is not None:
            return
        self._status_task = _SpecialTaskEvent.get_status_task(
                self._afe, self._host.id, self.end_time)
        if self._status_task is not None:
            self._status_diagnosis = self._status_task.diagnosis
        else:
            self._status_diagnosis = UNKNOWN


    def _init_status_interval(self):
        """Fill in `self._status_interval`."""
        if self._status_interval is not None:
            return
        self._init_status_task()
        self._status_interval = []
        if self._status_task is None:
            return
        query_end = time_utils.epoch_time_to_date_string(self.end_time)
        interval = self._afe.get_host_diagnosis_interval(
                self._host.id, query_end,
                self._status_diagnosis != WORKING)
        if not interval:
            return
        self._status_interval = self._get_history(
                parse_time(interval[0]),
                parse_time(interval[1]))


    def diagnosis_interval(self):
        """Find this history's most recent diagnosis interval.

        Returns a list of `_JobEvent` instances corresponding to the
        most recent diagnosis interval occurring before this
        history's end time.

        The list is returned as with `self._history`, ordered from
        most to least recent.

        @return The list of the `_JobEvent`s in the diagnosis
                interval.

        """
        self._init_status_interval()
        return self._status_interval


    def last_diagnosis(self):
        """Return the diagnosis of whether the DUT is working.

        This searches the DUT's job history, looking for the most
        recent status task for the DUT.  Return a tuple of
        `(diagnosis, task)`.

        The `diagnosis` entry in the tuple is one of these values:
          * UNUSED - The host's last status task is older than
            `self.start_time`.
          * WORKING - The DUT is working.
          * BROKEN - The DUT likely requires manual intervention.
          * UNKNOWN - No task could be found indicating status for
            the DUT.

        If the DUT was working at last check, but hasn't been used
        inside this history's time interval, the status `UNUSED` is
        returned with the last status task, instead of `WORKING`.

        The `task` entry in the tuple is the status task that led to
        the diagnosis.  The task will be `None` if the diagnosis is
        `UNKNOWN`.

        @return A tuple with the DUT's diagnosis and the task that
                determined it.

        """
        self._init_status_task()
        diagnosis = self._status_diagnosis
        if (self.start_time is not None and
                self._status_task is not None and
                self._status_task.end_time < self.start_time and
                diagnosis == WORKING):
            diagnosis = UNUSED
        return diagnosis, self._status_task


def get_diagnosis_interval(host_id, end_time, success):
    """Return the last diagnosis interval for a given host and time.

    This routine queries the database for the special tasks on a
    given host before a given time.  From those tasks it selects the
    last status task before a change in status, and the first status
    task after the change.  When `success` is true, the change must
    be from "working" to "broken".  When false, the search is for a
    change in the opposite direction.

    A "successful status task" is any successful special task.  A
    "failed status task" is a failed Repair task.  These criteria
    are based on the definition of "status task" in the module-level
    docstring, above.

    This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`.

    @param host_id     Database host id of the desired host.
    @param end_time    Find the last eligible interval before this time.
    @param success     Whether the eligible interval should start with a
                       success or a failure.

    @return A list containing the start time of the earliest job
            selected, and the end time of the latest job.  The list
            is empty if no eligible interval exists.

    """
    base_query = afe_models.SpecialTask.objects.filter(
            host_id=host_id, is_complete=True)
    success_query = base_query.filter(success=True)
    failure_query = base_query.filter(success=False, task='Repair')
    if success:
        query0 = success_query
        query1 = failure_query
    else:
        query0 = failure_query
        query1 = success_query
    query0 = query0.filter(time_finished__lte=end_time)
    query0 = query0.order_by('time_started').reverse()
    if not query0:
        return []
    task0 = query0[0]
    query1 = query1.filter(time_finished__gt=task0.time_finished)
    query1 = query1.order_by('time_started')
    # If the status never changed after `task0`, there is no
    # interval; indexing [0] unconditionally would raise IndexError.
    if not query1:
        return []
    task1 = query1[0]
    return [task0.time_started.strftime(time_utils.TIME_FMT),
            task1.time_finished.strftime(time_utils.TIME_FMT)]


def get_status_task(host_id, end_time):
    """Get the last status task for a host before a given time.

    This routine returns a Django query for the AFE database to find
    the last task that finished on the given host before the given
    time that was either a successful task, or a Repair task.  The
    query criteria are based on the definition of "status task" in
    the module-level docstring, above.

    This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`.

    @param host_id     Database host id of the desired host.
    @param end_time    End time of the range of interest.

    @return A Django query-set selecting the single special task of
            interest.

    """
    # Selects status tasks:  any Repair task, or any successful task.
    status_tasks = (django_models.Q(task='Repair') |
                    django_models.Q(success=True))
    # Our caller needs a Django query set in order to serialize the
    # result, so we don't resolve the query here; we just return a
    # slice with at most one element.
    return afe_models.SpecialTask.objects.filter(
            status_tasks,
            host_id=host_id,
            time_finished__lte=end_time,
            is_complete=True).order_by('time_started').reverse()[0:1]


def _get_job_logdir(job):
    """Gets the logdir for an AFE job.

    @param job Job object which has id and owner properties.

    @return Relative path of the results log directory.

    """
    return '%s-%s' % (job.id, job.owner)


def get_job_gs_url(job):
    """Gets the GS URL for an AFE job.

    @param job Job object which has id and owner properties.

    @return Absolute GS URL to the results log directory.

    """
    return _JobEvent.get_gs_url(_get_job_logdir(job))