1#!/usr/bin/python 2# 3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. 4# Use of this source code is governed by a BSD-style license that can be 5# found in the LICENSE file. 6 7import datetime as datetime_base 8import logging 9from datetime import datetime 10 11import common 12 13from autotest_lib.server import utils 14from autotest_lib.server.cros.dynamic_suite import reporting 15from autotest_lib.server.cros.dynamic_suite import reporting_utils 16 17 18class BoardNotAvailableError(utils.TestLabException): 19 """Raised when a board is not available in the lab.""" 20 21 22class NotEnoughDutsError(utils.TestLabException): 23 """Rasied when the lab doesn't have the minimum number of duts.""" 24 25 26class SimpleTimer(object): 27 """A simple timer used to periodically check if a deadline has passed.""" 28 29 def _reset(self): 30 """Reset the deadline.""" 31 if not self.interval_hours or self.interval_hours < 0: 32 logging.error('Bad interval %s', self.interval_hours) 33 self.deadline = None 34 return 35 self.deadline = datetime.now() + datetime_base.timedelta( 36 hours=self.interval_hours) 37 38 39 def __init__(self, interval_hours=0.5): 40 """Initialize a simple periodic deadline timer. 41 42 @param interval_hours: Interval of the deadline. 43 """ 44 self.interval_hours = interval_hours 45 self._reset() 46 47 48 def poll(self): 49 """Poll the timer to see if we've hit the deadline. 50 51 This method resets the deadline if it has passed. If the deadline 52 hasn't been set, or the current time is less than the deadline, the 53 method returns False. 54 55 @return: True if the deadline has passed, False otherwise. 56 """ 57 if not self.deadline or datetime.now() < self.deadline: 58 return False 59 self._reset() 60 return True 61 62 63class JobTimer(object): 64 """Utility class capable of measuring job timeouts. 65 """ 66 67 # Format used in datetime - string conversion. 68 time_format = '%m-%d-%Y [%H:%M:%S]' 69 70 def __init__(self, job_created_time, timeout_mins): 71 """JobTimer constructor. 72 73 @param job_created_time: float representing the time a job was 74 created. Eg: time.time() 75 @param timeout_mins: float representing the timeout in minutes. 76 """ 77 self.job_created_time = datetime.fromtimestamp(job_created_time) 78 self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0) 79 self.debug_output_timer = SimpleTimer(interval_hours=0.5) 80 self.past_halftime = False 81 82 83 @classmethod 84 def format_time(cls, datetime_obj): 85 """Get the string formatted version of the datetime object. 86 87 @param datetime_obj: A datetime.datetime object. 88 Eg: datetime.datetime.now() 89 90 @return: A formatted string containing the date/time of the 91 input datetime. 92 """ 93 return datetime_obj.strftime(cls.time_format) 94 95 96 def elapsed_time(self): 97 """Get the time elapsed since this job was created. 98 99 @return: A timedelta object representing the elapsed time. 100 """ 101 return datetime.now() - self.job_created_time 102 103 104 def is_suite_timeout(self): 105 """Check if the suite timed out. 106 107 @return: True if more than timeout_hours has elapsed since the suite job 108 was created. 109 """ 110 if self.elapsed_time() >= self.timeout_hours: 111 logging.info('Suite timed out. Started on %s, timed out on %s', 112 self.format_time(self.job_created_time), 113 self.format_time(datetime.now())) 114 return True 115 return False 116 117 118 def first_past_halftime(self): 119 """Check if we just crossed half time. 120 121 This method will only return True once, the first time it is called 122 after a job's elapsed time is past half its timeout. 123 124 @return True: If this is the first call of the method after halftime. 125 """ 126 if (not self.past_halftime and 127 self.elapsed_time() > self.timeout_hours/2): 128 self.past_halftime = True 129 return True 130 return False 131 132 133class RPCHelper(object): 134 """A class to help diagnose a suite run through the rpc interface. 135 """ 136 137 def __init__(self, rpc_interface): 138 """Constructor for rpc helper class. 139 140 @param rpc_interface: An rpc object, eg: A RetryingAFE instance. 141 """ 142 self.rpc_interface = rpc_interface 143 # Any bug filed during diagnosis. Generally these bugs are critical so 144 # there should only be one. 145 self.bug = '' 146 147 148 def diagnose_pool(self, board, pool, time_delta_hours, limit=5): 149 """Log diagnostic information about a timeout for a board/pool. 150 151 @param board: The board for which the current suite was run. 152 @param pool: The pool against which the current suite was run. 153 @param time_delta_hours: The time from which we should log information. 154 This is a datetime.timedelta object, as stored by the JobTimer. 155 @param limit: The maximum number of jobs per host, to log. 156 157 @raises proxy.JSONRPCException: For exceptions thrown across the wire. 158 """ 159 hosts = self.rpc_interface.get_hosts( 160 multiple_labels=('pool:%s' % pool, 'board:%s' % board)) 161 if not hosts: 162 logging.warning('No hosts found for board:%s in pool:%s', 163 board, pool) 164 return 165 cutoff = datetime.now() - time_delta_hours 166 for host in hosts: 167 jobs = self.rpc_interface.get_host_queue_entries( 168 host__id=host.id, started_on__gte=str(cutoff)) 169 job_info = '' 170 for job in jobs[-limit:]: 171 job_info += ('%s %s started on: %s status %s\n' % 172 (job.id, job.job.name, job.started_on, job.status)) 173 logging.error('host:%s, status:%s, locked: %s' 174 '\nlabels: %s\nLast %s jobs within %s:\n%s', 175 host.hostname, host.status, host.locked, host.labels, 176 limit, time_delta_hours, job_info) 177 178 179 def check_dut_availability(self, board, pool, minimum_duts=0): 180 """Check if DUT availability for a given board and pool is less than 181 minimum. 182 183 @param board: The board to check DUT availability. 184 @param pool: The pool to check DUT availability. 185 @param minimum_duts: Minimum Number of available machines required to 186 run the suite. Default is set to 0, which means do 187 not force the check of available machines before 188 running the suite. 189 @raise: NotEnoughDutsError if DUT availability is lower than minimum. 190 @raise: BoardNotAvailableError if no host found for requested 191 board/pool. 192 """ 193 if minimum_duts == 0: 194 return 195 196 hosts = self.rpc_interface.get_hosts( 197 invalid=False, 198 multiple_labels=('pool:%s' % pool, 'board:%s' % board)) 199 if not hosts: 200 raise BoardNotAvailableError( 201 'No hosts found for board:%s in pool:%s. The test lab ' 202 'currently does not cover test for this board and pool.'% 203 (board, pool)) 204 205 if len(hosts) < minimum_duts: 206 logging.debug('The total number of DUTs for %s in pool:%s is %d, ' 207 'which is no more than the required minimum number of' 208 ' available DUTS of %d. Minimum available DUT rule is' 209 ' not enforced.', board, pool, len(hosts), 210 minimum_duts) 211 return 212 213 # TODO(dshi): Replace the hard coded string with enum value, 214 # models.Host.Status.REPAIRING and REPAIR_FAILED 215 # setup_django_environment can't be imported now as paygen server does 216 # not have django package. 217 bad_statuses = ('Repair Failed', 'Repairing', 'Verifying') 218 unusable_hosts = [] 219 available_hosts = [] 220 for host in hosts: 221 if host.status in bad_statuses or host.locked: 222 unusable_hosts.append(host.hostname) 223 else: 224 available_hosts.append(host) 225 logging.debug('%d of %d DUTs are available for board %s pool %s.', 226 len(available_hosts), len(hosts), board, pool) 227 if len(available_hosts) < minimum_duts: 228 if unusable_hosts: 229 pool_health_bug = reporting.PoolHealthBug( 230 pool, board, unusable_hosts) 231 self.bug = reporting.Reporter().report(pool_health_bug)[0] 232 raise NotEnoughDutsError( 233 'Number of available DUTs for board %s pool %s is %d, which' 234 ' is less than the minimum value %d. ' 235 'Filed https://crbug.com/%s' % 236 (board, pool, len(available_hosts), minimum_duts, self.bug)) 237 238 239 def diagnose_job(self, job_id, instance_server): 240 """Diagnose a suite job. 241 242 Logs information about the jobs that are still to run in the suite. 243 244 @param job_id: The id of the suite job to get information about. 245 No meaningful information gets logged if the id is for a sub-job. 246 @param instance_server: The instance server. 247 Eg: cautotest, cautotest-cq, localhost. 248 """ 249 incomplete_jobs = self.rpc_interface.get_jobs( 250 parent_job_id=job_id, summary=True, 251 hostqueueentry__complete=False) 252 if incomplete_jobs: 253 logging.info('\n%s printing summary of incomplete jobs (%s):\n', 254 JobTimer.format_time(datetime.now()), 255 len(incomplete_jobs)) 256 for job in incomplete_jobs: 257 logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:], 258 reporting_utils.link_job(job.id, instance_server)) 259 else: 260 logging.info('All jobs in suite have already completed.') 261