1#!/usr/bin/python
2#
3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7import datetime as datetime_base
8import logging
9from datetime import datetime
10
11import common
12
13from autotest_lib.server import utils
14from autotest_lib.server.cros.dynamic_suite import reporting
15from autotest_lib.server.cros.dynamic_suite import reporting_utils
16
17
18class BoardNotAvailableError(utils.TestLabException):
19    """Raised when a board is not available in the lab."""
20
21
22class NotEnoughDutsError(utils.TestLabException):
23    """Rasied when the lab doesn't have the minimum number of duts."""
24
25
26class SimpleTimer(object):
27    """A simple timer used to periodically check if a deadline has passed."""
28
29    def _reset(self):
30        """Reset the deadline."""
31        if not self.interval_hours or self.interval_hours < 0:
32            logging.error('Bad interval %s', self.interval_hours)
33            self.deadline = None
34            return
35        self.deadline = datetime.now() + datetime_base.timedelta(
36                hours=self.interval_hours)
37
38
39    def __init__(self, interval_hours=0.5):
40        """Initialize a simple periodic deadline timer.
41
42        @param interval_hours: Interval of the deadline.
43        """
44        self.interval_hours = interval_hours
45        self._reset()
46
47
48    def poll(self):
49        """Poll the timer to see if we've hit the deadline.
50
51        This method resets the deadline if it has passed. If the deadline
52        hasn't been set, or the current time is less than the deadline, the
53        method returns False.
54
55        @return: True if the deadline has passed, False otherwise.
56        """
57        if not self.deadline or datetime.now() < self.deadline:
58            return False
59        self._reset()
60        return True
61
62
63class JobTimer(object):
64    """Utility class capable of measuring job timeouts.
65    """
66
67    # Format used in datetime - string conversion.
68    time_format = '%m-%d-%Y [%H:%M:%S]'
69
70    def __init__(self, job_created_time, timeout_mins):
71        """JobTimer constructor.
72
73        @param job_created_time: float representing the time a job was
74            created. Eg: time.time()
75        @param timeout_mins: float representing the timeout in minutes.
76        """
77        self.job_created_time = datetime.fromtimestamp(job_created_time)
78        self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0)
79        self.debug_output_timer = SimpleTimer(interval_hours=0.5)
80        self.past_halftime = False
81
82
83    @classmethod
84    def format_time(cls, datetime_obj):
85        """Get the string formatted version of the datetime object.
86
87        @param datetime_obj: A datetime.datetime object.
88            Eg: datetime.datetime.now()
89
90        @return: A formatted string containing the date/time of the
91            input datetime.
92        """
93        return datetime_obj.strftime(cls.time_format)
94
95
96    def elapsed_time(self):
97        """Get the time elapsed since this job was created.
98
99        @return: A timedelta object representing the elapsed time.
100        """
101        return datetime.now() - self.job_created_time
102
103
104    def is_suite_timeout(self):
105        """Check if the suite timed out.
106
107        @return: True if more than timeout_hours has elapsed since the suite job
108            was created.
109        """
110        if self.elapsed_time() >= self.timeout_hours:
111            logging.info('Suite timed out. Started on %s, timed out on %s',
112                         self.format_time(self.job_created_time),
113                         self.format_time(datetime.now()))
114            return True
115        return False
116
117
118    def first_past_halftime(self):
119        """Check if we just crossed half time.
120
121        This method will only return True once, the first time it is called
122        after a job's elapsed time is past half its timeout.
123
124        @return True: If this is the first call of the method after halftime.
125        """
126        if (not self.past_halftime and
127            self.elapsed_time() > self.timeout_hours/2):
128            self.past_halftime = True
129            return True
130        return False
131
132
133class RPCHelper(object):
134    """A class to help diagnose a suite run through the rpc interface.
135    """
136
137    def __init__(self, rpc_interface):
138        """Constructor for rpc helper class.
139
140        @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
141        """
142        self.rpc_interface = rpc_interface
143        # Any bug filed during diagnosis. Generally these bugs are critical so
144        # there should only be one.
145        self.bug = ''
146
147
148    def diagnose_pool(self, board, pool, time_delta_hours, limit=5):
149        """Log diagnostic information about a timeout for a board/pool.
150
151        @param board: The board for which the current suite was run.
152        @param pool: The pool against which the current suite was run.
153        @param time_delta_hours: The time from which we should log information.
154            This is a datetime.timedelta object, as stored by the JobTimer.
155        @param limit: The maximum number of jobs per host, to log.
156
157        @raises proxy.JSONRPCException: For exceptions thrown across the wire.
158        """
159        hosts = self.rpc_interface.get_hosts(
160                multiple_labels=('pool:%s' % pool, 'board:%s' % board))
161        if not hosts:
162            logging.warning('No hosts found for board:%s in pool:%s',
163                            board, pool)
164            return
165        cutoff = datetime.now() - time_delta_hours
166        for host in hosts:
167            jobs = self.rpc_interface.get_host_queue_entries(
168                    host__id=host.id, started_on__gte=str(cutoff))
169            job_info = ''
170            for job in jobs[-limit:]:
171                job_info += ('%s %s started on: %s status %s\n' %
172                        (job.id, job.job.name, job.started_on, job.status))
173            logging.error('host:%s, status:%s, locked: %s'
174                          '\nlabels: %s\nLast %s jobs within %s:\n%s',
175                          host.hostname, host.status, host.locked, host.labels,
176                          limit, time_delta_hours, job_info)
177
178
179    def check_dut_availability(self, board, pool, minimum_duts=0):
180        """Check if DUT availability for a given board and pool is less than
181        minimum.
182
183        @param board: The board to check DUT availability.
184        @param pool: The pool to check DUT availability.
185        @param minimum_duts: Minimum Number of available machines required to
186                             run the suite. Default is set to 0, which means do
187                             not force the check of available machines before
188                             running the suite.
189        @raise: NotEnoughDutsError if DUT availability is lower than minimum.
190        @raise: BoardNotAvailableError if no host found for requested
191                board/pool.
192        """
193        if minimum_duts == 0:
194            return
195
196        hosts = self.rpc_interface.get_hosts(
197                invalid=False,
198                multiple_labels=('pool:%s' % pool, 'board:%s' % board))
199        if not hosts:
200            raise BoardNotAvailableError(
201                    'No hosts found for board:%s in pool:%s. The test lab '
202                    'currently does not cover test for this board and pool.'%
203                    (board, pool))
204
205        if len(hosts) < minimum_duts:
206            logging.debug('The total number of DUTs for %s in pool:%s is %d, '
207                          'which is no more than the required minimum number of'
208                          ' available DUTS of %d. Minimum available DUT rule is'
209                          ' not enforced.', board, pool, len(hosts),
210                          minimum_duts)
211            return
212
213        # TODO(dshi): Replace the hard coded string with enum value,
214        # models.Host.Status.REPAIRING and REPAIR_FAILED
215        # setup_django_environment can't be imported now as paygen server does
216        # not have django package.
217        bad_statuses = ('Repair Failed', 'Repairing', 'Verifying')
218        unusable_hosts = []
219        available_hosts = []
220        for host in hosts:
221            if host.status in bad_statuses or host.locked:
222                unusable_hosts.append(host.hostname)
223            else:
224                available_hosts.append(host)
225        logging.debug('%d of %d DUTs are available for board %s pool %s.',
226                      len(available_hosts), len(hosts), board, pool)
227        if len(available_hosts) < minimum_duts:
228            if unusable_hosts:
229                pool_health_bug = reporting.PoolHealthBug(
230                        pool, board, unusable_hosts)
231                self.bug = reporting.Reporter().report(pool_health_bug)[0]
232            raise NotEnoughDutsError(
233                    'Number of available DUTs for board %s pool %s is %d, which'
234                    ' is less than the minimum value %d. '
235                    'Filed https://crbug.com/%s' %
236                    (board, pool, len(available_hosts), minimum_duts, self.bug))
237
238
239    def diagnose_job(self, job_id, instance_server):
240        """Diagnose a suite job.
241
242        Logs information about the jobs that are still to run in the suite.
243
244        @param job_id: The id of the suite job to get information about.
245            No meaningful information gets logged if the id is for a sub-job.
246        @param instance_server: The instance server.
247            Eg: cautotest, cautotest-cq, localhost.
248        """
249        incomplete_jobs = self.rpc_interface.get_jobs(
250                parent_job_id=job_id, summary=True,
251                hostqueueentry__complete=False)
252        if incomplete_jobs:
253            logging.info('\n%s printing summary of incomplete jobs (%s):\n',
254                         JobTimer.format_time(datetime.now()),
255                         len(incomplete_jobs))
256            for job in incomplete_jobs:
257                logging.info('%s: %s', job.testname[job.testname.rfind('/')+1:],
258                             reporting_utils.link_job(job.id, instance_server))
259        else:
260            logging.info('All jobs in suite have already completed.')
261