1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import logging, os, re, shutil, time
6
7from autotest_lib.client.bin import utils
8from autotest_lib.client.common_lib import error
9from autotest_lib.client.cros import cros_logging, sys_power
10#pylint: disable=W0611
11from autotest_lib.client.cros import flimflam_test_path
12import flimflam
13
14class Suspender(object):
15    """Class for suspend/resume measurements.
16
17    Public attributes:
18        disconnect_3G_time: Amount of seconds it took to disable 3G.
19        successes[]: List of timing measurement dicts from successful suspends.
20        failures[]: List of SuspendFailure exceptions from failed suspends.
21        device_times[]: List of individual device suspend/resume time dicts.
22
23    Public methods:
24        suspend: Do a suspend/resume cycle. Return timing measurement dict.
25
26    Private attributes:
27        _logs: Array of /var/log/messages lines since start of suspend cycle.
28        _log_file: Open file descriptor at the end of /var/log/messages.
29        _logdir: Directory to store firmware logs in case of errors.
30        _suspend: Set to the sys_power suspend function to use.
31        _throw: Set to have SuspendFailure exceptions raised to the caller.
32        _reset_pm_print_times: Set to deactivate pm_print_times after the test.
33        _restart_tlsdated: Set to restart tlsdated after the test.
34
35    Private methods:
36        __init__: Shuts off tlsdated for duration of test, disables 3G
37        __del__: Restore tlsdated (must run eventually, but GC delay no problem)
38        _set_pm_print_times: Enable/disable kernel device suspend timing output.
39        _check_failure_log: Check /sys/.../suspend_stats for new failures.
40        _ts: Returns a timestamp from /var/run/power_manager/last_resume_timings
41        _hwclock_ts: Read RTC timestamp left on resume in hwclock-on-resume
42        _device_resume_time: Read seconds overall device resume took from logs.
43        _individual_device_times: Reads individual device suspend/resume times.
44        _identify_driver: Return the driver name of a device (or "unknown").
45    """
46
47    # board-specific "time to suspend" values determined empirically
48    # TODO: migrate to separate file with http://crosbug.com/38148
49    _DEFAULT_SUSPEND_DELAY = 5
50    _SUSPEND_DELAY = {
51        # TODO: Reevaluate this when http://crosbug.com/38460 is fixed
52        'daisy': 6,
53        'daisy_spring': 6,
54        'peach_pit': 6,
55
56        # TODO: Reevaluate these when http://crosbug.com/38225 is fixed
57        'x86-mario': 6,
58        'x86-alex': 5,
59
60        # Lumpy and Stumpy need high values, because it seems to mitigate their
61        # RTC interrupt problem. See http://crosbug.com/36004
62        'lumpy': 5,
63        'stumpy': 5,
64
65        # RTS5209 card reader has a really bad staging driver, can take ~1 sec
66        'butterfly': 4,
67
68        # Hard disk sync and overall just slow
69        'parrot': 8,
70        'kiev': 9,
71    }
72
73    # alarm/not_before value guaranteed to raise SpuriousWakeup in _hwclock_ts
74    _ALARM_FORCE_EARLY_WAKEUP = 2147483647
75
76    # File written by send_metrics_on_resume containing timing information about
77    # the last resume.
78    _TIMINGS_FILE = '/var/run/power_manager/root/last_resume_timings'
79
80    # Amount of lines to dump from the eventlog on a SpuriousWakeup. Should be
81    # enough to include ACPI Wake Reason... 10 should be far on the safe side.
82    _RELEVANT_EVENTLOG_LINES = 10
83
84    # Sanity check value to catch overlong resume times (from missed RTC wakes)
85    _MAX_RESUME_TIME = 10
86
87    # File written by powerd_suspend containing the hwclock time at resume.
88    HWCLOCK_FILE = '/var/run/power_manager/root/hwclock-on-resume'
89
90    def __init__(self, logdir, method=sys_power.do_suspend,
91                 throw=False, device_times=False):
92        """Prepare environment for suspending."""
93        self.disconnect_3G_time = 0
94        self.successes = []
95        self.failures = []
96        self._logdir = logdir
97        self._suspend = method
98        self._throw = throw
99        self._reset_pm_print_times = False
100        self._restart_tlsdated = False
101        self._log_file = None
102        if device_times:
103            self.device_times = []
104
105        # stop tlsdated, make sure we/hwclock have /dev/rtc for ourselves
106        if utils.system_output('initctl status tlsdated').find('start') != -1:
107            utils.system('initctl stop tlsdated')
108            self._restart_tlsdated = True
109            # give process's file descriptors time to asynchronously tear down
110            time.sleep(0.1)
111
112        # prime powerd_suspend RTC timestamp saving and make sure hwclock works
113        utils.open_write_close(self.HWCLOCK_FILE, '')
114        hwclock_output = utils.system_output('hwclock -r --debug --utc',
115                                             ignore_status=True)
116        if not re.search('Using.*/dev interface to.*clock', hwclock_output):
117            raise error.TestError('hwclock cannot find rtc: ' + hwclock_output)
118
119        # activate device suspend timing debug output
120        if hasattr(self, 'device_times'):
121            if not int(utils.read_one_line('/sys/power/pm_print_times')):
122                self._set_pm_print_times(True)
123                self._reset_pm_print_times = True
124
125        # Shut down 3G to remove its variability from suspend time measurements
126        flim = flimflam.FlimFlam()
127        service = flim.FindCellularService(0)
128        if service:
129            logging.info('Found 3G interface, disconnecting.')
130            start_time = time.time()
131            (success, status) = flim.DisconnectService(
132                    service=service, wait_timeout=60)
133            if success:
134                logging.info('3G disconnected successfully.')
135                self.disconnect_3G_time = time.time() - start_time
136            else:
137                logging.error('Could not disconnect: %s.', status)
138                self.disconnect_3G_time = -1
139
140
141    def _set_pm_print_times(self, on):
142        """Enable/disable extra suspend timing output from powerd to syslog."""
143        if utils.system('echo %s > /sys/power/pm_print_times' % int(bool(on)),
144                ignore_status=True):
145            logging.warning('Failed to set pm_print_times to %s', bool(on))
146            del self.device_times
147            self._reset_pm_print_times = False
148        else:
149            logging.info('Device resume times set to %s', bool(on))
150
151
152    def _get_board(self):
153        """Remove _freon from get_board if found."""
154	return (utils.get_board().replace("_freon", ""))
155
156
157    def _reset_logs(self):
158        """Throw away cached log lines and reset log pointer to current end."""
159        if self._log_file:
160            self._log_file.close()
161        self._log_file = open('/var/log/messages')
162        self._log_file.seek(0, os.SEEK_END)
163        self._logs = []
164
165
166    def _update_logs(self, retries=11):
167        """
168        Read all lines logged since last reset into log cache. Block until last
169        powerd_suspend resume message was read, raise if it takes too long.
170        """
171        finished_regex = re.compile(r'powerd_suspend\[\d+\]: Resume finished')
172        for retry in xrange(retries + 1):
173            lines = self._log_file.readlines()
174            if lines:
175                if self._logs and self._logs[-1][-1] != '\n':
176                    # Reassemble line that was cut in the middle
177                    self._logs[-1] += lines.pop(0)
178                self._logs += lines
179            for line in reversed(self._logs):
180                if (finished_regex.search(line)):
181                    return
182            time.sleep(0.005 * 2**retry)
183
184        raise error.TestError("Sanity check failed: did not try to suspend.")
185
186
187    def _ts(self, name, retries=11):
188        """Searches logs for last timestamp with a given suspend message."""
189        # Occasionally need to retry due to races from process wakeup order
190        for retry in xrange(retries + 1):
191            try:
192                f = open(self._TIMINGS_FILE)
193                for line in f:
194                    words = line.split('=')
195                    if name == words[0]:
196                        try:
197                            timestamp = float(words[1])
198                        except ValueError:
199                            logging.warning('Invalid timestamp: %s', line)
200                            timestamp = 0
201                        return timestamp
202            except IOError:
203                pass
204            time.sleep(0.005 * 2**retry)
205
206        raise error.TestError('Could not find %s entry.' % name)
207
208
209    def _hwclock_ts(self, not_before, retries=3):
210        """Read the RTC resume timestamp saved by powerd_suspend."""
211        for retry in xrange(retries + 1):
212            early_wakeup = False
213            if os.path.exists(self.HWCLOCK_FILE):
214                match = re.search(r'([0-9]+) seconds since .+ (-?[0-9.]+) sec',
215                                  utils.read_file(self.HWCLOCK_FILE), re.DOTALL)
216                if match:
217                    seconds = int(match.group(1)) + float(match.group(2))
218                    logging.debug('RTC resume timestamp read: %f', seconds)
219                    if seconds >= not_before:
220                        return seconds
221                    early_wakeup = True
222            time.sleep(0.05 * retry)
223        if early_wakeup:
224            logging.debug('Early wakeup, dumping eventlog if it exists:\n')
225            elog = utils.system_output('mosys eventlog list | tail -n %d' %
226                    self._RELEVANT_EVENTLOG_LINES, ignore_status=True)
227            wake_elog = (['unknown'] + re.findall(r'Wake Source.*', elog))[-1]
228            for line in reversed(self._logs):
229                match = re.search(r'PM1_STS: WAK.*', line)
230                if match:
231                    wake_syslog = match.group(0)
232                    break
233            else:
234                wake_syslog = 'unknown'
235            for b, e, s in sys_power.SpuriousWakeupError.S3_WHITELIST:
236                if (re.search(b, utils.get_board()) and
237                        re.search(e, wake_elog) and re.search(s, wake_syslog)):
238                    logging.warning('Whitelisted spurious wake in S3: %s | %s',
239                                    wake_elog, wake_syslog)
240                    return None
241            raise sys_power.SpuriousWakeupError('Spurious wake in S3: %s | %s'
242                    % (wake_elog, wake_syslog))
243        if self._get_board() in ['lumpy', 'stumpy', 'kiev']:
244            logging.debug('RTC read failure (crosbug/36004), dumping nvram:\n' +
245                    utils.system_output('mosys nvram dump', ignore_status=True))
246            return None
247        raise error.TestError('Broken RTC timestamp: ' +
248                              utils.read_file(self.HWCLOCK_FILE))
249
250
251    def _firmware_resume_time(self):
252        """Calculate seconds for firmware resume from logged TSC. (x86 only)"""
253        if utils.get_arch() not in ['i686', 'x86_64']:
254            # TODO: support this on ARM somehow
255            return 0
256        regex = re.compile(r'TSC at resume: (\d+)$')
257        freq = 1000 * int(utils.read_one_line(
258                '/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq'))
259        for line in reversed(self._logs):
260            match = regex.search(line)
261            if match:
262                return float(match.group(1)) / freq
263
264        raise error.TestError('Failed to find TSC resume value in syslog.')
265
266
267    def _device_resume_time(self):
268        """Read amount of seconds for overall device resume from syslog."""
269        regex = re.compile(r'PM: resume of devices complete after ([0-9.]+)')
270        for line in reversed(self._logs):
271            match = regex.search(line)
272            if match:
273                return float(match.group(1)) / 1000
274
275        raise error.TestError('Failed to find device resume time in syslog.')
276
277
278    def _individual_device_times(self, start_resume):
279        """Return dict of individual device suspend and resume times."""
280        self.device_times.append(dict())
281        regex = re.compile(r'call ([^ ]+)\+ returned 0 after ([0-9]+) usecs')
282        for line in self._logs:
283            match = regex.search(line)
284            if match:
285                key = 'seconds_dev_' + match.group(1).replace(':', '-')
286                secs = float(match.group(2)) / 1e6
287                if cros_logging.extract_kernel_timestamp(line) > start_resume:
288                    key += '_resume'
289                else:
290                    key += '_suspend'
291                if key in self.device_times[-1]:
292                    logging.warning('Duplicate entry for %s: +%f', key, secs)
293                    self.device_times[-1][key] += secs
294                else:
295                    logging.debug('%s: %f', key, secs)
296                    self.device_times[-1][key] = secs
297
298
299    def _identify_driver(self, device):
300        """Return the driver name of a device (or "unknown")."""
301        for path, subdirs, _ in os.walk('/sys/devices'):
302            if device in subdirs:
303                node = os.path.join(path, device, 'driver')
304                if not os.path.exists(node):
305                    return "unknown"
306                return os.path.basename(os.path.realpath(node))
307        else:
308            return "unknown"
309
310
311    def _check_for_errors(self):
312        """Find and identify suspend errors. Return True iff we should retry."""
313        warning_regex = re.compile(r' kernel: \[.*WARNING:')
314        abort_regex = re.compile(r' kernel: \[.*Freezing of tasks abort'
315                r'| powerd_suspend\[.*Cancel suspend at kernel'
316                r'| kernel: \[.*PM: Wakeup pending, aborting suspend')
317        # rsyslogd can put this out of order with dmesg, so track in variable
318        fail_regex = re.compile(r'powerd_suspend\[\d+\]: Error')
319        failed = False
320
321        # TODO(scottz): warning_monitor crosbug.com/38092
322        for i in xrange(len(self._logs)):
323            line = self._logs[i]
324            if warning_regex.search(line):
325                # match the source file from the WARNING line, and the
326                # actual error text by peeking one or two lines below that
327                src = cros_logging.strip_timestamp(line)
328                text = cros_logging.strip_timestamp(self._logs[i + 1]) + '\n' \
329                     + cros_logging.strip_timestamp(self._logs[i + 2])
330                for p1, p2 in sys_power.KernelError.WHITELIST:
331                    if re.search(p1, src) and re.search(p2, text):
332                        logging.info('Whitelisted KernelError: %s', src)
333                        break
334                else:
335                    raise sys_power.KernelError("%s\n%s" % (src, text))
336            if abort_regex.search(line):
337                wake_source = 'unknown'
338                match = re.search(r'last active wakeup source: (.*)$',
339                        '\n'.join(self._logs[i-5:i+3]), re.MULTILINE)
340                if match:
341                    wake_source = match.group(1)
342                driver = self._identify_driver(wake_source)
343                for b, w in sys_power.SpuriousWakeupError.S0_WHITELIST:
344                    if (re.search(b, utils.get_board()) and
345                            re.search(w, wake_source)):
346                        logging.warning('Whitelisted spurious wake before '
347                                        'S3: %s | %s', wake_source, driver)
348                        return True
349                if "rtc" in driver:
350                    raise sys_power.SuspendTimeout('System took too '
351                                                   'long to suspend.')
352                raise sys_power.SpuriousWakeupError('Spurious wake '
353                        'before S3: %s | %s' % (wake_source, driver))
354            if fail_regex.search(line):
355                failed = True
356        if failed:
357            raise sys_power.SuspendFailure('Unidentified problem.')
358        return False
359
360
361    def suspend(self, duration=10):
362        """
363        Do a single suspend for 'duration' seconds. Estimates the amount of time
364        it takes to suspend for a board (see _SUSPEND_DELAY), so the actual RTC
365        wakeup delay will be longer. Returns None on errors, or raises the
366        exception when _throw is set. Returns a dict of general measurements,
367        or a tuple (general_measurements, individual_device_times) when
368        _device_times is set.
369
370        @param duration: time in seconds to do a suspend prior to waking.
371        """
372        try:
373            iteration = len(self.failures) + len(self.successes) + 1
374
375            # Retry suspend in case we hit a known (whitelisted) bug
376            for _ in xrange(10):
377                self._reset_logs()
378                utils.system('sync')
379                board_delay = self._SUSPEND_DELAY.get(self._get_board(),
380                        self._DEFAULT_SUSPEND_DELAY)
381                try:
382                    alarm = self._suspend(duration + board_delay)
383                except sys_power.SpuriousWakeupError:
384                    # might be another error, we check for it ourselves below
385                    alarm = self._ALARM_FORCE_EARLY_WAKEUP
386
387                if os.path.exists('/sys/firmware/log'):
388                    for msg in re.findall(r'^.*ERROR.*$',
389                            utils.read_file('/sys/firmware/log'), re.M):
390                        for board, pattern in sys_power.FirmwareError.WHITELIST:
391                            if (re.search(board, utils.get_board()) and
392                                    re.search(pattern, msg)):
393                                logging.info('Whitelisted FW error: ' + msg)
394                                break
395                        else:
396                            firmware_log = os.path.join(self._logdir,
397                                    'firmware.log.' + str(iteration))
398                            shutil.copy('/sys/firmware/log', firmware_log)
399                            logging.info('Saved firmware log: ' + firmware_log)
400                            raise sys_power.FirmwareError(msg.strip('\r\n '))
401
402                self._update_logs()
403                if not self._check_for_errors():
404                    hwclock_ts = self._hwclock_ts(alarm)
405                    if hwclock_ts:
406                        break
407            else:
408                raise error.TestWarn('Ten tries failed due to whitelisted bug')
409
410            # calculate general measurements
411            start_resume = self._ts('start_resume_time')
412            kernel_down = (self._ts('end_suspend_time') -
413                           self._ts('start_suspend_time'))
414            kernel_up = self._ts('end_resume_time') - start_resume
415            devices_up = self._device_resume_time()
416            total_up = hwclock_ts - alarm
417            firmware_up = self._firmware_resume_time()
418            board_up = total_up - kernel_up - firmware_up
419            try:
420                cpu_up = self._ts('cpu_ready_time', 0) - start_resume
421            except error.TestError:
422                # can be missing on non-SMP machines
423                cpu_up = None
424            if total_up > self._MAX_RESUME_TIME:
425                raise error.TestError('Sanity check failed: missed RTC wakeup.')
426
427            logging.info('Success(%d): %g down, %g up, %g board, %g firmware, '
428                         '%g kernel, %g cpu, %g devices',
429                         iteration, kernel_down, total_up, board_up,
430                         firmware_up, kernel_up, cpu_up, devices_up)
431            self.successes.append({
432                'seconds_system_suspend': kernel_down,
433                'seconds_system_resume': total_up,
434                'seconds_system_resume_firmware': firmware_up + board_up,
435                'seconds_system_resume_firmware_cpu': firmware_up,
436                'seconds_system_resume_firmware_ec': board_up,
437                'seconds_system_resume_kernel': kernel_up,
438                'seconds_system_resume_kernel_cpu': cpu_up,
439                'seconds_system_resume_kernel_dev': devices_up,
440                })
441
442            if hasattr(self, 'device_times'):
443                self._individual_device_times(start_resume)
444                return (self.successes[-1], self.device_times[-1])
445            else:
446                return self.successes[-1]
447
448        except sys_power.SuspendFailure as ex:
449            message = '%s(%d): %s' % (type(ex).__name__, iteration, ex)
450            logging.error(message)
451            self.failures.append(ex)
452            if self._throw:
453                if type(ex).__name__ in ['KernelError', 'SuspendTimeout']:
454                    raise error.TestWarn(message)
455                else:
456                    raise error.TestFail(message)
457            return None
458
459
460    def finalize(self):
461        """Restore normal environment (not turning 3G back on for now...)"""
462        if os.path.exists(self.HWCLOCK_FILE):
463            os.remove(self.HWCLOCK_FILE)
464            if self._restart_tlsdated:
465                utils.system('initctl start tlsdated')
466            if self._reset_pm_print_times:
467                self._set_pm_print_times(False)
468
469
470    def __del__(self):
471        self.finalize()
472