1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import collections, logging, os, re, shutil, time
6
7from autotest_lib.client.bin import utils
8from autotest_lib.client.common_lib import base_utils, error
9from autotest_lib.client.cros import cros_logging, sys_power
10from autotest_lib.client.cros import power_utils
11from autotest_lib.client.cros import power_status
12#pylint: disable=W0611
13from autotest_lib.client.cros import flimflam_test_path
14import flimflam
15
16class Suspender(object):
17    """Class for suspend/resume measurements.
18
19    Public attributes:
20        disconnect_3G_time: Amount of seconds it took to disable 3G.
21        successes[]: List of timing measurement dicts from successful suspends.
22        failures[]: List of SuspendFailure exceptions from failed suspends.
23        device_times[]: List of individual device suspend/resume time dicts.
24
25    Public methods:
26        suspend: Do a suspend/resume cycle. Return timing measurement dict.
27
28    Private attributes:
29        _logs: Array of /var/log/messages lines since start of suspend cycle.
30        _log_file: Open file descriptor at the end of /var/log/messages.
31        _logdir: Directory to store firmware logs in case of errors.
32        _suspend: Set to the sys_power suspend function to use.
33        _throw: Set to have SuspendFailure exceptions raised to the caller.
34        _reset_pm_print_times: Set to deactivate pm_print_times after the test.
35        _restart_tlsdated: Set to restart tlsdated after the test.
36
37    Private methods:
38        __init__: Shuts off tlsdated for duration of test, disables 3G
39        __del__: Restore tlsdated (must run eventually, but GC delay no problem)
40        _set_pm_print_times: Enable/disable kernel device suspend timing output.
41        _check_failure_log: Check /sys/.../suspend_stats for new failures.
42        _ts: Returns a timestamp from /var/run/power_manager/last_resume_timings
43        _hwclock_ts: Read RTC timestamp left on resume in hwclock-on-resume
44        _device_resume_time: Read seconds overall device resume took from logs.
45        _individual_device_times: Reads individual device suspend/resume times.
46        _identify_driver: Return the driver name of a device (or "unknown").
47    """
48
49    # board-specific "time to suspend" values determined empirically
50    # TODO: migrate to separate file with http://crosbug.com/38148
51    _DEFAULT_SUSPEND_DELAY = 5
52    _SUSPEND_DELAY = {
53        # TODO: Reevaluate this when http://crosbug.com/38460 is fixed
54        'daisy': 6,
55        'daisy_spring': 6,
56        'peach_pit': 6,
57
58        # TODO: Reevaluate these when http://crosbug.com/38225 is fixed
59        'x86-mario': 6,
60        'x86-alex': 5,
61
62        # Lumpy and Stumpy need high values, because it seems to mitigate their
63        # RTC interrupt problem. See http://crosbug.com/36004
64        'lumpy': 5,
65        'stumpy': 5,
66
67        # RTS5209 card reader has a really bad staging driver, can take ~1 sec
68        'butterfly': 4,
69
70        # Hard disk sync and overall just slow
71        'parrot': 8,
72        'kiev': 9,
73    }
74
75    # alarm/not_before value guaranteed to raise SpuriousWakeup in _hwclock_ts
76    _ALARM_FORCE_EARLY_WAKEUP = 2147483647
77
78    # File written by send_metrics_on_resume containing timing information about
79    # the last resume.
80    _TIMINGS_FILE = '/var/run/power_manager/root/last_resume_timings'
81
82    # Amount of lines to dump from the eventlog on a SpuriousWakeup. Should be
83    # enough to include ACPI Wake Reason... 10 should be far on the safe side.
84    _RELEVANT_EVENTLOG_LINES = 10
85
86    # Sanity check value to catch overlong resume times (from missed RTC wakes)
87    _MAX_RESUME_TIME = 10
88
89    # File written by powerd_suspend containing the hwclock time at resume.
90    HWCLOCK_FILE = '/var/run/power_manager/root/hwclock-on-resume'
91
92    # File read by powerd to decide on the state to suspend (mem or freeze).
93    _SUSPEND_STATE_PREF_FILE = 'suspend_to_idle'
94
95    def __init__(self, logdir, method=sys_power.do_suspend,
96                 throw=False, device_times=False, suspend_state=''):
97        """
98        Prepare environment for suspending.
99        @param suspend_state: Suspend state to enter into. It can be
100                              'mem' or 'freeze' or an empty string. If
101                              the suspend state is an empty string,
102                              system suspends to the default pref.
103        """
104        self.disconnect_3G_time = 0
105        self.successes = []
106        self.failures = []
107        self._logdir = logdir
108        self._suspend = method
109        self._throw = throw
110        self._reset_pm_print_times = False
111        self._restart_tlsdated = False
112        self._log_file = None
113        self._suspend_state = suspend_state
114        if device_times:
115            self.device_times = []
116
117        # stop tlsdated, make sure we/hwclock have /dev/rtc for ourselves
118        if utils.system_output('initctl status tlsdated').find('start') != -1:
119            utils.system('initctl stop tlsdated')
120            self._restart_tlsdated = True
121            # give process's file descriptors time to asynchronously tear down
122            time.sleep(0.1)
123
124        # prime powerd_suspend RTC timestamp saving and make sure hwclock works
125        utils.open_write_close(self.HWCLOCK_FILE, '')
126        hwclock_output = utils.system_output('hwclock -r --debug --utc',
127                                             ignore_status=True)
128        if not re.search('Using.*/dev interface to.*clock', hwclock_output):
129            raise error.TestError('hwclock cannot find rtc: ' + hwclock_output)
130
131        # activate device suspend timing debug output
132        if hasattr(self, 'device_times'):
133            if not int(utils.read_one_line('/sys/power/pm_print_times')):
134                self._set_pm_print_times(True)
135                self._reset_pm_print_times = True
136
137        # Shut down 3G to remove its variability from suspend time measurements
138        flim = flimflam.FlimFlam()
139        service = flim.FindCellularService(0)
140        if service:
141            logging.info('Found 3G interface, disconnecting.')
142            start_time = time.time()
143            (success, status) = flim.DisconnectService(
144                    service=service, wait_timeout=60)
145            if success:
146                logging.info('3G disconnected successfully.')
147                self.disconnect_3G_time = time.time() - start_time
148            else:
149                logging.error('Could not disconnect: %s.', status)
150                self.disconnect_3G_time = -1
151
152        self._configure_suspend_state()
153
154    def _configure_suspend_state(self):
155        """Configure the suspend state as requested."""
156        if self._suspend_state:
157            available_suspend_states = utils.read_one_line('/sys/power/state')
158            if self._suspend_state not in available_suspend_states:
159                raise error.TestNAError('Invalid suspend state: ' +
160                                        self._suspend_state)
161            # Check the current state. If it is same as the one requested,
162            # we don't want to call PowerPrefChanger(restarts powerd).
163            if self._suspend_state == power_utils.get_sleep_state():
164                return
165            should_freeze = '1' if self._suspend_state == 'freeze' else '0'
166            new_prefs = {self._SUSPEND_STATE_PREF_FILE: should_freeze}
167            self._power_pref_changer = power_utils.PowerPrefChanger(new_prefs)
168
169    def _set_pm_print_times(self, on):
170        """Enable/disable extra suspend timing output from powerd to syslog."""
171        if utils.system('echo %s > /sys/power/pm_print_times' % int(bool(on)),
172                ignore_status=True):
173            logging.warning('Failed to set pm_print_times to %s', bool(on))
174            del self.device_times
175            self._reset_pm_print_times = False
176        else:
177            logging.info('Device resume times set to %s', bool(on))
178
179
180    def _get_board(self):
181        """Remove _freon from get_board if found."""
182        return (utils.get_board().replace("_freon", ""))
183
184
185    def _reset_logs(self):
186        """Throw away cached log lines and reset log pointer to current end."""
187        if self._log_file:
188            self._log_file.close()
189        self._log_file = open('/var/log/messages')
190        self._log_file.seek(0, os.SEEK_END)
191        self._logs = []
192
193
194    def _update_logs(self, retries=11):
195        """
196        Read all lines logged since last reset into log cache. Block until last
197        powerd_suspend resume message was read, raise if it takes too long.
198        """
199        finished_regex = re.compile(r'powerd_suspend\[\d+\]: Resume finished')
200        for retry in xrange(retries + 1):
201            lines = self._log_file.readlines()
202            if lines:
203                if self._logs and self._logs[-1][-1] != '\n':
204                    # Reassemble line that was cut in the middle
205                    self._logs[-1] += lines.pop(0)
206                self._logs += lines
207            for line in reversed(self._logs):
208                if (finished_regex.search(line)):
209                    return
210            time.sleep(0.005 * 2**retry)
211
212        raise error.TestError("Sanity check failed: did not try to suspend.")
213
214
215    def _ts(self, name, retries=11):
216        """Searches logs for last timestamp with a given suspend message."""
217        # Occasionally need to retry due to races from process wakeup order
218        for retry in xrange(retries + 1):
219            try:
220                f = open(self._TIMINGS_FILE)
221                for line in f:
222                    words = line.split('=')
223                    if name == words[0]:
224                        try:
225                            timestamp = float(words[1])
226                        except ValueError:
227                            logging.warning('Invalid timestamp: %s', line)
228                            timestamp = 0
229                        return timestamp
230            except IOError:
231                pass
232            time.sleep(0.005 * 2**retry)
233
234        raise error.TestError('Could not find %s entry.' % name)
235
236
237    def _hwclock_ts(self, not_before, retries=3):
238        """Read the RTC resume timestamp saved by powerd_suspend."""
239        for retry in xrange(retries + 1):
240            early_wakeup = False
241            if os.path.exists(self.HWCLOCK_FILE):
242                match = re.search(r'(.+\w)\s+(-?[0-9.]+) seconds',
243                                  utils.read_file(self.HWCLOCK_FILE), re.DOTALL)
244                if match:
245                    timeval = time.strptime(match.group(1),
246                            "%a %b %d %H:%M:%S %Y")
247                    seconds = time.mktime(timeval)
248                    seconds += float(match.group(2))
249                    logging.debug('RTC resume timestamp read: %f', seconds)
250                    if seconds >= not_before:
251                        return seconds
252                    early_wakeup = True
253            time.sleep(0.05 * retry)
254        if early_wakeup:
255            logging.debug('Early wakeup, dumping eventlog if it exists:\n')
256            elog = utils.system_output('mosys eventlog list | tail -n %d' %
257                    self._RELEVANT_EVENTLOG_LINES, ignore_status=True)
258            wake_elog = (['unknown'] + re.findall(r'Wake Source.*', elog))[-1]
259            for line in reversed(self._logs):
260                match = re.search(r'PM1_STS: WAK.*', line)
261                if match:
262                    wake_syslog = match.group(0)
263                    break
264            else:
265                wake_syslog = 'unknown'
266            for b, e, s in sys_power.SpuriousWakeupError.S3_WHITELIST:
267                if (re.search(b, utils.get_board()) and
268                        re.search(e, wake_elog) and re.search(s, wake_syslog)):
269                    logging.warning('Whitelisted spurious wake in S3: %s | %s',
270                                    wake_elog, wake_syslog)
271                    return None
272            raise sys_power.SpuriousWakeupError('Spurious wake in S3: %s | %s'
273                    % (wake_elog, wake_syslog))
274        if self._get_board() in ['lumpy', 'stumpy', 'kiev']:
275            logging.debug('RTC read failure (crosbug/36004), dumping nvram:\n' +
276                    utils.system_output('mosys nvram dump', ignore_status=True))
277            return None
278        raise error.TestError('Broken RTC timestamp: ' +
279                              utils.read_file(self.HWCLOCK_FILE))
280
281
282    def _firmware_resume_time(self):
283        """Calculate seconds for firmware resume from logged TSC. (x86 only)"""
284        if utils.get_arch() not in ['i686', 'x86_64']:
285            # TODO: support this on ARM somehow
286            return 0
287        regex_freeze = re.compile(r'PM: resume from suspend-to-idle')
288        regex_tsc = re.compile(r'TSC at resume: (\d+)$')
289        freq = 1000 * int(utils.read_one_line(
290                '/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq'))
291        for line in reversed(self._logs):
292            match_freeze = regex_freeze.search(line)
293            if match_freeze:
294                logging.info('fw resume time zero due to suspend-to-idle\n')
295                return 0
296            match_tsc = regex_tsc.search(line)
297            if match_tsc:
298                return float(match_tsc.group(1)) / freq
299
300        raise error.TestError('Failed to find TSC resume value in syslog.')
301
302
303    def _device_resume_time(self):
304        """Read amount of seconds for overall device resume from syslog."""
305        regex = re.compile(r'PM: resume of devices complete after ([0-9.]+)')
306        for line in reversed(self._logs):
307            match = regex.search(line)
308            if match:
309                return float(match.group(1)) / 1000
310
311        raise error.TestError('Failed to find device resume time in syslog.')
312
313    def _get_phase_times(self):
314        phase_times = []
315        regex = re.compile(r'PM: (\w+ )?(resume|suspend) of devices complete')
316        for line in self._logs:
317          match = regex.search(line)
318          if match:
319            ts = cros_logging.extract_kernel_timestamp(line)
320            phase = match.group(1)
321            if not phase:
322              phase = 'REG'
323            phase_times.append((phase.upper(), ts))
324        return sorted(phase_times, key = lambda entry: entry[1])
325
326    def _get_phase(self, ts, phase_table, dev):
327      for entry in phase_table:
328        #checking if timestamp was before that phase's cutoff
329        if ts < entry[1]:
330          return entry[0]
331      raise error.TestError('Device %s has a timestamp after all devices %s',
332                            dev, 'had already resumed')
333
334    def _individual_device_times(self, start_resume):
335        """Return dict of individual device suspend and resume times."""
336        self.device_times.append(dict())
337        dev_details = collections.defaultdict(dict)
338        regex = re.compile(r'call ([^ ]+)\+ returned 0 after ([0-9]+) usecs')
339        phase_table = self._get_phase_times()
340        for line in self._logs:
341          match = regex.search(line)
342          if match:
343            device = match.group(1).replace(':', '-')
344            key = 'seconds_dev_' + device
345            secs = float(match.group(2)) / 1e6
346            ts = cros_logging.extract_kernel_timestamp(line)
347            if ts > start_resume:
348              key += '_resume'
349            else:
350              key += '_suspend'
351            #looking if we're in a special phase
352            phase = self._get_phase(ts, phase_table, device)
353            dev = dev_details[key]
354            if phase in dev:
355              logging.warning('Duplicate %s entry for device %s, +%f', phase,
356                              device, secs)
357              dev[phase] += secs
358            else:
359              dev[phase] = secs
360
361        for dev_key, dev in dev_details.iteritems():
362          total_secs = sum(dev.values())
363          self.device_times[-1][dev_key] = total_secs
364          report = '%s: %f TOT' % (dev_key, total_secs)
365          for phase in dev.keys():
366            if phase is 'REG':
367              continue
368            report += ', %f %s' % (dev[phase], phase)
369          logging.debug(report)
370
371    def _identify_driver(self, device):
372        """Return the driver name of a device (or "unknown")."""
373        for path, subdirs, _ in os.walk('/sys/devices'):
374            if device in subdirs:
375                node = os.path.join(path, device, 'driver')
376                if not os.path.exists(node):
377                    return "unknown"
378                return os.path.basename(os.path.realpath(node))
379        else:
380            return "unknown"
381
382
383    def _check_for_errors(self, ignore_kernel_warns):
384        """Find and identify suspend errors.
385
386        @param ignore_kernel_warns: Ignore kernel errors.
387
388        @returns: True iff we should retry.
389
390        @raises:
391          sys_power.KernelError: for non-whitelisted kernel failures.
392          sys_power.SuspendTimeout: took too long to enter suspend.
393          sys_power.SpuriousWakeupError: woke too soon from suspend.
394          sys_power.SuspendFailure: unidentified failure.
395        """
396        warning_regex = re.compile(r' kernel: \[.*WARNING:')
397        abort_regex = re.compile(r' kernel: \[.*Freezing of tasks abort'
398                r'| powerd_suspend\[.*Cancel suspend at kernel'
399                r'| kernel: \[.*PM: Wakeup pending, aborting suspend')
400        # rsyslogd can put this out of order with dmesg, so track in variable
401        fail_regex = re.compile(r'powerd_suspend\[\d+\]: Error')
402        failed = False
403
404        # TODO(scottz): warning_monitor crosbug.com/38092
405        log_len = len(self._logs)
406        for i in xrange(log_len):
407            line = self._logs[i]
408            if warning_regex.search(line):
409                # match the source file from the WARNING line, and the
410                # actual error text by peeking one or two lines below that
411                src = cros_logging.strip_timestamp(line)
412                text = ''
413                if i+1 < log_len:
414                    text = cros_logging.strip_timestamp(self._logs[i + 1])
415                if i+2 < log_len:
416                    text += '\n' + cros_logging.strip_timestamp(
417                        self._logs[i + 2])
418                for p1, p2 in sys_power.KernelError.WHITELIST:
419                    if re.search(p1, src) and re.search(p2, text):
420                        logging.info('Whitelisted KernelError: %s', src)
421                        break
422                else:
423                    if ignore_kernel_warns:
424                        logging.warn('Non-whitelisted KernelError: %s', src)
425                    else:
426                        raise sys_power.KernelError("%s\n%s" % (src, text))
427            if abort_regex.search(line):
428                wake_source = 'unknown'
429                match = re.search(r'last active wakeup source: (.*)$',
430                        '\n'.join(self._logs[i-5:i+3]), re.MULTILINE)
431                if match:
432                    wake_source = match.group(1)
433                driver = self._identify_driver(wake_source)
434                for b, w in sys_power.SpuriousWakeupError.S0_WHITELIST:
435                    if (re.search(b, utils.get_board()) and
436                            re.search(w, wake_source)):
437                        logging.warning('Whitelisted spurious wake before '
438                                        'S3: %s | %s', wake_source, driver)
439                        return True
440                if "rtc" in driver:
441                    raise sys_power.SuspendTimeout('System took too '
442                                                   'long to suspend.')
443                raise sys_power.SpuriousWakeupError('Spurious wake '
444                        'before S3: %s | %s' % (wake_source, driver))
445            if fail_regex.search(line):
446                failed = True
447
448        if failed:
449            raise sys_power.SuspendFailure('Unidentified problem.')
450        return False
451
452    def suspend(self, duration=10, ignore_kernel_warns=False):
453        """
454        Do a single suspend for 'duration' seconds. Estimates the amount of time
455        it takes to suspend for a board (see _SUSPEND_DELAY), so the actual RTC
456        wakeup delay will be longer. Returns None on errors, or raises the
457        exception when _throw is set. Returns a dict of general measurements,
458        or a tuple (general_measurements, individual_device_times) when
459        _device_times is set.
460
461        @param duration: time in seconds to do a suspend prior to waking.
462        @param ignore_kernel_warns: Ignore kernel errors.  Defaults to false.
463        """
464
465        if power_utils.get_sleep_state() == 'freeze':
466            self._s0ix_residency_stats = power_status.S0ixResidencyStats()
467
468        try:
469            iteration = len(self.failures) + len(self.successes) + 1
470            # Retry suspend in case we hit a known (whitelisted) bug
471            for _ in xrange(10):
472                self._reset_logs()
473                utils.system('sync')
474                board_delay = self._SUSPEND_DELAY.get(self._get_board(),
475                        self._DEFAULT_SUSPEND_DELAY)
476                try:
477                    alarm = self._suspend(duration + board_delay)
478                except sys_power.SpuriousWakeupError:
479                    # might be another error, we check for it ourselves below
480                    alarm = self._ALARM_FORCE_EARLY_WAKEUP
481
482                if os.path.exists('/sys/firmware/log'):
483                    for msg in re.findall(r'^.*ERROR.*$',
484                            utils.read_file('/sys/firmware/log'), re.M):
485                        for board, pattern in sys_power.FirmwareError.WHITELIST:
486                            if (re.search(board, utils.get_board()) and
487                                    re.search(pattern, msg)):
488                                logging.info('Whitelisted FW error: ' + msg)
489                                break
490                        else:
491                            firmware_log = os.path.join(self._logdir,
492                                    'firmware.log.' + str(iteration))
493                            shutil.copy('/sys/firmware/log', firmware_log)
494                            logging.info('Saved firmware log: ' + firmware_log)
495                            raise sys_power.FirmwareError(msg.strip('\r\n '))
496
497                self._update_logs()
498                if not self._check_for_errors(ignore_kernel_warns):
499                    hwclock_ts = self._hwclock_ts(alarm)
500                    if hwclock_ts:
501                        break
502
503            else:
504                raise error.TestWarn('Ten tries failed due to whitelisted bug')
505
506            # calculate general measurements
507            start_resume = self._ts('start_resume_time')
508            kernel_down = (self._ts('end_suspend_time') -
509                           self._ts('start_suspend_time'))
510            kernel_up = self._ts('end_resume_time') - start_resume
511            devices_up = self._device_resume_time()
512            total_up = hwclock_ts - alarm
513            firmware_up = self._firmware_resume_time()
514            board_up = total_up - kernel_up - firmware_up
515            try:
516                cpu_up = self._ts('cpu_ready_time', 0) - start_resume
517            except error.TestError:
518                # can be missing on non-SMP machines
519                cpu_up = None
520            if total_up > self._MAX_RESUME_TIME:
521                raise error.TestError('Sanity check failed: missed RTC wakeup.')
522
523            logging.info('Success(%d): %g down, %g up, %g board, %g firmware, '
524                         '%g kernel, %g cpu, %g devices',
525                         iteration, kernel_down, total_up, board_up,
526                         firmware_up, kernel_up, cpu_up, devices_up)
527
528            if hasattr(self, '_s0ix_residency_stats'):
529                s0ix_residency_secs = \
530                        self._s0ix_residency_stats.\
531                                get_accumulated_residency_secs()
532                if not s0ix_residency_secs:
533                    raise sys_power.S0ixResidencyNotChanged(
534                        'S0ix residency did not change.')
535                logging.info('S0ix residency : %d secs.', s0ix_residency_secs)
536
537            self.successes.append({
538                'seconds_system_suspend': kernel_down,
539                'seconds_system_resume': total_up,
540                'seconds_system_resume_firmware': firmware_up + board_up,
541                'seconds_system_resume_firmware_cpu': firmware_up,
542                'seconds_system_resume_firmware_ec': board_up,
543                'seconds_system_resume_kernel': kernel_up,
544                'seconds_system_resume_kernel_cpu': cpu_up,
545                'seconds_system_resume_kernel_dev': devices_up,
546                })
547
548            if hasattr(self, 'device_times'):
549                self._individual_device_times(start_resume)
550                return (self.successes[-1], self.device_times[-1])
551            else:
552                return self.successes[-1]
553
554        except sys_power.SuspendFailure as ex:
555            message = '%s(%d): %s' % (type(ex).__name__, iteration, ex)
556            logging.error(message)
557            self.failures.append(ex)
558            if self._throw:
559                if type(ex).__name__ in ['KernelError', 'SuspendTimeout']:
560                    raise error.TestWarn(message)
561                else:
562                    raise error.TestFail(message)
563            return None
564
565
566    def finalize(self):
567        """Restore normal environment (not turning 3G back on for now...)"""
568        if os.path.exists(self.HWCLOCK_FILE):
569            os.remove(self.HWCLOCK_FILE)
570            if self._restart_tlsdated:
571                utils.system('initctl start tlsdated')
572            if self._reset_pm_print_times:
573                self._set_pm_print_times(False)
574        if hasattr(self, '_power_pref_changer'):
575            self._power_pref_changer.finalize()
576
577
578    def __del__(self):
579        self.finalize()
580