1# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5import collections, logging, os, re, shutil, time 6 7from autotest_lib.client.bin import utils 8from autotest_lib.client.common_lib import base_utils, error 9from autotest_lib.client.cros import cros_logging, sys_power 10from autotest_lib.client.cros import power_utils 11from autotest_lib.client.cros import power_status 12#pylint: disable=W0611 13from autotest_lib.client.cros import flimflam_test_path 14import flimflam 15 16class Suspender(object): 17 """Class for suspend/resume measurements. 18 19 Public attributes: 20 disconnect_3G_time: Amount of seconds it took to disable 3G. 21 successes[]: List of timing measurement dicts from successful suspends. 22 failures[]: List of SuspendFailure exceptions from failed suspends. 23 device_times[]: List of individual device suspend/resume time dicts. 24 25 Public methods: 26 suspend: Do a suspend/resume cycle. Return timing measurement dict. 27 28 Private attributes: 29 _logs: Array of /var/log/messages lines since start of suspend cycle. 30 _log_file: Open file descriptor at the end of /var/log/messages. 31 _logdir: Directory to store firmware logs in case of errors. 32 _suspend: Set to the sys_power suspend function to use. 33 _throw: Set to have SuspendFailure exceptions raised to the caller. 34 _reset_pm_print_times: Set to deactivate pm_print_times after the test. 35 _restart_tlsdated: Set to restart tlsdated after the test. 36 37 Private methods: 38 __init__: Shuts off tlsdated for duration of test, disables 3G 39 __del__: Restore tlsdated (must run eventually, but GC delay no problem) 40 _set_pm_print_times: Enable/disable kernel device suspend timing output. 41 _check_failure_log: Check /sys/.../suspend_stats for new failures. 42 _ts: Returns a timestamp from /var/run/power_manager/last_resume_timings 43 _hwclock_ts: Read RTC timestamp left on resume in hwclock-on-resume 44 _device_resume_time: Read seconds overall device resume took from logs. 45 _individual_device_times: Reads individual device suspend/resume times. 46 _identify_driver: Return the driver name of a device (or "unknown"). 47 """ 48 49 # board-specific "time to suspend" values determined empirically 50 # TODO: migrate to separate file with http://crosbug.com/38148 51 _DEFAULT_SUSPEND_DELAY = 5 52 _SUSPEND_DELAY = { 53 # TODO: Reevaluate this when http://crosbug.com/38460 is fixed 54 'daisy': 6, 55 'daisy_spring': 6, 56 'peach_pit': 6, 57 58 # TODO: Reevaluate these when http://crosbug.com/38225 is fixed 59 'x86-mario': 6, 60 'x86-alex': 5, 61 62 # Lumpy and Stumpy need high values, because it seems to mitigate their 63 # RTC interrupt problem. See http://crosbug.com/36004 64 'lumpy': 5, 65 'stumpy': 5, 66 67 # RTS5209 card reader has a really bad staging driver, can take ~1 sec 68 'butterfly': 4, 69 70 # Hard disk sync and overall just slow 71 'parrot': 8, 72 'kiev': 9, 73 } 74 75 # alarm/not_before value guaranteed to raise SpuriousWakeup in _hwclock_ts 76 _ALARM_FORCE_EARLY_WAKEUP = 2147483647 77 78 # File written by send_metrics_on_resume containing timing information about 79 # the last resume. 80 _TIMINGS_FILE = '/var/run/power_manager/root/last_resume_timings' 81 82 # Amount of lines to dump from the eventlog on a SpuriousWakeup. Should be 83 # enough to include ACPI Wake Reason... 10 should be far on the safe side. 84 _RELEVANT_EVENTLOG_LINES = 10 85 86 # Sanity check value to catch overlong resume times (from missed RTC wakes) 87 _MAX_RESUME_TIME = 10 88 89 # File written by powerd_suspend containing the hwclock time at resume. 90 HWCLOCK_FILE = '/var/run/power_manager/root/hwclock-on-resume' 91 92 # File read by powerd to decide on the state to suspend (mem or freeze). 93 _SUSPEND_STATE_PREF_FILE = 'suspend_to_idle' 94 95 def __init__(self, logdir, method=sys_power.do_suspend, 96 throw=False, device_times=False, suspend_state=''): 97 """ 98 Prepare environment for suspending. 99 @param suspend_state: Suspend state to enter into. It can be 100 'mem' or 'freeze' or an empty string. If 101 the suspend state is an empty string, 102 system suspends to the default pref. 103 """ 104 self.disconnect_3G_time = 0 105 self.successes = [] 106 self.failures = [] 107 self._logdir = logdir 108 self._suspend = method 109 self._throw = throw 110 self._reset_pm_print_times = False 111 self._restart_tlsdated = False 112 self._log_file = None 113 self._suspend_state = suspend_state 114 if device_times: 115 self.device_times = [] 116 117 # stop tlsdated, make sure we/hwclock have /dev/rtc for ourselves 118 if utils.system_output('initctl status tlsdated').find('start') != -1: 119 utils.system('initctl stop tlsdated') 120 self._restart_tlsdated = True 121 # give process's file descriptors time to asynchronously tear down 122 time.sleep(0.1) 123 124 # prime powerd_suspend RTC timestamp saving and make sure hwclock works 125 utils.open_write_close(self.HWCLOCK_FILE, '') 126 hwclock_output = utils.system_output('hwclock -r --debug --utc', 127 ignore_status=True) 128 if not re.search('Using.*/dev interface to.*clock', hwclock_output): 129 raise error.TestError('hwclock cannot find rtc: ' + hwclock_output) 130 131 # activate device suspend timing debug output 132 if hasattr(self, 'device_times'): 133 if not int(utils.read_one_line('/sys/power/pm_print_times')): 134 self._set_pm_print_times(True) 135 self._reset_pm_print_times = True 136 137 # Shut down 3G to remove its variability from suspend time measurements 138 flim = flimflam.FlimFlam() 139 service = flim.FindCellularService(0) 140 if service: 141 logging.info('Found 3G interface, disconnecting.') 142 start_time = time.time() 143 (success, status) = flim.DisconnectService( 144 service=service, wait_timeout=60) 145 if success: 146 logging.info('3G disconnected successfully.') 147 self.disconnect_3G_time = time.time() - start_time 148 else: 149 logging.error('Could not disconnect: %s.', status) 150 self.disconnect_3G_time = -1 151 152 self._configure_suspend_state() 153 154 def _configure_suspend_state(self): 155 """Configure the suspend state as requested.""" 156 if self._suspend_state: 157 available_suspend_states = utils.read_one_line('/sys/power/state') 158 if self._suspend_state not in available_suspend_states: 159 raise error.TestNAError('Invalid suspend state: ' + 160 self._suspend_state) 161 # Check the current state. If it is same as the one requested, 162 # we don't want to call PowerPrefChanger(restarts powerd). 163 if self._suspend_state == power_utils.get_sleep_state(): 164 return 165 should_freeze = '1' if self._suspend_state == 'freeze' else '0' 166 new_prefs = {self._SUSPEND_STATE_PREF_FILE: should_freeze} 167 self._power_pref_changer = power_utils.PowerPrefChanger(new_prefs) 168 169 def _set_pm_print_times(self, on): 170 """Enable/disable extra suspend timing output from powerd to syslog.""" 171 if utils.system('echo %s > /sys/power/pm_print_times' % int(bool(on)), 172 ignore_status=True): 173 logging.warning('Failed to set pm_print_times to %s', bool(on)) 174 del self.device_times 175 self._reset_pm_print_times = False 176 else: 177 logging.info('Device resume times set to %s', bool(on)) 178 179 180 def _get_board(self): 181 """Remove _freon from get_board if found.""" 182 return (utils.get_board().replace("_freon", "")) 183 184 185 def _reset_logs(self): 186 """Throw away cached log lines and reset log pointer to current end.""" 187 if self._log_file: 188 self._log_file.close() 189 self._log_file = open('/var/log/messages') 190 self._log_file.seek(0, os.SEEK_END) 191 self._logs = [] 192 193 194 def _update_logs(self, retries=11): 195 """ 196 Read all lines logged since last reset into log cache. Block until last 197 powerd_suspend resume message was read, raise if it takes too long. 198 """ 199 finished_regex = re.compile(r'powerd_suspend\[\d+\]: Resume finished') 200 for retry in xrange(retries + 1): 201 lines = self._log_file.readlines() 202 if lines: 203 if self._logs and self._logs[-1][-1] != '\n': 204 # Reassemble line that was cut in the middle 205 self._logs[-1] += lines.pop(0) 206 self._logs += lines 207 for line in reversed(self._logs): 208 if (finished_regex.search(line)): 209 return 210 time.sleep(0.005 * 2**retry) 211 212 raise error.TestError("Sanity check failed: did not try to suspend.") 213 214 215 def _ts(self, name, retries=11): 216 """Searches logs for last timestamp with a given suspend message.""" 217 # Occasionally need to retry due to races from process wakeup order 218 for retry in xrange(retries + 1): 219 try: 220 f = open(self._TIMINGS_FILE) 221 for line in f: 222 words = line.split('=') 223 if name == words[0]: 224 try: 225 timestamp = float(words[1]) 226 except ValueError: 227 logging.warning('Invalid timestamp: %s', line) 228 timestamp = 0 229 return timestamp 230 except IOError: 231 pass 232 time.sleep(0.005 * 2**retry) 233 234 raise error.TestError('Could not find %s entry.' % name) 235 236 237 def _hwclock_ts(self, not_before, retries=3): 238 """Read the RTC resume timestamp saved by powerd_suspend.""" 239 for retry in xrange(retries + 1): 240 early_wakeup = False 241 if os.path.exists(self.HWCLOCK_FILE): 242 match = re.search(r'(.+\w)\s+(-?[0-9.]+) seconds', 243 utils.read_file(self.HWCLOCK_FILE), re.DOTALL) 244 if match: 245 timeval = time.strptime(match.group(1), 246 "%a %b %d %H:%M:%S %Y") 247 seconds = time.mktime(timeval) 248 seconds += float(match.group(2)) 249 logging.debug('RTC resume timestamp read: %f', seconds) 250 if seconds >= not_before: 251 return seconds 252 early_wakeup = True 253 time.sleep(0.05 * retry) 254 if early_wakeup: 255 logging.debug('Early wakeup, dumping eventlog if it exists:\n') 256 elog = utils.system_output('mosys eventlog list | tail -n %d' % 257 self._RELEVANT_EVENTLOG_LINES, ignore_status=True) 258 wake_elog = (['unknown'] + re.findall(r'Wake Source.*', elog))[-1] 259 for line in reversed(self._logs): 260 match = re.search(r'PM1_STS: WAK.*', line) 261 if match: 262 wake_syslog = match.group(0) 263 break 264 else: 265 wake_syslog = 'unknown' 266 for b, e, s in sys_power.SpuriousWakeupError.S3_WHITELIST: 267 if (re.search(b, utils.get_board()) and 268 re.search(e, wake_elog) and re.search(s, wake_syslog)): 269 logging.warning('Whitelisted spurious wake in S3: %s | %s', 270 wake_elog, wake_syslog) 271 return None 272 raise sys_power.SpuriousWakeupError('Spurious wake in S3: %s | %s' 273 % (wake_elog, wake_syslog)) 274 if self._get_board() in ['lumpy', 'stumpy', 'kiev']: 275 logging.debug('RTC read failure (crosbug/36004), dumping nvram:\n' + 276 utils.system_output('mosys nvram dump', ignore_status=True)) 277 return None 278 raise error.TestError('Broken RTC timestamp: ' + 279 utils.read_file(self.HWCLOCK_FILE)) 280 281 282 def _firmware_resume_time(self): 283 """Calculate seconds for firmware resume from logged TSC. (x86 only)""" 284 if utils.get_arch() not in ['i686', 'x86_64']: 285 # TODO: support this on ARM somehow 286 return 0 287 regex_freeze = re.compile(r'PM: resume from suspend-to-idle') 288 regex_tsc = re.compile(r'TSC at resume: (\d+)$') 289 freq = 1000 * int(utils.read_one_line( 290 '/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq')) 291 for line in reversed(self._logs): 292 match_freeze = regex_freeze.search(line) 293 if match_freeze: 294 logging.info('fw resume time zero due to suspend-to-idle\n') 295 return 0 296 match_tsc = regex_tsc.search(line) 297 if match_tsc: 298 return float(match_tsc.group(1)) / freq 299 300 raise error.TestError('Failed to find TSC resume value in syslog.') 301 302 303 def _device_resume_time(self): 304 """Read amount of seconds for overall device resume from syslog.""" 305 regex = re.compile(r'PM: resume of devices complete after ([0-9.]+)') 306 for line in reversed(self._logs): 307 match = regex.search(line) 308 if match: 309 return float(match.group(1)) / 1000 310 311 raise error.TestError('Failed to find device resume time in syslog.') 312 313 def _get_phase_times(self): 314 phase_times = [] 315 regex = re.compile(r'PM: (\w+ )?(resume|suspend) of devices complete') 316 for line in self._logs: 317 match = regex.search(line) 318 if match: 319 ts = cros_logging.extract_kernel_timestamp(line) 320 phase = match.group(1) 321 if not phase: 322 phase = 'REG' 323 phase_times.append((phase.upper(), ts)) 324 return sorted(phase_times, key = lambda entry: entry[1]) 325 326 def _get_phase(self, ts, phase_table, dev): 327 for entry in phase_table: 328 #checking if timestamp was before that phase's cutoff 329 if ts < entry[1]: 330 return entry[0] 331 raise error.TestError('Device %s has a timestamp after all devices %s', 332 dev, 'had already resumed') 333 334 def _individual_device_times(self, start_resume): 335 """Return dict of individual device suspend and resume times.""" 336 self.device_times.append(dict()) 337 dev_details = collections.defaultdict(dict) 338 regex = re.compile(r'call ([^ ]+)\+ returned 0 after ([0-9]+) usecs') 339 phase_table = self._get_phase_times() 340 for line in self._logs: 341 match = regex.search(line) 342 if match: 343 device = match.group(1).replace(':', '-') 344 key = 'seconds_dev_' + device 345 secs = float(match.group(2)) / 1e6 346 ts = cros_logging.extract_kernel_timestamp(line) 347 if ts > start_resume: 348 key += '_resume' 349 else: 350 key += '_suspend' 351 #looking if we're in a special phase 352 phase = self._get_phase(ts, phase_table, device) 353 dev = dev_details[key] 354 if phase in dev: 355 logging.warning('Duplicate %s entry for device %s, +%f', phase, 356 device, secs) 357 dev[phase] += secs 358 else: 359 dev[phase] = secs 360 361 for dev_key, dev in dev_details.iteritems(): 362 total_secs = sum(dev.values()) 363 self.device_times[-1][dev_key] = total_secs 364 report = '%s: %f TOT' % (dev_key, total_secs) 365 for phase in dev.keys(): 366 if phase is 'REG': 367 continue 368 report += ', %f %s' % (dev[phase], phase) 369 logging.debug(report) 370 371 def _identify_driver(self, device): 372 """Return the driver name of a device (or "unknown").""" 373 for path, subdirs, _ in os.walk('/sys/devices'): 374 if device in subdirs: 375 node = os.path.join(path, device, 'driver') 376 if not os.path.exists(node): 377 return "unknown" 378 return os.path.basename(os.path.realpath(node)) 379 else: 380 return "unknown" 381 382 383 def _check_for_errors(self, ignore_kernel_warns): 384 """Find and identify suspend errors. 385 386 @param ignore_kernel_warns: Ignore kernel errors. 387 388 @returns: True iff we should retry. 389 390 @raises: 391 sys_power.KernelError: for non-whitelisted kernel failures. 392 sys_power.SuspendTimeout: took too long to enter suspend. 393 sys_power.SpuriousWakeupError: woke too soon from suspend. 394 sys_power.SuspendFailure: unidentified failure. 395 """ 396 warning_regex = re.compile(r' kernel: \[.*WARNING:') 397 abort_regex = re.compile(r' kernel: \[.*Freezing of tasks abort' 398 r'| powerd_suspend\[.*Cancel suspend at kernel' 399 r'| kernel: \[.*PM: Wakeup pending, aborting suspend') 400 # rsyslogd can put this out of order with dmesg, so track in variable 401 fail_regex = re.compile(r'powerd_suspend\[\d+\]: Error') 402 failed = False 403 404 # TODO(scottz): warning_monitor crosbug.com/38092 405 log_len = len(self._logs) 406 for i in xrange(log_len): 407 line = self._logs[i] 408 if warning_regex.search(line): 409 # match the source file from the WARNING line, and the 410 # actual error text by peeking one or two lines below that 411 src = cros_logging.strip_timestamp(line) 412 text = '' 413 if i+1 < log_len: 414 text = cros_logging.strip_timestamp(self._logs[i + 1]) 415 if i+2 < log_len: 416 text += '\n' + cros_logging.strip_timestamp( 417 self._logs[i + 2]) 418 for p1, p2 in sys_power.KernelError.WHITELIST: 419 if re.search(p1, src) and re.search(p2, text): 420 logging.info('Whitelisted KernelError: %s', src) 421 break 422 else: 423 if ignore_kernel_warns: 424 logging.warn('Non-whitelisted KernelError: %s', src) 425 else: 426 raise sys_power.KernelError("%s\n%s" % (src, text)) 427 if abort_regex.search(line): 428 wake_source = 'unknown' 429 match = re.search(r'last active wakeup source: (.*)$', 430 '\n'.join(self._logs[i-5:i+3]), re.MULTILINE) 431 if match: 432 wake_source = match.group(1) 433 driver = self._identify_driver(wake_source) 434 for b, w in sys_power.SpuriousWakeupError.S0_WHITELIST: 435 if (re.search(b, utils.get_board()) and 436 re.search(w, wake_source)): 437 logging.warning('Whitelisted spurious wake before ' 438 'S3: %s | %s', wake_source, driver) 439 return True 440 if "rtc" in driver: 441 raise sys_power.SuspendTimeout('System took too ' 442 'long to suspend.') 443 raise sys_power.SpuriousWakeupError('Spurious wake ' 444 'before S3: %s | %s' % (wake_source, driver)) 445 if fail_regex.search(line): 446 failed = True 447 448 if failed: 449 raise sys_power.SuspendFailure('Unidentified problem.') 450 return False 451 452 def suspend(self, duration=10, ignore_kernel_warns=False): 453 """ 454 Do a single suspend for 'duration' seconds. Estimates the amount of time 455 it takes to suspend for a board (see _SUSPEND_DELAY), so the actual RTC 456 wakeup delay will be longer. Returns None on errors, or raises the 457 exception when _throw is set. Returns a dict of general measurements, 458 or a tuple (general_measurements, individual_device_times) when 459 _device_times is set. 460 461 @param duration: time in seconds to do a suspend prior to waking. 462 @param ignore_kernel_warns: Ignore kernel errors. Defaults to false. 463 """ 464 465 if power_utils.get_sleep_state() == 'freeze': 466 self._s0ix_residency_stats = power_status.S0ixResidencyStats() 467 468 try: 469 iteration = len(self.failures) + len(self.successes) + 1 470 # Retry suspend in case we hit a known (whitelisted) bug 471 for _ in xrange(10): 472 self._reset_logs() 473 utils.system('sync') 474 board_delay = self._SUSPEND_DELAY.get(self._get_board(), 475 self._DEFAULT_SUSPEND_DELAY) 476 try: 477 alarm = self._suspend(duration + board_delay) 478 except sys_power.SpuriousWakeupError: 479 # might be another error, we check for it ourselves below 480 alarm = self._ALARM_FORCE_EARLY_WAKEUP 481 482 if os.path.exists('/sys/firmware/log'): 483 for msg in re.findall(r'^.*ERROR.*$', 484 utils.read_file('/sys/firmware/log'), re.M): 485 for board, pattern in sys_power.FirmwareError.WHITELIST: 486 if (re.search(board, utils.get_board()) and 487 re.search(pattern, msg)): 488 logging.info('Whitelisted FW error: ' + msg) 489 break 490 else: 491 firmware_log = os.path.join(self._logdir, 492 'firmware.log.' + str(iteration)) 493 shutil.copy('/sys/firmware/log', firmware_log) 494 logging.info('Saved firmware log: ' + firmware_log) 495 raise sys_power.FirmwareError(msg.strip('\r\n ')) 496 497 self._update_logs() 498 if not self._check_for_errors(ignore_kernel_warns): 499 hwclock_ts = self._hwclock_ts(alarm) 500 if hwclock_ts: 501 break 502 503 else: 504 raise error.TestWarn('Ten tries failed due to whitelisted bug') 505 506 # calculate general measurements 507 start_resume = self._ts('start_resume_time') 508 kernel_down = (self._ts('end_suspend_time') - 509 self._ts('start_suspend_time')) 510 kernel_up = self._ts('end_resume_time') - start_resume 511 devices_up = self._device_resume_time() 512 total_up = hwclock_ts - alarm 513 firmware_up = self._firmware_resume_time() 514 board_up = total_up - kernel_up - firmware_up 515 try: 516 cpu_up = self._ts('cpu_ready_time', 0) - start_resume 517 except error.TestError: 518 # can be missing on non-SMP machines 519 cpu_up = None 520 if total_up > self._MAX_RESUME_TIME: 521 raise error.TestError('Sanity check failed: missed RTC wakeup.') 522 523 logging.info('Success(%d): %g down, %g up, %g board, %g firmware, ' 524 '%g kernel, %g cpu, %g devices', 525 iteration, kernel_down, total_up, board_up, 526 firmware_up, kernel_up, cpu_up, devices_up) 527 528 if hasattr(self, '_s0ix_residency_stats'): 529 s0ix_residency_secs = \ 530 self._s0ix_residency_stats.\ 531 get_accumulated_residency_secs() 532 if not s0ix_residency_secs: 533 raise sys_power.S0ixResidencyNotChanged( 534 'S0ix residency did not change.') 535 logging.info('S0ix residency : %d secs.', s0ix_residency_secs) 536 537 self.successes.append({ 538 'seconds_system_suspend': kernel_down, 539 'seconds_system_resume': total_up, 540 'seconds_system_resume_firmware': firmware_up + board_up, 541 'seconds_system_resume_firmware_cpu': firmware_up, 542 'seconds_system_resume_firmware_ec': board_up, 543 'seconds_system_resume_kernel': kernel_up, 544 'seconds_system_resume_kernel_cpu': cpu_up, 545 'seconds_system_resume_kernel_dev': devices_up, 546 }) 547 548 if hasattr(self, 'device_times'): 549 self._individual_device_times(start_resume) 550 return (self.successes[-1], self.device_times[-1]) 551 else: 552 return self.successes[-1] 553 554 except sys_power.SuspendFailure as ex: 555 message = '%s(%d): %s' % (type(ex).__name__, iteration, ex) 556 logging.error(message) 557 self.failures.append(ex) 558 if self._throw: 559 if type(ex).__name__ in ['KernelError', 'SuspendTimeout']: 560 raise error.TestWarn(message) 561 else: 562 raise error.TestFail(message) 563 return None 564 565 566 def finalize(self): 567 """Restore normal environment (not turning 3G back on for now...)""" 568 if os.path.exists(self.HWCLOCK_FILE): 569 os.remove(self.HWCLOCK_FILE) 570 if self._restart_tlsdated: 571 utils.system('initctl start tlsdated') 572 if self._reset_pm_print_times: 573 self._set_pm_print_times(False) 574 if hasattr(self, '_power_pref_changer'): 575 self._power_pref_changer.finalize() 576 577 578 def __del__(self): 579 self.finalize() 580