# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from __future__ import print_function

import logging
import os
import time

from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import utils
from autotest_lib.client.cros import constants
from autotest_lib.client.cros.crash.crash_test import CrashTest as CrashTestDefs
from autotest_lib.server import test

class platform_KernelErrorPaths(test.test):
    """Performs various kernel crash tests and makes sure that the expected
    results are found in the crash report."""
    version = 1
    POLLING_INTERVAL_SECONDS = 5
    KCRASH_TIMEOUT_SECONDS = 120

    def _run_client_command(self, command):
        """
        Run a shell command on the client in the background.

        The command is prefixed with "sync; sleep 1" and detached with its
        output discarded.

        @param command: shell command string to run on the client.
        """
        try:
            # Simply sending the trigger into lkdtm resets the target
            # immediately, leaving files unsaved to disk and the master ssh
            # connection wedged for a long time.
            self.client.run(
                'sh -c "sync; sleep 1; %s" >/dev/null 2>&1 &' % command)
        except error.AutoservRunError as e:
            # It is expected that this will cause a non-zero exit status.
            logging.debug("Ignoring expected failure of '%s': %s", command, e)

    def _provoke_crash(self, interface, trigger, cpu):
        """
        This test is ensuring that the machine will reboot on any
        type of kernel panic.  If the sysctls below are not set
        correctly, the machine will not reboot.  After verifying
        that the machine has the proper sysctl state, we make it
        reboot by writing to lkdtm.

        @param interface: which filesystem interface to write into
        @param trigger: the text string to write for triggering a crash
        @param cpu: None or a specific cpu number to pin before crashing
        """
        # Each of these raises if grep does not find the expected value.
        self.client.run('sysctl kernel.panic|grep "kernel.panic = -1"')
        self.client.run('sysctl kernel.panic_on_oops|'
                        'grep "kernel.panic_on_oops = 1"')

        if cpu is not None:
            # Run on a specific CPU using taskset
            command = "echo %s | taskset -c %d tee %s" % (trigger, cpu,
                                                          interface)
        else:
            # Run normally
            command = "echo %s > %s" % (trigger, interface)

        logging.info("KernelErrorPaths: executing '%s' on %s",
                     command, self.client.hostname)
        self._run_client_command(command)

    def _exists_on_client(self, f):
        """
        Check whether a path exists on the client.

        @param f: path to look for on the client.

        @returns True if `ls` on the path exits with status 0.
        """
        result = self.client.run('ls "%s"' % f, ignore_status=True)
        return result.exit_status == 0

    def _enable_consent(self):
        """ Enable consent so that crashes get stored in /var/spool/crash. """
        consent_files = [
            (CrashTestDefs._PAUSE_FILE, None, 'chronos'),
            (CrashTestDefs._CONSENT_FILE, None, 'chronos'),
            (constants.SIGNED_POLICY_FILE, 'mock_metrics_on.policy', 'root'),
            (constants.OWNER_KEY_FILE, 'mock_metrics_owner.key', 'root'),
        ]
        # Record each entry only once it is being handled, so that cleanup
        # never removes a file that was not backed up first if this method
        # fails partway through.
        self._consent_files = []
        for dst, src, owner in consent_files:
            if self._exists_on_client(dst):
                self.client.run('mv "%s" "%s.autotest_backup"' % (dst, dst))
            self._consent_files.append((dst, src, owner))
            if src:
                full_src = os.path.join(self.autodir, 'client/cros', src)
                self.client.send_file(full_src, dst)
            else:
                self.client.run('touch "%s"' % dst)
            self.client.run('chown "%s" "%s"' % (owner, dst))

    def _restore_consent_files(self):
        """ Restore consent files to their previous values. """
        # The attribute is missing when consent was never enabled (for
        # example when run_once did not run); nothing to restore then.
        for f, _, _ in getattr(self, '_consent_files', []):
            self.client.run('rm -f "%s"' % f)
            if self._exists_on_client('%s.autotest_backup' % f):
                self.client.run('mv "%s.autotest_backup" "%s"' % (f, f))

    def _wait_for_restart_and_check(self, boot_id, trigger, text, cpu=0,
                                    timeout=10):
        """
        Wait for panic reboot to complete and check @text in kcrash file.

        @param boot_id: Boot ID of the current boot.
        @param trigger: Text string that specifies what caused the panic/reboot.
        @param text: Text string, or tuple of alternative text strings, to
                match in the kcrash file.
        @param cpu: CPU on which the trigger happened.
        @param timeout: Time to wait for the remote host to go down.

        @raises error.TestFail if no kcrash file appears or if the @text
                string is not found in kcrash file.
        """
        try:
            self.client.wait_for_restart(
                down_timeout=timeout,
                down_warning=timeout,
                old_boot_id=boot_id,
                # Extend the default reboot timeout as some targets take
                # longer than normal before ssh is available again.
                timeout=self.client.DEFAULT_REBOOT_TIMEOUT * 4)
        except error.AutoservShutdownError:
            # Dump the process list to help debug why the host stayed up.
            self.client.run('ps alx')
            raise

        kcrash_file_path = '%s/kernel.*.kcrash' % self._crash_log_dir

        # give the crash_reporter some time to log the crash
        try:
            utils.poll_for_condition(
                condition=lambda: self.client.list_files_glob(
                    kcrash_file_path),
                timeout=self.KCRASH_TIMEOUT_SECONDS,
                sleep_interval=self.POLLING_INTERVAL_SECONDS,
                desc="crash_reporter logging crash")
        except utils.TimeoutError:
            raise error.TestFail('No kcrash files found on client')

        result = self.client.run('cat %s' % kcrash_file_path)
        # Normalize a single string into a one-element tuple of candidates.
        match = text if isinstance(text, tuple) else (text, )
        if not any(s in result.stdout for s in match):
            raise error.TestFail(
                "'%s' not found in log after sending '%s' on cpu %d" %
                (match, trigger, cpu))

    def _client_run_output(self, cmd):
        """
        Run a command on the client and return its stripped stdout.

        @param cmd: shell command string to run on the client.

        @returns stdout of the command with surrounding whitespace removed.
        """
        return self.client.run(cmd).stdout.strip()

    def _get_pid(self, comm, parent):
        """
        Fetch PID of process named comm.

        This function tries to lookup the PID for process named @comm. If
        @parent is not None, the parent process is first looked up and then the
        PID of child process matching @comm is returned. Since this method is
        typically called when processes are getting killed/re-spawned, lets
        try looking up the PID up to 10 times if there were errors.

        @param comm: Name of the process whose PID needs to be fetched.
        @param parent: Name of @comm's parent process. This parameter can be
                       None.

        @returns PID of matching process.

        @raises error.TestFail exception if PID for @comm is not found.
        """
        for _ in range(10):
            try:
                if not parent:
                    return self._client_run_output('ps -C %s -o pid=' % comm)

                ppid = self._client_run_output('ps -C %s -o pid=' % parent)
                children = self._client_run_output(
                    'ps --ppid %s -o pid= -o comm=' % ppid).splitlines()
                # Stays None when the parent has no (parsable) children, so
                # that case is retried like any other mismatch.
                new_comm = None
                for line in children:
                    fields = line.split()
                    if len(fields) < 2:
                        continue
                    new_comm = fields[1]
                    if new_comm == comm:
                        return fields[0]
                logging.info("comm mismatch: %s != %s", comm, new_comm)
            except error.AutoservRunError as e:
                logging.debug("AutotestRunError is: %s", e)
            time.sleep(1)
        raise error.TestFail("Unable to get pid. comm = %s, parent = %s"
                             % (comm, parent))

    def _trigger_sysrq_x(self):
        """ Write 'x' to /proc/sysrq-trigger on the client. """
        self._run_client_command('echo x > /proc/sysrq-trigger')

    def _test_sysrq_x(self):
        """
        Test sysrq-x.

        To help debug system hangs, we ask users to invoke alt-volume_up-x
        key combination. The kernel sysrq-x handler is what handles the
        alt-volume_up-x key combination. The sysrq-x handler in the kernel
        does the following for successive sysrq-x invocations within a 20
        second interval:
        1. Abort the chrome process whose parent is the session_manager process.
        2. Panic the kernel.
        This function tests the above steps.
        """
        process = 'chrome'
        parent = 'session_manager'
        orig_pid = self._get_pid(process, parent)
        self._trigger_sysrq_x()
        # First sysrq-x: chrome should come back with a new pid.
        for _ in range(10):
            new_pid = self._get_pid(process, parent)
            logging.info("%s's original pid was %s and new pid is %s",
                         process, orig_pid, new_pid)
            if new_pid != orig_pid:
                break
            time.sleep(1)
        else:
            raise error.TestFail('%s did not restart on sysrq-x' % process)

        # Second sysrq-x: the kernel should panic and the host reboot.
        boot_id = self.client.get_boot_id()
        trigger = 'sysrq-x'
        text = 'sysrq_handle_cros_xkey'
        self._trigger_sysrq_x()
        self._wait_for_restart_and_check(boot_id, trigger, text)

    def _test_panic_path(self, lkdtm, kcrash_tuple):
        """
        Test the kernel panic paths.

        @param lkdtm: the string to write to the lkdtm provoke-crash interface.
        @param kcrash_tuple: tuple of (string to write to /proc/breakme or
                None, timeout to wait for the host to go down, whether to
                run on every cpu, text or tuple of texts expected in the
                kcrash file).
        """

        # Figure out which kernel crash interface is available.
        interface = "/sys/kernel/debug/provoke-crash/DIRECT"
        trigger = lkdtm
        breakme, timeout, all_cpu, text = kcrash_tuple
        if not self._exists_on_client(interface):
            interface = "/proc/breakme"
            trigger = breakme
            logging.info("Falling back to %s", interface)

        # Skip any triggers that are undefined for the given interface.
        if trigger is None:
            logging.info("Skipping unavailable trigger %s", lkdtm)
            return
        if lkdtm == "HARDLOCKUP":
            # ARM systems do not (presently) have NMI, so skip them for now.
            arch = self.client.get_arch()
            if arch.startswith('arm'):
                logging.info("Skipping %s on architecture %s.",
                             trigger, arch)
                return
            # Make sure a soft lockup detection doesn't get in the way.
            self.client.run("sysctl -w kernel.softlockup_panic=0")

        # Find out how many cpus we have.  This must be a real list (not a
        # lazy map object) because it is indexed below.
        cpuinfo = self.client.run(
            'cat /proc/cpuinfo | grep processor | cut -f 2 -d :')
        client_cpus = [int(x) for x in cpuinfo.stdout.split()]

        if trigger == "SPINLOCKUP":
            # This needs to be pre-triggered so the second one locks.
            self._provoke_crash(interface, trigger, None)

        # Always run on at least one cpu
        if all_cpu:
            which_cpus = client_cpus
        else:
            which_cpus = [client_cpus[0]]

        for cpu in which_cpus:
            # Delete crash results, if any
            self.client.run('rm -f %s/*' % self._crash_log_dir)
            boot_id = self.client.get_boot_id()
            # This should cause target reset.
            # Run on a specific cpu if we're running on all of them,
            # otherwise run normally
            self._provoke_crash(interface, trigger,
                                cpu if all_cpu else None)
            self._wait_for_restart_and_check(boot_id, trigger, text,
                                             cpu=cpu, timeout=timeout)

    def run_once(self, kcrashes, host=None):
        """
        Run the requested kernel crash tests against the host.

        @param kcrashes: comma-delimited string of crash types to test; each
                is a key of kcrash_types below, 'SYSRQ_X', or 'ALL'.
        @param host: the host object for the client under test.

        @raises error.TestFail if an unknown crash type was requested (after
                the known ones have been run) or if a crash test fails.
        """
        self.client = host
        self._enable_consent()
        self._crash_log_dir = CrashTestDefs._SYSTEM_CRASH_DIR

        # kcrash data is given by a dictionary with key lkdtm string to write
        # to /sys/kernel/debug/provoke-crash/DIRECT on the target. The dict
        # value is a tuple containing 1) the string to write to /proc/breakme
        # if lkdtm is not available, 2) the timeout, and 3) whether we run
        # the tests on all CPUs or not. Some tests take less to run than others
        # (null pointer and panic) so it would be best if we would run them on
        # all the CPUS as it wouldn't add that much time to the total.
        # The final component is the crash report string to look for in the
        # crash dump after target restarts.
        kcrash_types = {
            'BUG' : ('bug', 10, False, ('kernel BUG at', 'BUG: failure at')),
            'HUNG_TASK' : ('hungtask', 300, False, 'hung_task: blocked tasks'),
            'SOFTLOCKUP' : (None, 25, False, 'BUG: soft lockup'),
            'HARDLOCKUP' : ('nmiwatchdog', 50, False,
                            'Watchdog detected hard LOCKUP'),
            'SPINLOCKUP' : (None, 25, False, ('softlockup: hung tasks',
                                              'BUG: scheduling while atomic',
                                              'BUG: sleeping function called')),
            'EXCEPTION' : ('nullptr', 10, True,
             # Logs differ slightly between different kernels and archs (v5.4,
             # x86, ARM), but all contain 'kernel NULL pointer dereference'.
                           'kernel NULL pointer dereference'),
            'PANIC' : ('panic', 10, True, 'Kernel panic - not syncing:'),
            'CORRUPT_STACK' : (None, 10, True,
                               'stack-protector: Kernel stack is '
                               'corrupted in:')
            }

        bad_kcrashes = []

        # Expected input is comma-delimited kcrashes string; tolerate
        # whitespace around the individual names.
        kcrash_list = [name.strip() for name in kcrashes.split(',')]
        if 'SYSRQ_X' in kcrash_list or 'ALL' in kcrash_list:
            self._test_sysrq_x()
            if 'SYSRQ_X' in kcrash_list:
                kcrash_list.remove('SYSRQ_X')
        if 'ALL' in kcrash_list:
            kcrash_list = list(kcrash_types)
        for kcrash in kcrash_list:
            if kcrash not in kcrash_types:
                bad_kcrashes.append(kcrash)
                continue
            self._test_panic_path(kcrash, kcrash_types[kcrash])

        if bad_kcrashes:
            raise error.TestFail("Wrong kcrash type "
                                 "requested (%s)" % str(bad_kcrashes))

    def cleanup(self):
        """ Put back the original consent files, then run base cleanup. """
        self._restore_consent_files()
        test.test.cleanup(self)