# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import os
import time

from autotest_lib.client.common_lib import error
from autotest_lib.client.cros import constants
from autotest_lib.client.cros.crash_test import CrashTest as CrashTestDefs
from autotest_lib.server import test


class platform_KernelErrorPaths(test.test):
    """Performs various kernel crash tests and makes sure that the expected
    results are found in the crash report."""
    version = 1

    def _run_client_command(self, command):
        """
        Run a crash-provoking shell command on the client in the background.

        @param command: shell command to execute on the client. It is
                embedded inside a double-quoted 'sh -c' string, so it must
                not itself contain unescaped double quotes.
        """
        try:
            # Simply sending the trigger into lkdtm resets the target
            # immediately, leaving files unsaved to disk and the master ssh
            # connection wedged for a long time. The sequence below borrowed
            # from logging_KernelCrashServer.py makes sure that the test
            # proceeds smoothly.
            self.client.run(
                    'sh -c "sync; sleep 1; %s" >/dev/null 2>&1 &' % command)
        except error.AutoservRunError:
            # It is expected that this will cause a non-zero exit status.
            pass

    def _provoke_crash(self, interface, trigger, cpu):
        """
        This test is ensuring that the machine will reboot on any
        type of kernel panic.  If the sysctls below are not set
        correctly, the machine will not reboot.  After verifying
        that the machine has the proper sysctl state, we make it
        reboot by writing to lkdtm.

        @param interface: which filesystem interface to write into
        @param trigger: the text string to write for triggering a crash
        @param cpu: None or a specific cpu number to pin before crashing
        """
        # Both commands exit non-zero (and so raise) if the sysctl does not
        # have the expected value.
        self.client.run('sysctl kernel.panic|grep "kernel.panic = -1"')
        self.client.run('sysctl kernel.panic_on_oops|'
                        'grep "kernel.panic_on_oops = 1"')

        if cpu is not None:
            # Run on a specific CPU using taskset
            command = "echo %s | taskset -c %d tee %s" % (trigger, cpu,
                                                          interface)
        else:
            # Run normally
            command = "echo %s > %s" % (trigger, interface)

        logging.info("KernelErrorPaths: executing '%s' on %s",
                     command, self.client.hostname)
        self._run_client_command(command)

    def _exists_on_client(self, f):
        """
        Check whether a path can be listed on the client.

        @param f: path on the client to check.

        @returns True if 'ls' on the path exits with status 0.
        """
        return self.client.run('ls "%s"' % f,
                               ignore_status=True).exit_status == 0

    def _enable_consent(self):
        """ Enable consent so that crashes get stored in /var/spool/crash. """
        # Each entry is (destination on client, source file or None, owner).
        # A source of None means the destination is simply touched.
        self._consent_files = [
            (CrashTestDefs._PAUSE_FILE, None, 'chronos'),
            (CrashTestDefs._CONSENT_FILE, None, 'chronos'),
            (constants.SIGNED_POLICY_FILE, 'mock_metrics_on.policy', 'root'),
            (constants.OWNER_KEY_FILE, 'mock_metrics_owner.key', 'root'),
            ]
        for dst, src, owner in self._consent_files:
            # Keep any pre-existing file so it can be restored in cleanup.
            if self._exists_on_client(dst):
                self.client.run('mv "%s" "%s.autotest_backup"' % (dst, dst))
            if src:
                full_src = os.path.join(self.autodir, 'client/cros', src)
                self.client.send_file(full_src, dst)
            else:
                self.client.run('touch "%s"' % dst)
            self.client.run('chown "%s" "%s"' % (owner, dst))

    def _restore_consent_files(self):
        """ Restore consent files to their previous values. """
        # cleanup() may run even if run_once() never got as far as
        # _enable_consent(); in that case there is nothing to restore.
        for f, _, _ in getattr(self, '_consent_files', []):
            self.client.run('rm -f "%s"' % f)
            if self._exists_on_client('%s.autotest_backup' % f):
                self.client.run('mv "%s.autotest_backup" "%s"' % (f, f))

    def _wait_for_restart_and_check(self, boot_id, trigger, text, cpu=0,
                                    timeout=10):
        """
        Wait for panic reboot to complete and check @text in kcrash file.

        @param boot_id: Boot ID of the current boot.
        @param trigger: Text string that specifies what caused the panic/reboot.
        @param text: Text string to match in the kcrash file.
        @param cpu: CPU on which the trigger happened.
        @param timeout: Time to wait for the remote host to go down.

        @raises error.TestFail if the crash directory or kcrash files are
                missing, or if the @text string is not found in kcrash file.
        """
        try:
            self.client.wait_for_restart(
                    down_timeout=timeout,
                    down_warning=timeout,
                    old_boot_id=boot_id,
                    # Extend the default reboot timeout as some targets take
                    # longer than normal before ssh is available again.
                    timeout=self.client.DEFAULT_REBOOT_TIMEOUT * 4)
        except error.AutoservShutdownError:
            # The host never went down; dump the process list to the logs
            # before propagating the failure.
            self.client.run('ps alx')
            raise

        # give the crash_reporter some time to log the crash
        time.sleep(5)

        # check if dir /var/spool/crash exists on client or not
        if not self._exists_on_client(self._crash_log_dir):
            raise error.TestFail(
                    '%s does not exists on client' % self._crash_log_dir)

        # check if kernel.*.kcrash files are on the client or not
        kcrash_file_path = '%s/kernel.*.kcrash' % self._crash_log_dir
        if not self.client.list_files_glob(kcrash_file_path):
            raise error.TestFail('No kcrash files found on client')

        result = self.client.run('cat %s/kernel.*.kcrash' %
                                 self._crash_log_dir)
        if text not in result.stdout:
            raise error.TestFail(
                    "No '%s' in the log after sending '%s' on cpu %d" %
                    (text, trigger, cpu))

    def _client_run_output(self, cmd):
        """
        Run a command on the client and return its stripped stdout.

        @param cmd: shell command to run on the client.

        @returns stdout of the command with surrounding whitespace removed.
        """
        return self.client.run(cmd).stdout.strip()

    def _get_pid(self, comm, parent):
        """
        Fetch PID of process named comm.

        This function tries to lookup the PID for process named @comm. If
        @parent is not None, the parent process is first looked up and then the
        PID of child process matching @comm is returned. Since this method is
        typically called when processes are getting killed/re-spawned, lets
        try looking up the PID up to 10 times if there were errors.

        @param comm: Name of the process whose PID needs to be fetched.
        @param parent: Name of @comm's parent process. This parameter can be
                None.

        @returns PID of matching process, as the string printed by ps.

        @raises error.TestFail exception if PID for @comm is not found.
        """
        for _ in range(10):
            try:
                if parent:
                    ppid = self._client_run_output('ps -C %s -o pid=' % parent)
                    pid = self._client_run_output('ps --ppid %s -o pid=' % ppid)
                    new_comm = self._client_run_output('ps -p %s -o comm=' %
                                                       pid)
                    if comm != new_comm:
                        logging.info("comm mismatch: %s != %s", comm, new_comm)
                        time.sleep(1)
                        continue
                else:
                    pid = self._client_run_output('ps -C %s -o pid=' % comm)
                return pid
            except error.AutoservRunError as e:
                # ps exits non-zero when nothing matches; retry after a pause.
                logging.debug("AutoservRunError is: %s", e)
                time.sleep(1)
        raise error.TestFail("Unable to get pid. comm = %s, parent = %s"
                             % (comm, parent))

    def _trigger_sysrq_x(self):
        """ Send a single sysrq-x to the client via /proc/sysrq-trigger. """
        self._run_client_command('echo x > /proc/sysrq-trigger')

    def _test_sysrq_x(self):
        """
        Test sysrq-x.

        To help debug system hangs, we ask users to invoke alt-volume_up-x
        key combination. The kernel sysrq-x handler is what handles the
        alt-volume_up-x key combination. The sysrq-x handler in the kernel
        does the following for successive sysrq-x invocations within a 20
        second interval:
        1. Abort the chrome process whose parent is the session_manager process.
        2. Abort the X process. On Freon enabled systems, X is no longer present
           so this step is a no-op.
        3. Panic the kernel.
        This function tests the above steps.

        @raises error.TestFail if a process does not restart after sysrq-x or
                the expected text is missing from the kcrash file.
        """
        for process, parent in [('chrome', 'session_manager'),
                                ('X', None)]:
            # Compare by value: identity comparison against a string literal
            # only works by accident of interning.
            if process == 'X':
                # With Freon there is no longer an X process. Lets send the
                # sysrq_x and then continue on.
                self._trigger_sysrq_x()
                continue
            orig_pid = self._get_pid(process, parent)
            self._trigger_sysrq_x()
            for _ in range(10):
                new_pid = self._get_pid(process, parent)
                logging.info("%s's original pid was %s and new pid is %s",
                             process, orig_pid, new_pid)
                if new_pid != orig_pid:
                    break
                time.sleep(1)
            else:
                raise error.TestFail('%s did not restart on sysrq-x' % process)

        # The third sysrq-x is the one expected to panic the kernel.
        boot_id = self.client.get_boot_id()
        trigger = 'sysrq-x'
        text = 'sysrq_handle_cros_xkey'
        self._trigger_sysrq_x()
        self._wait_for_restart_and_check(boot_id, trigger, text)

    def _test_panic_path(self, lkdtm, kcrash_tuple):
        """
        Test the kernel panic paths.

        @param lkdtm: trigger string to write to the lkdtm DIRECT interface.
        @param kcrash_tuple: tuple of (trigger string for /proc/breakme or
                None, timeout for the host to go down, whether to run on all
                CPUs, text expected in the kcrash file).
        """

        # Figure out which kernel crash interface is available.
        interface = "/sys/kernel/debug/provoke-crash/DIRECT"
        trigger = lkdtm
        breakme, timeout, all_cpu, text = kcrash_tuple
        if not self._exists_on_client(interface):
            interface = "/proc/breakme"
            trigger = breakme
            logging.info("Falling back to %s", interface)

        # Find out how many cpus we have
        client_no_cpus = int(
                self.client.run('cat /proc/cpuinfo | grep processor | wc -l')
                        .stdout.strip())

        # Skip any triggers that are undefined for the given interface.
        if trigger is None:
            logging.info("Skipping unavailable trigger %s", lkdtm)
            return
        if lkdtm == "HARDLOCKUP":
            # ARM systems do not (presently) have NMI, so skip them for now.
            arch = self.client.get_arch()
            if arch.startswith('arm'):
                logging.info("Skipping %s on architecture %s.",
                             trigger, arch)
                return
            # Make sure a soft lockup detection doesn't get in the way.
            self.client.run("sysctl -w kernel.softlockup_panic=0")

        if trigger == "SPINLOCKUP":
            # This needs to be pre-triggered so the second one locks.
            self._provoke_crash(interface, trigger, None)

        # Always run on at least one cpu
        if all_cpu:
            no_cpus = client_no_cpus
        else:
            no_cpus = 1
        for cpu in range(no_cpus):
            # Delete crash results, if any
            self.client.run('rm -f %s/*' % self._crash_log_dir)
            boot_id = self.client.get_boot_id()
            # This should cause target reset.
            # Run on a specific cpu if we're running on all of them,
            # otherwise run normally
            if all_cpu:
                self._provoke_crash(interface, trigger, cpu)
            else:
                self._provoke_crash(interface, trigger, None)
            self._wait_for_restart_and_check(boot_id, trigger, text,
                                             cpu=cpu, timeout=timeout)

    def run_once(self, kcrashes, host=None):
        """
        Run the requested kernel crash tests against the host.

        @param kcrashes: comma-delimited string of crash types to test. Each
                entry is a key of kcrash_types below, 'SYSRQ_X', or 'ALL'.
        @param host: host object for the target machine.

        @raises error.TestFail if an unknown crash type was requested, or if
                any of the crash tests fail.
        """
        self.client = host
        self._enable_consent()
        self._crash_log_dir = CrashTestDefs._SYSTEM_CRASH_DIR

        # kcrash data is given by a dictionary with key lkdtm string to write
        # to /sys/kernel/debug/provoke-crash/DIRECT on the target. The dict
        # value is a tuple containing 1) the string to write to /proc/breakme
        # if lkdtm is not available, 2) the timeout, and 3) whether we run
        # the tests on all CPUs or not. Some tests take less to run than other
        # (null pointer and panic) so it would be best if we would run them on
        # all the CPUS as it wouldn't add that much time to the total.
        # The final component is the crash report string to look for in the
        # crash dump after target restarts.
        kcrash_types = {
            'LOOP' : ('softlockup', 25, False, 'BUG: soft lockup'),
            'BUG' : ('bug', 10, False, 'kernel BUG at'),
            'HUNG_TASK' : ('hungtask', 300, False, 'hung_task: blocked tasks'),
            'SOFTLOCKUP' : (None, 25, False, 'BUG: soft lockup'),
            'HARDLOCKUP' : ('nmiwatchdog', 50, False,
                            'Watchdog detected hard LOCKUP'),
            'SPINLOCKUP' : (None, 25, False, 'softlockup: hung tasks'),
            'EXCEPTION' : ('nullptr', 10, True,
                           # x86 gives "BUG: unable to" while ARM gives
                           # "Unable to".
                           'nable to handle kernel NULL pointer '
                           'dereference at'),
            'PANIC' : ('panic', 10, True, 'Kernel panic - not syncing:'),
            'CORRUPT_STACK' : (None, 10, True,
                               'stack-protector: Kernel stack is '
                               'corrupted in:')
            }

        bad_kcrashes = []

        # Expected input is comma-delimited kcrashes string
        kcrash_list = kcrashes.split(',')
        if 'SYSRQ_X' in kcrash_list or 'ALL' in kcrash_list:
            self._test_sysrq_x()
            if 'SYSRQ_X' in kcrash_list:
                kcrash_list.remove('SYSRQ_X')
        if 'ALL' in kcrash_list:
            kcrash_list = list(kcrash_types)
        for kcrash in kcrash_list:
            # Collect unknown names and report them all together at the end,
            # after the valid crash types have been exercised.
            if kcrash not in kcrash_types:
                bad_kcrashes.append(kcrash)
                continue
            self._test_panic_path(kcrash, kcrash_types[kcrash])

        if bad_kcrashes:
            raise error.TestFail("Wrong kcrash type "
                                 "requested (%s)" % str(bad_kcrashes))

    def cleanup(self):
        """ Restore the client's consent files and run the base cleanup. """
        try:
            self._restore_consent_files()
        finally:
            # Run the framework cleanup even if restoring files failed.
            test.test.cleanup(self)