• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
4
5from __future__ import print_function
6
7import logging, os, time
8
9from autotest_lib.client.common_lib import error
10from autotest_lib.client.common_lib import utils
11from autotest_lib.client.cros import constants
12from autotest_lib.client.cros.crash.crash_test import CrashTest as CrashTestDefs
13from autotest_lib.server import test
14
class platform_KernelErrorPaths(test.test):
    """Performs various kernel crash tests and makes sure that the expected
       results are found in the crash report."""
    version = 1
    # Seconds between two polls for the kcrash file after a reboot.
    POLLING_INTERVAL_SECONDS = 5
    # Maximum number of seconds to wait for the kcrash file to show up.
    KCRASH_TIMEOUT_SECONDS = 120

    def _run_client_command(self, command):
        """
        Run a (crash provoking) shell command on the client in the background.

        The command is wrapped so that a sync and a one second sleep happen
        first, its output is discarded and the remote shell returns
        immediately.

        @param command: shell command line to execute on the client.
        """
        try:
            # Simply sending the trigger into lkdtm resets the target
            # immediately, leaving files unsaved to disk and the master ssh
            # connection wedged for a long time.
            self.client.run(
                'sh -c "sync; sleep 1; %s" >/dev/null 2>&1 &' % command)
        except error.AutoservRunError as e:
            # It is expected that this will cause a non-zero exit status.
            logging.debug("Ignoring expected failure of '%s': %s", command, e)

    def _provoke_crash(self, interface, trigger, cpu):
        """
        This test is ensuring that the machine will reboot on any
        type of kernel panic.  If the sysctls below are not set
        correctly, the machine will not reboot.  After verifying
        that the machine has the proper sysctl state, we make it
        reboot by writing to lkdtm.

        @param interface: which filesystem interface to write into
        @param trigger: the text string to write for triggering a crash
        @param cpu: None or a specific cpu number to pin before crashing
        """
        # Both greps make client.run() fail when the sysctl has another value.
        self.client.run('sysctl kernel.panic|grep "kernel.panic = -1"')
        self.client.run('sysctl kernel.panic_on_oops|'
                        'grep "kernel.panic_on_oops = 1"')

        if cpu is not None:
            # Run on a specific CPU using taskset
            command = "echo %s | taskset -c %d tee %s" % (trigger, cpu,
                                                          interface)
        else:
            # Run normally
            command = "echo %s > %s" % (trigger, interface)

        logging.info("KernelErrorPaths: executing '%s' on %s",
                     command, self.client.hostname)
        self._run_client_command(command)

    def _exists_on_client(self, f):
        """
        Tell whether a path can be listed on the client.

        @param f: path to look for on the client.

        @returns True if 'ls' on the path exits with status 0.
        """
        result = self.client.run('ls "%s"' % f, ignore_status=True)
        return result.exit_status == 0

    def _enable_consent(self):
        """ Enable consent so that crashes get stored in /var/spool/crash. """
        # (destination on client, source file under client/cros or None for
        # an empty file, owner of the destination)
        consent_files = [
            (CrashTestDefs._PAUSE_FILE, None, 'chronos'),
            (CrashTestDefs._CONSENT_FILE, None, 'chronos'),
            (constants.SIGNED_POLICY_FILE, 'mock_metrics_on.policy', 'root'),
            (constants.OWNER_KEY_FILE, 'mock_metrics_owner.key', 'root'),
            ]
        self._consent_files = []
        for dst, src, owner in consent_files:
            if self._exists_on_client(dst):
                self.client.run('mv "%s" "%s.autotest_backup"' % (dst, dst))
            # Record the entry only once a pre-existing file has been backed
            # up, so that _restore_consent_files() never removes a file it
            # cannot bring back if this method fails half way through.
            self._consent_files.append((dst, src, owner))
            if src:
                full_src = os.path.join(self.autodir, 'client/cros', src)
                self.client.send_file(full_src, dst)
            else:
                self.client.run('touch "%s"' % dst)
            self.client.run('chown "%s" "%s"' % (owner, dst))

    def _restore_consent_files(self):
        """ Restore consent files to their previous values. """
        # _consent_files is missing when _enable_consent() never ran.
        for f, _, _ in getattr(self, '_consent_files', ()):
            self.client.run('rm -f "%s"' % f)
            if self._exists_on_client('%s.autotest_backup' % f):
                self.client.run('mv "%s.autotest_backup" "%s"' % (f, f))

    def _wait_for_restart_and_check(self, boot_id, trigger, text, cpu=0,
                                    timeout=10):
        """
        Wait for panic reboot to complete and check @text in kcrash file.

        @param boot_id: Boot ID of the current boot.
        @param trigger: Text string that specifies what caused the panic/reboot.
        @param text: Text string to match in the kcrash file, or a tuple/list
                     of alternative strings of which one has to match.
        @param cpu: CPU on which the trigger happened (only used in the
                    failure message).
        @param timeout: Time to wait for the remote host to go down.

        @raises error.TestFail if no kcrash file shows up in time or if the
                @text string is not found in kcrash file.
        @raises error.AutoservShutdownError if the host does not go down.
        """
        try:
            self.client.wait_for_restart(
                down_timeout=timeout,
                down_warning=timeout,
                old_boot_id=boot_id,
                # Extend the default reboot timeout as some targets take
                # longer than normal before ssh is available again.
                timeout=self.client.DEFAULT_REBOOT_TIMEOUT * 4)
        except error.AutoservShutdownError:
            # The host is still up: list its processes before giving up.
            self.client.run('ps alx')
            raise

        kcrash_file_path = '%s/kernel.*.kcrash' % self._crash_log_dir

        # give the crash_reporter some time to log the crash
        try:
            utils.poll_for_condition(
                    condition=lambda: self.client.list_files_glob(
                            kcrash_file_path),
                    timeout=self.KCRASH_TIMEOUT_SECONDS,
                    sleep_interval=self.POLLING_INTERVAL_SECONDS,
                    desc="crash_reporter logging crash")
        except utils.TimeoutError:
            raise error.TestFail('No kcrash files found on client')

        result = self.client.run('cat %s' % kcrash_file_path)
        if isinstance(text, (tuple, list)):
            match = tuple(text)
        else:
            match = (text, )
        if not any(s in result.stdout for s in match):
            raise error.TestFail(
                "'%s' not found in log after sending '%s' on cpu %s" %
                (match, trigger, cpu))

    def _client_run_output(self, cmd):
        """
        Run a command on the client and return its stripped stdout.

        @param cmd: shell command line to execute on the client.

        @returns stdout of the command without surrounding whitespace.
        """
        return self.client.run(cmd).stdout.strip()

    def _get_pid(self, comm, parent):
        """
        Fetch PID of process named comm.

        This function tries to lookup the PID for process named @comm. If
        @parent is not None, the parent process is first looked up and then the
        PID of child process matching @comm is returned. Since this method is
        typically called when processes are getting killed/re-spawned, lets
        try looking up the PID up to 10 times if there were errors.

        @param comm: Name of the process whose PID needs to be fetched.
        @param parent: Name of @comm's parent process. This parameter can be
                       None.

        @returns PID of matching process (as the string printed by ps).

        @raises error.TestFail exception if PID for @comm is not found.
        """
        for _ in range(10):
            try:
                if not parent:
                    return self._client_run_output('ps -C %s -o pid=' % comm)

                ppid = self._client_run_output('ps -C %s -o pid=' % parent)
                children = self._client_run_output(
                        'ps --ppid %s -o pid= -o comm=' % ppid).splitlines()
                for line in children:
                    fields = line.split()
                    # Skip blank or truncated lines instead of crashing.
                    if len(fields) >= 2 and fields[1] == comm:
                        return fields[0]
                logging.info("comm mismatch: no child of %s is named %s",
                             parent, comm)
            except error.AutoservRunError as e:
                logging.debug("AutotestRunError is: %s", e)
            # Give the process some time to get (re-)spawned.
            time.sleep(1)
        raise error.TestFail("Unable to get pid. comm = %s, parent = %s"
                             % (comm, parent))

    def _trigger_sysrq_x(self):
        """ Write 'x' into /proc/sysrq-trigger on the client. """
        self._run_client_command('echo x > /proc/sysrq-trigger')

    def _test_sysrq_x(self):
        """
        Test sysrq-x.

        To help debug system hangs, we ask users to invoke alt-volume_up-x
        key combination. The kernel sysrq-x handler is what handles the
        alt-volume_up-x key combination. The sysrq-x handler in the kernel
        does the following for successive sysrq-x invocations within a 20
        second interval:
        1. Abort the chrome process whose parent is the session_manager process.
        2. Panic the kernel.
        This function tests the above steps.

        @raises error.TestFail if chrome keeps its PID after the first
                sysrq-x or if the expected text is missing from the kcrash
                file after the second one.
        """
        process = 'chrome'
        parent = 'session_manager'
        orig_pid = self._get_pid(process, parent)

        # First sysrq-x: chrome is expected to come back with a new PID.
        self._trigger_sysrq_x()
        for _ in range(10):
            new_pid = self._get_pid(process, parent)
            logging.info("%s's original pid was %s and new pid is %s",
                         process, orig_pid, new_pid)
            if new_pid != orig_pid:
                break
            time.sleep(1)
        else:
            raise error.TestFail('%s did not restart on sysrq-x' % process)

        # Second sysrq-x: the kernel is expected to panic and reboot.
        boot_id = self.client.get_boot_id()
        trigger = 'sysrq-x'
        text = 'sysrq_handle_cros_xkey'
        self._trigger_sysrq_x()
        self._wait_for_restart_and_check(boot_id, trigger, text)

    def _test_panic_path(self, lkdtm, kcrash_tuple):
        """
        Test the kernel panic paths.

        @param lkdtm: crash type to write into the lkdtm DIRECT interface.
        @param kcrash_tuple: tuple of (trigger for /proc/breakme or None,
                             timeout for the host to go down, whether to run
                             on every cpu, text(s) expected in the kcrash
                             file).

        @raises error.TestFail if no cpu can be determined on the client or
                if the crash report check fails.
        """
        breakme, timeout, all_cpu, text = kcrash_tuple

        # Figure out which kernel crash interface is available.
        interface = "/sys/kernel/debug/provoke-crash/DIRECT"
        trigger = lkdtm
        if not self._exists_on_client(interface):
            interface = "/proc/breakme"
            trigger = breakme
            logging.info("Falling back to %s", interface)

        # Skip any triggers that are undefined for the given interface.
        if trigger is None:
            logging.info("Skipping unavailable trigger %s", lkdtm)
            return
        if lkdtm == "HARDLOCKUP":
            # ARM systems do not (presently) have NMI, so skip them for now.
            arch = self.client.get_arch()
            if arch.startswith('arm'):
                logging.info("Skipping %s on architecture %s.",
                             trigger, arch)
                return
            # Make sure a soft lockup detection doesn't get in the way.
            self.client.run("sysctl -w kernel.softlockup_panic=0")

        # Find out how many cpus we have. This has to be a real list (not a
        # map object) because it is indexed below.
        cpuinfo = self.client.run(
            'cat /proc/cpuinfo | grep processor | cut -f 2 -d :').stdout
        client_cpus = [int(cpu) for cpu in cpuinfo.split()]
        if not client_cpus:
            raise error.TestFail('No cpu found in /proc/cpuinfo on client')

        if trigger == "SPINLOCKUP":
            # This needs to be pre-triggered so the second one locks.
            self._provoke_crash(interface, trigger, None)

        # Always run on at least one cpu
        if all_cpu:
            which_cpus = client_cpus
        else:
            which_cpus = client_cpus[:1]

        for cpu in which_cpus:
            # Delete crash results, if any
            self.client.run('rm -f %s/*' % self._crash_log_dir)
            boot_id = self.client.get_boot_id()
            # This should cause target reset.
            # Run on a specific cpu if we're running on all of them,
            # otherwise run normally
            self._provoke_crash(interface, trigger,
                                cpu if all_cpu else None)
            self._wait_for_restart_and_check(boot_id, trigger, text,
                                             cpu=cpu, timeout=timeout)

    def run_once(self, kcrashes, host=None):
        """
        Provoke the requested kernel crashes and check their crash reports.

        @param kcrashes: comma-delimited string of crash types; each entry
                         is a key of kcrash_types below, 'SYSRQ_X' or 'ALL'.
        @param host: host object of the client to crash.

        @raises error.TestFail if a crash check fails or if an unknown crash
                type was requested.
        """
        self.client = host
        self._enable_consent()
        self._crash_log_dir = CrashTestDefs._SYSTEM_CRASH_DIR

        # kcrash data is given by a dictionary with key lkdtm string to write
        # to /sys/kernel/debug/provoke-crash/DIRECT on the target. The dict
        # value is a tuple containing 1) the string to write to /proc/breakme.
        # if lkdtm is not available, 2) the timeout, and 3)whether we run
        # the tests on all CPUs or not. Some tests take less to run than other
        # (null pointer and panic) so it would be best if we would run them on
        # all the CPUS as it wouldn't add that much time to the total.
        # The final component is the crash report string to look for in the
        # crash dump after target restarts.
        kcrash_types = {
            'BUG' : ('bug', 10, False, ('kernel BUG at', 'BUG: failure at')),
            'HUNG_TASK' : ('hungtask', 300, False, 'hung_task: blocked tasks'),
            'SOFTLOCKUP' : (None, 25, False, 'BUG: soft lockup'),
            'HARDLOCKUP' : ('nmiwatchdog', 50, False,
                            'Watchdog detected hard LOCKUP'),
            'SPINLOCKUP' : (None, 25, False, ('softlockup: hung tasks',
                                             'BUG: scheduling while atomic',
                                             'BUG: sleeping function called')),
            'EXCEPTION' : ('nullptr',     10, True,
             # Logs differ slightly between different kernels and archs (v5.4,
             # x86, ARM), but all contain 'kernel NULL pointer dereference'.
                           'kernel NULL pointer dereference'),
            'PANIC' : ('panic', 10, True, 'Kernel panic - not syncing:'),
            'CORRUPT_STACK' : (None, 10, True,
                               'stack-protector: Kernel stack is '
                               'corrupted in:')
            }

        bad_kcrashes = []

        # Expected input is comma-delimited kcrashes string
        kcrash_list = [kcrash.strip() for kcrash in kcrashes.split(',')]
        if 'SYSRQ_X' in kcrash_list or 'ALL' in kcrash_list:
            self._test_sysrq_x()
            if 'SYSRQ_X' in kcrash_list:
                kcrash_list.remove('SYSRQ_X')
            if 'ALL' in kcrash_list:
                kcrash_list = list(kcrash_types)
        for kcrash in kcrash_list:
            if kcrash not in kcrash_types:
                # Keep going so that every bad name is reported at once.
                bad_kcrashes.append(kcrash)
                continue
            self._test_panic_path(kcrash, kcrash_types[kcrash])

        if bad_kcrashes:
            raise error.TestFail("Wrong kcrash type "
                                 "requested (%s)" % str(bad_kcrashes))

    def cleanup(self):
        """ Restore the consent files, then run the base class cleanup. """
        try:
            self._restore_consent_files()
        finally:
            # Run the base class cleanup even if restoring files failed.
            test.test.cleanup(self)
335