1# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import logging, os, time
6
7from autotest_lib.client.common_lib import error
8from autotest_lib.client.cros import constants
9from autotest_lib.client.cros.crash_test import CrashTest as CrashTestDefs
10from autotest_lib.server import test
11
12class platform_KernelErrorPaths(test.test):
13    """Performs various kernel crash tests and makes sure that the expected
14       results are found in the crash report."""
15    version = 1
16
17    def _run_client_command(self, command):
18        try:
19            # Simply sending the trigger into lkdtm resets the target
20            # immediately, leaving files unsaved to disk and the master ssh
21            # connection wedged for a long time. The sequence below borrowed
22            # from logging_KernelCrashServer.py makes sure that the test
23            # proceeds smoothly.
24            self.client.run(
25                'sh -c "sync; sleep 1; %s" >/dev/null 2>&1 &' % command)
26        except error.AutoservRunError, e:
27            # It is expected that this will cause a non-zero exit status.
28            pass
29
30    def _provoke_crash(self, interface, trigger, cpu):
31        """
32        This test is ensuring that the machine will reboot on any
33        type of kernel panic.  If the sysctls below are not set
34        correctly, the machine will not reboot.  After verifying
35        that the machine has the proper sysctl state, we make it
36        reboot by writing to lkdtm.
37
38        @param interface: which filesystem interface to write into
39        @param trigger: the text string to write for triggering a crash
40        @param cpu: None or a specific cpu number to pin before crashing
41        """
42        self.client.run('sysctl kernel.panic|grep "kernel.panic = -1"');
43        self.client.run('sysctl kernel.panic_on_oops|'
44                        'grep "kernel.panic_on_oops = 1"');
45
46        if cpu != None:
47            # Run on a specific CPU using taskset
48            command = "echo %s | taskset -c %d tee %s" % (trigger, cpu,
49                                                          interface)
50        else:
51            # Run normally
52            command = "echo %s > %s" % (trigger, interface)
53
54        logging.info("KernelErrorPaths: executing '%s' on %s",
55                     command, self.client.hostname)
56        self._run_client_command(command)
57
58    def _exists_on_client(self, f):
59        return self.client.run('ls "%s"' % f,
60                               ignore_status=True).exit_status == 0
61
62    def _enable_consent(self):
63        """ Enable consent so that crashes get stored in /var/spool/crash. """
64        self._consent_files = [
65            (CrashTestDefs._PAUSE_FILE, None, 'chronos'),
66            (CrashTestDefs._CONSENT_FILE, None, 'chronos'),
67            (constants.SIGNED_POLICY_FILE, 'mock_metrics_on.policy', 'root'),
68            (constants.OWNER_KEY_FILE, 'mock_metrics_owner.key', 'root'),
69            ]
70        for dst, src, owner in self._consent_files:
71            if self._exists_on_client(dst):
72                self.client.run('mv "%s" "%s.autotest_backup"' % (dst, dst))
73            if src:
74                full_src = os.path.join(self.autodir, 'client/cros', src)
75                self.client.send_file(full_src, dst)
76            else:
77                self.client.run('touch "%s"' % dst)
78            self.client.run('chown "%s" "%s"' % (owner, dst))
79
80    def _restore_consent_files(self):
81        """ Restore consent files to their previous values. """
82        for f, _, _ in self._consent_files:
83            self.client.run('rm -f "%s"' % f)
84            if self._exists_on_client('%s.autotest_backup' % f):
85                self.client.run('mv "%s.autotest_backup" "%s"' % (f, f))
86
87    def _wait_for_restart_and_check(self, boot_id, trigger, text, cpu=0,
88                                    timeout=10):
89        """
90        Wait for panic reboot to complete and check @text in kcrash file.
91
92        @param bootid: Boot ID of the current boot.
93        @param trigger: Text string that specifies what caused the panic/reboot.
94        @param text: Text string to match in the kcrash file.
95        @param cpu: CPU on which the trigger happened.
96        @param timeout: Time to wait for the remote host to go down.
97
98        @raises error.TestFail if the @text string is not found in kcrash file.
99        """
100        try:
101            self.client.wait_for_restart(
102                down_timeout=timeout,
103                down_warning=timeout,
104                old_boot_id=boot_id,
105                # Extend the default reboot timeout as some targets take
106                # longer than normal before ssh is available again.
107                timeout=self.client.DEFAULT_REBOOT_TIMEOUT * 4)
108        except error.AutoservShutdownError:
109            self.client.run('ps alx')
110            raise
111
112        # give the crash_reporter some time to log the crash
113        time.sleep(5)
114
115        # check if dir /var/spool/crash exists on client or not
116        if not self._exists_on_client(self._crash_log_dir):
117            raise error.TestFail(
118                '%s does not exists on client' % self._crash_log_dir)
119
120        # check if kernel.*.kcrash files are on the client or not
121        kcrash_file_path = '%s/kernel.*.kcrash' % self._crash_log_dir
122        if not self.client.list_files_glob(kcrash_file_path):
123            raise error.TestFail('No kcrash files found on client')
124
125        result = self.client.run('cat %s/kernel.*.kcrash' %
126                                 self._crash_log_dir)
127        if text not in result.stdout:
128            raise error.TestFail(
129                "No '%s' in the log after sending '%s' on cpu %d" %
130                (text, trigger, cpu))
131
132    def _client_run_output(self, cmd):
133        return self.client.run(cmd).stdout.strip()
134
135    def _get_pid(self, comm, parent):
136        """
137        Fetch PID of process named comm.
138
139        This function tries to lookup the PID for process named @comm. If
140        @parent is not None, the parent process is first looked up and then the
141        PID of child process matching @comm is returned. Since this method is
142        typically called when processes are getting killed/re-spawned, lets
143        try looking up the PID up to 10 times if there were errors.
144
145        @param comm: Name of the process whose PID needs to be fetched.
146        @param parent: Name of @comm's parent process. This parameter can be
147                       None.
148
149        @returns PID of matching process.
150
151        @raises error.TestFail exception if PID for @comm is not found.
152        """
153        for _ in range(10):
154            try:
155                if parent:
156                    ppid = self._client_run_output('ps -C %s -o pid=' % parent)
157                    pid = self._client_run_output('ps --ppid %s -o pid=' % ppid)
158                    new_comm = self._client_run_output('ps -p %s -o comm=' %
159                                                       pid)
160                    if comm != new_comm:
161                        logging.info("comm mismatch: %s != %s", comm, new_comm)
162                        time.sleep(1)
163                        continue
164                else:
165                    pid = self._client_run_output('ps -C %s -o pid=' % comm)
166                return pid
167            except error.AutoservRunError as e:
168                logging.debug("AutotestRunError is: %s", e)
169                time.sleep(1)
170        raise error.TestFail("Unable to get pid. comm = %s, parent = %s"
171                             % (comm, parent))
172
173    def _trigger_sysrq_x(self):
174        self._run_client_command('echo x > /proc/sysrq-trigger')
175
176    def _test_sysrq_x(self):
177        """
178        Test sysrq-x.
179
180        To help debug system hangs, we ask users to invoke alt-volume_up-x
181        key combination. The kernel sysrq-x handler is what handles the
182        alt-volume_up-x key combination. The sysrq-x handler in the kernel
183        does the following for successive sysrq-x invocations within a 20
184        second interval:
185        1. Abort the chrome process whose parent is the session_manager process.
186        2. Abort the X process. On Freon enabled systems, X is no longer present
187           so this step is a no-op.
188        3. Panic the kernel.
189        This function tests the above steps.
190        """
191        for process, parent in [('chrome', 'session_manager'),
192                                ('X', None)]:
193            if process is 'X':
194                # With Freon there is no longer an X process. Lets send the
195                # sysrq_x and then continue on.
196                self._trigger_sysrq_x()
197                continue
198            orig_pid = self._get_pid(process, parent)
199            self._trigger_sysrq_x()
200            for _ in range(10):
201                new_pid = self._get_pid(process, parent)
202                logging.info("%s's original pid was %s and new pid is %s",
203                              process, orig_pid, new_pid)
204                if new_pid != orig_pid:
205                    break
206                time.sleep(1)
207            else:
208                raise error.TestFail('%s did not restart on sysrq-x' % process)
209
210        boot_id = self.client.get_boot_id()
211        trigger = 'sysrq-x'
212        text = 'sysrq_handle_cros_xkey'
213        self._trigger_sysrq_x()
214        self._wait_for_restart_and_check(boot_id, trigger, text)
215
216    def _test_panic_path(self, lkdtm, kcrash_tuple):
217        """
218        Test the kernel panic paths.
219        """
220
221        # Figure out which kernel crash interface is available.
222        interface = "/sys/kernel/debug/provoke-crash/DIRECT"
223        trigger = lkdtm
224        breakme, timeout, all_cpu, text = kcrash_tuple
225        if not self._exists_on_client(interface):
226            interface = "/proc/breakme"
227            trigger = breakme
228            logging.info("Falling back to %s", interface)
229
230        # Find out how many cpus we have
231        client_no_cpus = int(
232            self.client.run('cat /proc/cpuinfo | grep processor | wc -l')
233                            .stdout.strip())
234        no_cpus = 1
235
236        # Skip any triggers that are undefined for the given interface.
237        if trigger == None:
238            logging.info("Skipping unavailable trigger %s", lkdtm)
239            return
240        if lkdtm == "HARDLOCKUP":
241            # ARM systems do not (presently) have NMI, so skip them for now.
242            arch = self.client.get_arch()
243            if arch.startswith('arm'):
244                logging.info("Skipping %s on architecture %s.",
245                             trigger, arch)
246                return
247            # Make sure a soft lockup detection doesn't get in the way.
248            self.client.run("sysctl -w kernel.softlockup_panic=0")
249
250        if trigger == "SPINLOCKUP":
251            # This needs to be pre-triggered so the second one locks.
252            self._provoke_crash(interface, trigger, None)
253
254        if not all_cpu:
255            no_cpus = 1
256        else:
257            no_cpus = client_no_cpus
258        for cpu in range(no_cpus):
259            # Always run on at least one cpu
260            # Delete crash results, if any
261            self.client.run('rm -f %s/*' % self._crash_log_dir)
262            boot_id = self.client.get_boot_id()
263            # This should cause target reset.
264            # Run on a specific cpu if we're running on all of them,
265            # otherwise run normally
266            if all_cpu :
267                self._provoke_crash(interface, trigger, cpu)
268            else:
269                self._provoke_crash(interface, trigger, None)
270            self._wait_for_restart_and_check(boot_id, trigger, text,
271                                             cpu=cpu, timeout=timeout)
272
273    def run_once(self, kcrashes, host=None):
274        self.client = host
275        self._enable_consent()
276        self._crash_log_dir = CrashTestDefs._SYSTEM_CRASH_DIR
277
278        # kcrash data is given by a dictionary with key lkdtm string to write
279        # to /sys/kernel/debug/provoke-crash/DIRECT on the target. The dict
280        # value is a tupple conraining 1) the string to write to /proc/breakme.
281        # if lkdtm is not available, 2) the timeout, and 3)whether we run
282        # the tests on all CPUs or not. Some tests take less to run than other
283        # (null pointer and panic) so it would be best if we would run them on
284        # all the CPUS as it wouldn't add that much time to the total.
285        # The final component is the crash report string to look for in the
286        # crash dump after target restarts.
287        kcrash_types = {
288            'LOOP' : ('softlockup', 25, False, 'BUG: soft lockup'),
289            'BUG' : ('bug', 10, False, 'kernel BUG at'),
290            'HUNG_TASK' : ('hungtask', 300, False, 'hung_task: blocked tasks'),
291            'SOFTLOCKUP' : (None, 25, False, 'BUG: soft lockup'),
292            'HARDLOCKUP' : ('nmiwatchdog', 50, False,
293                            'Watchdog detected hard LOCKUP'),
294            'SPINLOCKUP' : (None, 25, False, 'softlockup: hung tasks'),
295            'EXCEPTION' : ('nullptr',     10, True,
296             # x86 gives "BUG: unable to" while ARM gives "Unableto".
297                           'nable to handle kernel NULL pointer '
298                           'dereference at'),
299            'PANIC' : ('panic', 10, True, 'Kernel panic - not syncing:'),
300            'CORRUPT_STACK' : (None, 10, True,
301                               'stack-protector: Kernel stack is '
302                               'corrupted in:')
303            }
304
305        bad_kcrashes = []
306
307        #Expected input is comma-delimited kcrashes string
308        kcrash_list = kcrashes.split(',')
309        if 'SYSRQ_X' in kcrash_list or 'ALL' in kcrash_list:
310            self._test_sysrq_x()
311            if 'SYSRQ_X' in kcrash_list:
312                kcrash_list.remove('SYSRQ_X')
313            if 'ALL' in kcrash_list:
314                kcrash_list = kcrash_types.keys()
315        for kcrash in kcrash_list:
316            if kcrash_types.get(kcrash) == None:
317                bad_kcrashes.append(kcrash)
318                continue
319            self._test_panic_path(kcrash,kcrash_types[kcrash])
320
321        if len(bad_kcrashes) > 0:
322            raise error.TestFail("Wrong kcrash type "
323                                 "requested (%s)" % str(bad_kcrashes))
324
325    def cleanup(self):
326        self._restore_consent_files()
327        test.test.cleanup(self)
328