# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import json
import logging
import os
import time

import common
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import hosts
from autotest_lib.server import afe_utils
from autotest_lib.server import crashcollect
from autotest_lib.server.hosts import cros_firmware
from autotest_lib.server.hosts import repair
# _DEV_MODE_ALLOWED_POOLS - The set of pools that are allowed to be
# in dev mode (usually, those should be unmanaged devices).
#
_DEV_MODE_ALLOWED_POOLS = set(
    global_config.global_config.get_config_value(
            'CROS',
            'pools_dev_mode_allowed',
            type=str,
            default='',
            allow_blank=True).split(','))
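
# For illustration (the pool names below are hypothetical, not actual
# configured values): given this entry in the shadow config,
#
#     [CROS]
#     pools_dev_mode_allowed: faft,performance
#
# the expression above evaluates to set(['faft', 'performance']).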


class ACPowerVerifier(hosts.Verifier):
    """Check for AC power and a reasonable battery charge."""

    def verify(self, host):
        # Temporarily work around a problem caused by some old FSI
        # builds that don't have the power_supply_info command by
        # ignoring failures.  The repair triggers believe that this
        # verifier can't be fixed by re-installing, which means if a DUT
        # gets stuck with one of those old builds, it can't be repaired.
        #
        # TODO(jrbarnette): This is for crbug.com/599158; we need a
        # better solution.
        try:
            info = host.get_power_supply_info()
        except Exception:
            logging.exception('get_power_supply_info() failed')
            return
        try:
            if info['Line Power']['online'] != 'yes':
                raise hosts.AutoservVerifyError(
                        'AC power is not plugged in')
        except KeyError:
            logging.info('Cannot determine AC power status - '
                         'skipping check.')
        try:
            if float(info['Battery']['percentage']) < 50.0:
                raise hosts.AutoservVerifyError(
                        'Battery is less than 50%')
        except KeyError:
            logging.info('Cannot determine battery status - '
                         'skipping check.')

    @property
    def description(self):
        return 'The DUT is plugged in to AC power'


class WritableVerifier(hosts.Verifier):
    """
    Confirm the stateful file systems are writable.

    The standard Linux response to certain unexpected file system errors
    (including hardware errors in block devices) is to change the file
    system status to read-only.  This verifier confirms that hasn't
    happened.

    The test covers the two file systems that need to be writable for
    critical operations like AU:
      * The (unencrypted) stateful file system, which includes
        /mnt/stateful_partition.
      * The encrypted stateful partition, which includes /var.

    The test doesn't check various bind mounts; those are expected to
    fail the same way as their underlying main mounts.  Whether the
    Linux kernel can guarantee that is untested...
    """

    # N.B. Order matters here:  Encrypted stateful is loop-mounted from
    # a file in unencrypted stateful, so we don't test for errors in
    # encrypted stateful if unencrypted fails.
    _TEST_DIRECTORIES = ['/mnt/stateful_partition', '/var/tmp']

    def verify(self, host):
        # This deliberately stops looking after the first error.
        # See above for the details.
        for testdir in self._TEST_DIRECTORIES:
            filename = os.path.join(testdir, 'writable_test')
            command = 'touch %s && rm %s' % (filename, filename)
            rv = host.run(command=command, ignore_status=True)
            if rv.exit_status != 0:
                msg = "Can't create a file in %s" % testdir
                raise hosts.AutoservVerifyError(msg)

    @property
    def description(self):
        return 'The stateful filesystems are writable'


class EXT4fsErrorVerifier(hosts.Verifier):
    """
    Confirm we have not seen critical file system kernel errors.
    """

    def verify(self, host):
        # Find the device backing the stateful partition, e.g. 'sda1'.
        command = ("cut -d ' ' -f 5,9 /proc/$$/mountinfo | "
                   "grep -e '^/mnt/stateful_partition ' | "
                   "cut -d ' ' -f 2 | cut -d '/' -f 3")
        device = host.run(command=command, ignore_status=True).stdout.strip()
        if not device:
            logging.error('Could not determine stateful mount.')
        else:
            # grep for stateful FS errors of the type
            # "EXT4-fs error (device sda1):".
            command = ('dmesg | grep -E "EXT4-fs error \(device %s\):"'
                       % device)
            output = host.run(command=command, ignore_status=True).stdout
            if output:
                sample = output.splitlines()[0]
                message = 'Saw file system error: %s' % sample
                raise hosts.AutoservVerifyError(message)
        # Check for other critical FS errors.
        command = 'dmesg | grep "This should not happen!!  Data will be lost"'
        output = host.run(command=command, ignore_status=True).stdout
        if output:
            message = 'Saw file system error: Data will be lost'
            raise hosts.AutoservVerifyError(message)

    @property
    def description(self):
        return 'Did not find critical file system errors'


class UpdateSuccessVerifier(hosts.Verifier):
    """
    Checks that the DUT successfully finished its last provision job.

    At the start of any update (e.g. for a Provision job), the code
    creates a marker file named `host.PROVISION_FAILED`.  The file is
    located in a part of the stateful partition that will be removed if
    an update finishes successfully.  Thus, the presence of the file
    indicates that a prior update failed.

    The verifier tests for the existence of the marker file and fails if
    it still exists.
    """

    def verify(self, host):
        result = host.run('test -f %s' % host.PROVISION_FAILED,
                          ignore_status=True)
        if result.exit_status == 0:
            raise hosts.AutoservVerifyError(
                    'Last AU on this DUT failed')

    @property
    def description(self):
        return 'The most recent AU attempt on this DUT succeeded'


class TPMStatusVerifier(hosts.Verifier):
    """Verify that the host's TPM is in a good state."""

    def verify(self, host):
        # This cryptohome command emits status information in JSON format. It
        # looks something like this:
        # {
        #    "installattrs": {
        #       ...
        #    },
        #    "mounts": [ {
        #       ...
        #    } ],
        #    "tpm": {
        #       "being_owned": false,
        #       "can_connect": true,
        #       "can_decrypt": false,
        #       "can_encrypt": false,
        #       "can_load_srk": true,
        #       "can_load_srk_pubkey": true,
        #       "enabled": true,
        #       "has_context": true,
        #       "has_cryptohome_key": false,
        #       "has_key_handle": false,
        #       "last_error": 0,
        #       "owned": true
        #    }
        # }
        output = host.run('cryptohome --action=status').stdout.strip()
        try:
            status = json.loads(output)
        except ValueError:
            logging.info('Cannot determine the cryptohome status - '
                         'skipping check.')
            return
        try:
            tpm = status['tpm']
            if not tpm['enabled']:
                raise hosts.AutoservVerifyError(
                        'TPM is not enabled -- Hardware is not working.')
            if not tpm['can_connect']:
                raise hosts.AutoservVerifyError(
                        ('TPM connect failed -- '
                         'last_error=%d.' % tpm['last_error']))
            if tpm['owned'] and not tpm['can_load_srk']:
                raise hosts.AutoservVerifyError(
                        'Cannot load the TPM SRK')
            if tpm['can_load_srk'] and not tpm['can_load_srk_pubkey']:
                raise hosts.AutoservVerifyError(
                        'Cannot load the TPM SRK public key')
        except KeyError:
            logging.info('Cannot determine the cryptohome status - '
                         'skipping check.')

    @property
    def description(self):
        return "The host's TPM is available and working"


class PythonVerifier(hosts.Verifier):
    """Confirm the presence of a working Python interpreter."""

    def verify(self, host):
        result = host.run('python -c "import cPickle"',
                          ignore_status=True)
        if result.exit_status != 0:
            message = 'The python interpreter is broken'
            if result.exit_status == 127:
                search = host.run('which python', ignore_status=True)
                if search.exit_status != 0 or not search.stdout:
                    message = ('Python is missing; may be caused by '
                               'powerwash')
            raise hosts.AutoservVerifyError(message)

    @property
    def description(self):
        return 'Python on the host is installed and working'


class DevModeVerifier(hosts.Verifier):
    """Verify that the host is not in dev mode."""

    def verify(self, host):
        # Some pools are allowed to be in dev mode.
        info = host.host_info_store.get()
        if info.pools & _DEV_MODE_ALLOWED_POOLS:
            return

        result = host.run('crossystem devsw_boot', ignore_status=True).stdout
        if result != '0':
            raise hosts.AutoservVerifyError('The host is in dev mode')

    @property
    def description(self):
        return 'The host should not be in dev mode'


class ServoSysRqRepair(hosts.RepairAction):
    """
    Repair a Chrome device by sending a system request to the kernel.

    Sending the Alt+VolUp+X key combination (aka sysrq-x) three times
    asks the kernel to panic itself and reboot while preserving the
    kernel logs in console ramoops.
    """

    def repair(self, host):
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        # Press Alt+VolUp+X three times.  We don't check DUT health
        # between presses, because killing Chrome is unlikely to restore
        # SSH access by itself.
        for _ in range(3):
            try:
                host.servo.sysrq_x()
            except error.TestFail as ex:
                raise hosts.AutoservRepairError(
                      'cannot press sysrq-x: %s.' % str(ex))
            # Leave less than 5 seconds between presses.
            time.sleep(2.0)

        if host.wait_up(host.BOOT_TIMEOUT):
            # Collect logs once we regain ssh access before clobbering them.
            local_log_dir = crashcollect.get_crashinfo_dir(host, 'after_sysrq')
            host.collect_logs('/var/log', local_log_dir, ignore_errors=True)
            # Collect crash info.
            crashcollect.get_crashinfo(host, None)
            return
        raise hosts.AutoservRepairError(
                '%s is still offline after reset.' % host.hostname)

    @property
    def description(self):
        return 'Reset the DUT via kernel sysrq'


class ServoResetRepair(hosts.RepairAction):
    """Repair a Chrome device by resetting it with servo."""

    def repair(self, host):
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        host.servo.get_power_state_controller().reset()
        if host.wait_up(host.BOOT_TIMEOUT):
            # Collect logs once we regain ssh access before clobbering them.
            local_log_dir = crashcollect.get_crashinfo_dir(host, 'after_reset')
            host.collect_logs('/var/log', local_log_dir, ignore_errors=True)
            # Collect crash info.
            crashcollect.get_crashinfo(host, None)
            return
        raise hosts.AutoservRepairError(
                '%s is still offline after reset.' % host.hostname)

    @property
    def description(self):
        return 'Reset the DUT via servo'


class AutoUpdateRepair(hosts.RepairAction):
    """
    Repair by re-installing a test image using autoupdate.

    Try to install the DUT's designated "stable test image" using the
    standard procedure for installing a new test image via autoupdate.
    """

    def repair(self, host):
        afe_utils.machine_install_and_update_labels(host, repair=True)

    @property
    def description(self):
        return 'Re-install the stable build via AU'


class PowerWashRepair(AutoUpdateRepair):
    """
    Powerwash the DUT, then re-install using autoupdate.

    Powerwash the DUT, then attempt to re-install a stable test image as
    for `AutoUpdateRepair`.
    """

    def repair(self, host):
        host.run('echo "fast safe" > '
                 '/mnt/stateful_partition/factory_install_reset')
        host.reboot(timeout=host.POWERWASH_BOOT_TIMEOUT, wait=True)
        super(PowerWashRepair, self).repair(host)

    @property
    def description(self):
        return 'Powerwash and then re-install the stable build via AU'


class ServoInstallRepair(hosts.RepairAction):
    """
    Reinstall a test image from USB using servo.

    Use servo to re-install the DUT's designated "stable test image"
    from servo-attached USB storage.
    """

    def repair(self, host):
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        host.servo_install(host.stage_image_for_servo())

    @property
    def description(self):
        return 'Reinstall from USB using servo'


def create_cros_repair_strategy():
    """Return a `RepairStrategy` for a `CrosHost`."""
    FirmwareStatusVerifier = cros_firmware.FirmwareStatusVerifier
    FirmwareVersionVerifier = cros_firmware.FirmwareVersionVerifier
    verify_dag = [
        (repair.SshVerifier,         'ssh',      []),
        (DevModeVerifier,            'devmode',  ['ssh']),
        (ACPowerVerifier,            'power',    ['ssh']),
        (EXT4fsErrorVerifier,        'ext4',     ['ssh']),
        (WritableVerifier,           'writable', ['ssh']),
        (TPMStatusVerifier,          'tpm',      ['ssh']),
        (UpdateSuccessVerifier,      'good_au',  ['ssh']),
        (FirmwareStatusVerifier,     'fwstatus', ['ssh']),
        (FirmwareVersionVerifier,    'rwfw',     ['ssh']),
        (PythonVerifier,             'python',   ['ssh']),
        (repair.LegacyHostVerifier,  'cros',     ['ssh']),
    ]

    # The dependencies and triggers for the 'au', 'powerwash', and 'usb'
    # repair actions stack up:  Each one is able to repair progressively
    # more verifiers than the one before.  The 'triggers' lists below
    # show the progression.
    #
    # N.B. AC power detection depends on software on the DUT, and there
    # have been bugs where detection failed even though the DUT really
    # did have power.  So, we make the 'power' verifier a trigger for
    # reinstall repair actions, too.
    #
    # TODO(jrbarnette):  AU repair can't fix all problems reported by
    # the 'cros' verifier; it's listed as an AU trigger as a
    # simplification.  The ultimate fix is to split the 'cros' verifier
    # into smaller individual verifiers.

    usb_triggers       = ['ssh', 'writable']
    powerwash_triggers = ['tpm', 'good_au', 'ext4']
    au_triggers        = ['power', 'rwfw', 'python', 'cros']

    FirmwareRepair = cros_firmware.FirmwareRepair
    repair_actions = [
        # RPM cycling must precede Servo reset:  if the DUT has a dead
        # battery, we need to reattach AC power before we reset via servo.
        (repair.RPMCycleRepair, 'rpm', [], ['ssh', 'power']),
        (ServoSysRqRepair, 'sysrq', [], ['ssh']),
        (ServoResetRepair, 'servoreset', [], ['ssh']),

        # N.B. FirmwareRepair can't fix a 'good_au' failure directly,
        # because it doesn't remove the flag file that triggers the
        # failure.  We include it as a repair trigger because it's
        # possible that the last update failed because of the firmware,
        # and we want the repair steps below to be able to trust the
        # firmware.
        (FirmwareRepair, 'firmware', [], ['ssh', 'fwstatus', 'good_au']),

        (repair.RebootRepair, 'reboot', ['ssh'], ['devmode', 'writable']),

        (AutoUpdateRepair, 'au',
                usb_triggers + powerwash_triggers, au_triggers),
        (PowerWashRepair, 'powerwash',
                usb_triggers, powerwash_triggers + au_triggers),
        (ServoInstallRepair, 'usb',
                [], usb_triggers + powerwash_triggers + au_triggers),
    ]
    return hosts.RepairStrategy(verify_dag, repair_actions)
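

# A minimal usage sketch, for orientation only.  This assumes the
# `verify()` and `repair()` methods that `hosts.RepairStrategy` defines
# in `repair.py`; `host` stands in for any `CrosHost` instance:
#
#     strategy = create_cros_repair_strategy()
#     strategy.verify(host)   # run the verifier DAG; raises on failure
#     strategy.repair(host)   # verify, then run triggered repair actions
#
# In practice the host object owns its strategy and drives these calls
# from its own `verify()` and `repair()` methods.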


def create_moblab_repair_strategy():
    """
    Return a `RepairStrategy` for a `MoblabHost`.

    The Moblab strategy uses a subset of the CrOS verifiers and repair
    actions.  Several pieces are removed because they're not expected
    to be meaningful.  Some others are removed for more specific
    reasons:

    'tpm':  Moblab DUTs don't run the tests that matter to this
        verifier.  TODO(jrbarnette)  This assertion is unproven.

    'good_au':  This verifier can't pass, because the Moblab AU
        procedure doesn't properly delete CrosHost.PROVISION_FAILED.
        TODO(jrbarnette) We should refactor _machine_install() so that
        it can be different for Moblab.

    'firmware':  Moblab DUTs shouldn't be in FAFT pools, so we don't try
        this.

    'powerwash':  Powerwash on Moblab causes trouble with deleting the
        DHCP leases file, so we skip it.
    """
    FirmwareVersionVerifier = cros_firmware.FirmwareVersionVerifier
    verify_dag = [
        (repair.SshVerifier,         'ssh',     []),
        (ACPowerVerifier,            'power',   ['ssh']),
        (FirmwareVersionVerifier,    'rwfw',    ['ssh']),
        (PythonVerifier,             'python',  ['ssh']),
        (repair.LegacyHostVerifier,  'cros',    ['ssh']),
    ]
    au_triggers = ['power', 'rwfw', 'python', 'cros']
    repair_actions = [
        (repair.RPMCycleRepair, 'rpm', [], ['ssh', 'power']),
        (AutoUpdateRepair, 'au', ['ssh'], au_triggers),
    ]
    return hosts.RepairStrategy(verify_dag, repair_actions)