# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import json
import logging
import os
import time

import common
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import hosts
from autotest_lib.server import afe_utils
from autotest_lib.server import crashcollect
from autotest_lib.server.hosts import repair
from autotest_lib.server.hosts import cros_firmware

# _DEV_MODE_ALLOWED_POOLS - The set of pools that are allowed to be
# in dev mode (usually, those should be unmanaged devices).
#
_DEV_MODE_ALLOWED_POOLS = set(
    global_config.global_config.get_config_value(
            'CROS',
            'pools_dev_mode_allowed',
            type=str,
            default='',
            allow_blank=True).split(','))


class ACPowerVerifier(hosts.Verifier):
    """Check for AC power and a reasonable battery charge."""

    def verify(self, host):
        # Temporarily work around a problem caused by some old FSI
        # builds that don't have the power_supply_info command by
        # ignoring failures.  The repair triggers believe that this
        # verifier can't be fixed by re-installing, which means if a DUT
        # gets stuck with one of those old builds, it can't be repaired.
        #
        # TODO(jrbarnette): This is for crbug.com/599158; we need a
        # better solution.
        try:
            info = host.get_power_supply_info()
        except Exception:
            logging.exception('get_power_supply_info() failed')
            return
        try:
            if info['Line Power']['online'] != 'yes':
                raise hosts.AutoservVerifyError(
                        'AC power is not plugged in')
        except KeyError:
            logging.info('Cannot determine AC power status - '
                         'skipping check.')
        try:
            if float(info['Battery']['percentage']) < 50.0:
                raise hosts.AutoservVerifyError(
                        'Battery is less than 50%')
        except KeyError:
            logging.info('Cannot determine battery status - '
                         'skipping check.')

    @property
    def description(self):
        return 'The DUT is plugged in to AC power'
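

# For reference, ACPowerVerifier above reads the dict returned by
# host.get_power_supply_info().  The exact contents vary by board and
# power state; an illustrative (assumed, not exhaustive) shape is:
#
#     {
#         'Line Power': {'online': 'yes', ...},
#         'Battery': {'percentage': '95.4', ...},
#     }
#
# Only the 'Line Power'/'online' and 'Battery'/'percentage' entries are
# consulted; a missing key is treated as "cannot determine" and skipped.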


class WritableVerifier(hosts.Verifier):
    """
    Confirm the stateful file systems are writable.

    The standard Linux response to certain unexpected file system errors
    (including hardware errors in block devices) is to change the file
    system status to read-only.  This verifier checks that that hasn't
    happened.

    The test covers the two file systems that need to be writable for
    critical operations like AU:
      * The (unencrypted) stateful file system, which includes
        /mnt/stateful_partition.
      * The encrypted stateful partition, which includes /var.

    The test doesn't check various bind mounts; those are expected to
    fail the same way as their underlying main mounts.  Whether the
    Linux kernel can guarantee that is untested...
    """

    # N.B. Order matters here:  Encrypted stateful is loop-mounted from
    # a file in unencrypted stateful, so we don't test for errors in
    # encrypted stateful if unencrypted fails.
    _TEST_DIRECTORIES = ['/mnt/stateful_partition', '/var/tmp']

    def verify(self, host):
        # This deliberately stops looking after the first error.
        # See above for the details.
        for testdir in self._TEST_DIRECTORIES:
            filename = os.path.join(testdir, 'writable_test')
            command = 'touch %s && rm %s' % (filename, filename)
            rv = host.run(command=command, ignore_status=True)
            if rv.exit_status != 0:
                msg = 'Can\'t create a file in %s' % testdir
                raise hosts.AutoservVerifyError(msg)

    @property
    def description(self):
        return 'The stateful filesystems are writable'


class EXT4fsErrorVerifier(hosts.Verifier):
    """
    Confirm we have not seen critical file system kernel errors.
    """
    def verify(self, host):
        # Grep dmesg for stateful FS errors of the form
        # "EXT4-fs error (device sda1):".  The embedded $(...) pipeline
        # parses /proc/$$/mountinfo to find the block device backing
        # /mnt/stateful_partition, so the pattern only matches errors on
        # the stateful partition's device.
        command = ("dmesg | grep -E \"EXT4-fs error \(device "
                   "$(cut -d ' ' -f 5,9 /proc/$$/mountinfo | "
                   "grep -e '^/mnt/stateful_partition ' | "
                   "cut -d ' ' -f 2 | cut -d '/' -f 3)\):\"")
        output = host.run(command=command, ignore_status=True).stdout
        if output:
            sample = output.splitlines()[0]
            message = 'Saw file system error: %s' % sample
            raise hosts.AutoservVerifyError(message)
        # Check for other critical FS errors.
        command = 'dmesg | grep "This should not happen!! Data will be lost"'
        output = host.run(command=command, ignore_status=True).stdout
        if output:
            message = 'Saw file system error: Data will be lost'
            raise hosts.AutoservVerifyError(message)

    @property
    def description(self):
        return 'Did not find critical file system errors'


class UpdateSuccessVerifier(hosts.Verifier):
    """
    Checks that the DUT successfully finished its last provision job.

    At the start of any update (e.g. for a Provision job), the code
    creates a marker file named `host.PROVISION_FAILED`.  The file is
    located in a part of the stateful partition that will be removed if
    an update finishes successfully.  Thus, the presence of the file
    indicates that a prior update failed.

    The verifier tests for the existence of the marker file and fails if
    it still exists.
    """
    def verify(self, host):
        result = host.run('test -f %s' % host.PROVISION_FAILED,
                          ignore_status=True)
        if result.exit_status == 0:
            raise hosts.AutoservVerifyError(
                    'Last AU on this DUT failed')

    @property
    def description(self):
        return 'The most recent AU attempt on this DUT succeeded'


class TPMStatusVerifier(hosts.Verifier):
    """Verify that the host's TPM is in a good state."""

    def verify(self, host):
        # This cryptohome command emits status information in JSON format.
        # It looks something like this:
        #     {
        #         "installattrs": {
        #             ...
        #         },
        #         "mounts": [ {
        #             ...
        #         } ],
        #         "tpm": {
        #             "being_owned": false,
        #             "can_connect": true,
        #             "can_decrypt": false,
        #             "can_encrypt": false,
        #             "can_load_srk": true,
        #             "can_load_srk_pubkey": true,
        #             "enabled": true,
        #             "has_context": true,
        #             "has_cryptohome_key": false,
        #             "has_key_handle": false,
        #             "last_error": 0,
        #             "owned": true
        #         }
        #     }
        output = host.run('cryptohome --action=status').stdout.strip()
        try:
            status = json.loads(output)
        except ValueError:
            logging.info('Cannot determine the Cryptohome status - '
                         'skipping check.')
            return
        try:
            tpm = status['tpm']
            if not tpm['enabled']:
                raise hosts.AutoservVerifyError(
                        'TPM is not enabled -- Hardware is not working.')
            if not tpm['can_connect']:
                raise hosts.AutoservVerifyError(
                        ('TPM connect failed -- '
                         'last_error=%d.' % tpm['last_error']))
            if tpm['owned'] and not tpm['can_load_srk']:
                raise hosts.AutoservVerifyError(
                        'Cannot load the TPM SRK')
            if tpm['can_load_srk'] and not tpm['can_load_srk_pubkey']:
                raise hosts.AutoservVerifyError(
                        'Cannot load the TPM SRK public key')
        except KeyError:
            logging.info('Cannot determine the Cryptohome status - '
                         'skipping check.')

    @property
    def description(self):
        return 'The host\'s TPM is available and working'


class PythonVerifier(hosts.Verifier):
    """Confirm the presence of a working Python interpreter."""

    def verify(self, host):
        result = host.run('python -c "import cPickle"',
                          ignore_status=True)
        if result.exit_status != 0:
            message = 'The python interpreter is broken'
            if result.exit_status == 127:
                search = host.run('which python', ignore_status=True)
                if search.exit_status != 0 or not search.stdout:
                    message = ('Python is missing; may be caused by '
                               'powerwash')
            raise hosts.AutoservVerifyError(message)

    @property
    def description(self):
        return 'Python on the host is installed and working'


class DevModeVerifier(hosts.Verifier):
    """Verify that the host is not in dev mode."""

    def verify(self, host):
        # Some pools are allowed to be in dev mode.
        info = host.host_info_store.get()
        if info.pools & _DEV_MODE_ALLOWED_POOLS:
            return

        result = host.run('crossystem devsw_boot', ignore_status=True).stdout
        if result != '0':
            raise hosts.AutoservVerifyError('The host is in dev mode')

    @property
    def description(self):
        return 'The host should not be in dev mode'


class ServoSysRqRepair(hosts.RepairAction):
    """
    Repair a Chrome device by sending a system request to the kernel.

    Sending the Alt+VolUp+X key combination (aka sysrq-x) three times
    asks the kernel to panic itself and reboot while preserving the
    kernel logs in console ramoops.
    """

    def repair(self, host):
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        # Press Alt+VolUp+X three times.  We don't check DUT health
        # between presses, because killing Chrome alone is unlikely to
        # restore SSH access to the DUT.
        for _ in range(3):
            try:
                host.servo.sysrq_x()
            except error.TestFail as ex:
                raise hosts.AutoservRepairError(
                        'cannot press sysrq-x: %s.' % str(ex))
            # Leave less than 5 seconds between presses.
            time.sleep(2.0)

        if host.wait_up(host.BOOT_TIMEOUT):
            # Collect logs once we regain ssh access before clobbering them.
            local_log_dir = crashcollect.get_crashinfo_dir(host, 'after_sysrq')
            host.collect_logs('/var/log', local_log_dir, ignore_errors=True)
            # Collect crash info.
            crashcollect.get_crashinfo(host, None)
            return
        raise hosts.AutoservRepairError(
                '%s is still offline after reset.' % host.hostname)

    @property
    def description(self):
        return 'Reset the DUT via kernel sysrq'


class ServoResetRepair(hosts.RepairAction):
    """Repair a Chrome device by resetting it with servo."""

    def repair(self, host):
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        host.servo.get_power_state_controller().reset()
        if host.wait_up(host.BOOT_TIMEOUT):
            # Collect logs once we regain ssh access before clobbering them.
            local_log_dir = crashcollect.get_crashinfo_dir(host, 'after_reset')
            host.collect_logs('/var/log', local_log_dir, ignore_errors=True)
            # Collect crash info.
            crashcollect.get_crashinfo(host, None)
            return
        raise hosts.AutoservRepairError(
                '%s is still offline after reset.' % host.hostname)

    @property
    def description(self):
        return 'Reset the DUT via servo'


class AutoUpdateRepair(hosts.RepairAction):
    """
    Repair by re-installing a test image using autoupdate.

    Try to install the DUT's designated "stable test image" using the
    standard procedure for installing a new test image via autoupdate.
    """

    def repair(self, host):
        afe_utils.machine_install_and_update_labels(host, repair=True)

    @property
    def description(self):
        return 'Re-install the stable build via AU'


class PowerWashRepair(AutoUpdateRepair):
    """
    Powerwash the DUT, then re-install using autoupdate.

    Powerwash the DUT, then attempt to re-install a stable test image as
    for `AutoUpdateRepair`.
    """

    def repair(self, host):
        host.run('echo "fast safe" > '
                 '/mnt/stateful_partition/factory_install_reset')
        host.reboot(timeout=host.POWERWASH_BOOT_TIMEOUT, wait=True)
        super(PowerWashRepair, self).repair(host)

    @property
    def description(self):
        return 'Powerwash and then re-install the stable build via AU'


class ServoInstallRepair(hosts.RepairAction):
    """
    Reinstall a test image from USB using servo.

    Use servo to re-install the DUT's designated "stable test image"
    from servo-attached USB storage.
    """

    def repair(self, host):
        if not host.servo:
            raise hosts.AutoservRepairError(
                    '%s has no servo support.' % host.hostname)
        host.servo_install(host.stage_image_for_servo())

    @property
    def description(self):
        return 'Reinstall from USB using servo'
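

# Both factory functions below hand two lists to hosts.RepairStrategy:
#   * `verify_dag` entries have the form
#         (verifier class, tag, list of dependency tags)
#   * `repair_actions` entries have the form
#         (repair action class, tag, list of dependency tags,
#          list of trigger tags)
# The tags are the short strings ('ssh', 'power', ...) used to wire the
# DAG together; the comments inside each factory explain how the
# dependencies and triggers were chosen.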
def create_cros_repair_strategy():
    """Return a `RepairStrategy` for a `CrosHost`."""
    FirmwareStatusVerifier = cros_firmware.FirmwareStatusVerifier
    FirmwareVersionVerifier = cros_firmware.FirmwareVersionVerifier
    verify_dag = [
        (repair.SshVerifier, 'ssh', []),
        (DevModeVerifier, 'devmode', ['ssh']),
        (ACPowerVerifier, 'power', ['ssh']),
        (EXT4fsErrorVerifier, 'ext4', ['ssh']),
        (WritableVerifier, 'writable', ['ssh']),
        (TPMStatusVerifier, 'tpm', ['ssh']),
        (UpdateSuccessVerifier, 'good_au', ['ssh']),
        (FirmwareStatusVerifier, 'fwstatus', ['ssh']),
        (FirmwareVersionVerifier, 'rwfw', ['ssh']),
        (PythonVerifier, 'python', ['ssh']),
        (repair.LegacyHostVerifier, 'cros', ['ssh']),
    ]

    # The dependencies and triggers for the 'au', 'powerwash', and 'usb'
    # repair actions stack up:  Each one is able to repair progressively
    # more verifiers than the one before.  The 'triggers' lists below
    # show the progression.
    #
    # N.B. AC power detection depends on software on the DUT, and there
    # have been bugs where detection failed even though the DUT really
    # did have power.  So, we make the 'power' verifier a trigger for
    # reinstall repair actions, too.
    #
    # TODO(jrbarnette):  AU repair can't fix all problems reported by
    # the 'cros' verifier; it's listed as an AU trigger as a
    # simplification.  The ultimate fix is to split the 'cros' verifier
    # into smaller individual verifiers.

    usb_triggers = ['ssh', 'writable']
    powerwash_triggers = ['tpm', 'good_au', 'ext4']
    au_triggers = ['power', 'rwfw', 'python', 'cros']

    FirmwareRepair = cros_firmware.FirmwareRepair
    repair_actions = [
        # RPM cycling must precede Servo reset:  If the DUT has a dead
        # battery, we need to reattach AC power before we reset via servo.
        (repair.RPMCycleRepair, 'rpm', [], ['ssh', 'power']),
        (ServoSysRqRepair, 'sysrq', [], ['ssh']),
        (ServoResetRepair, 'servoreset', [], ['ssh']),

        # N.B. FirmwareRepair can't fix a 'good_au' failure directly,
        # because it doesn't remove the flag file that triggers the
        # failure.  We include it as a repair trigger because it's
        # possible that the last update failed because of the firmware,
        # and we want the repair steps below to be able to trust the
        # firmware.
        (FirmwareRepair, 'firmware', [], ['ssh', 'fwstatus', 'good_au']),

        (repair.RebootRepair, 'reboot', ['ssh'], ['devmode', 'writable']),

        (AutoUpdateRepair, 'au',
         usb_triggers + powerwash_triggers, au_triggers),
        (PowerWashRepair, 'powerwash',
         usb_triggers, powerwash_triggers + au_triggers),
        (ServoInstallRepair, 'usb',
         [], usb_triggers + powerwash_triggers + au_triggers),
    ]
    return hosts.RepairStrategy(verify_dag, repair_actions)


def create_moblab_repair_strategy():
    """
    Return a `RepairStrategy` for a `MoblabHost`.

    The Moblab strategy is a subset of the CrOS verify and repair
    sequence.  Several pieces are removed because they're not expected
    to be meaningful.  Some others are removed for more specific
    reasons:

    'tpm':  Moblab DUTs don't run the tests that matter to this
        verifier.  TODO(jrbarnette)  This assertion is unproven.

    'good_au':  This verifier can't pass, because the Moblab AU
        procedure doesn't properly delete CrosHost.PROVISION_FAILED.
        TODO(jrbarnette)  We should refactor _machine_install() so that
        it can be different for Moblab.

    'firmware':  Moblab DUTs shouldn't be in FAFT pools, so we don't try
        this.

    'powerwash':  Powerwash on Moblab causes trouble with deleting the
        DHCP leases file, so we skip it.
    """
    FirmwareVersionVerifier = cros_firmware.FirmwareVersionVerifier
    verify_dag = [
        (repair.SshVerifier, 'ssh', []),
        (ACPowerVerifier, 'power', ['ssh']),
        (FirmwareVersionVerifier, 'rwfw', ['ssh']),
        (PythonVerifier, 'python', ['ssh']),
        (repair.LegacyHostVerifier, 'cros', ['ssh']),
    ]
    au_triggers = ['power', 'rwfw', 'python', 'cros']
    repair_actions = [
        (repair.RPMCycleRepair, 'rpm', [], ['ssh', 'power']),
        (AutoUpdateRepair, 'au', ['ssh'], au_triggers),
    ]
    return hosts.RepairStrategy(verify_dag, repair_actions)
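

# A minimal sketch of how these factories are expected to be consumed.
# The method names on RepairStrategy are assumed from the repair
# framework in autotest_lib.server.hosts; check that module for the
# authoritative interface:
#
#     strategy = create_cros_repair_strategy()
#     strategy.verify(host)   # run the verifier DAG; raises on failure
#     strategy.repair(host)   # run repair actions until verification passes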