#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Install an initial test image on a set of DUTs.

The methods in this module are meant for two nominally distinct use
cases that share a great deal of code internally. The first use
case is for deployment of DUTs that have just been placed in the lab
for the first time. The second use case is for use after repairing
a servo.

Newly deployed DUTs may be in a somewhat anomalous state:
  * The DUTs are running a production base image, not a test image.
    By extension, the DUTs aren't reachable over SSH.
  * The DUTs are not necessarily in the AFE database. DUTs that
    _are_ in the database should be locked. Either way, the DUTs
    cannot be scheduled to run tests.
  * The servos for the DUTs need not be configured with the proper
    board.

More broadly, it's not expected that the DUT will be working at the
start of this operation. If the DUT isn't working at the end of the
operation, an error will be reported.

The script performs the following functions:
  * Configure the servo for the target board, and test that the
    servo is generally in good order.
  * For the full deployment case, install dev-signed RO firmware
    from the designated stable test image for the DUTs.
  * For both cases, use servo to install the stable test image from
    USB.
  * If the DUT isn't in the AFE database, add it.

The script imposes these preconditions:
  * Every DUT has a properly connected servo.
  * Every DUT and servo have proper DHCP and DNS configurations.
  * Every servo host is up and running, and accessible via SSH.
  * There is a known, working test image that can be staged and
    installed on the target DUTs via servo.
  * Every DUT has the same board.
  * For the full deployment case, every DUT must be in dev mode,
    and configured to allow boot from USB with ctrl+U.

The implementation uses the `multiprocessing` module to run all
installations in parallel, separate processes.

"""

import functools
import json
import logging
import multiprocessing
import os
import shutil
import subprocess
import sys
import tempfile
import time

import common
from autotest_lib.client.common_lib import error
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros import servo_afe_board_map
from autotest_lib.server import frontend
from autotest_lib.server import hosts
from autotest_lib.server.cros.dynamic_suite.constants import VERSION_PREFIX
from autotest_lib.site_utils.deployment import commandline
from autotest_lib.site_utils.suite_scheduler.constants import Labels


_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

_DEFAULT_POOL = Labels.POOL_PREFIX + 'suites'

_DIVIDER = '\n============\n'

_OMAHA_STATUS = 'gs://chromeos-build-release-console/omaha_status.json'


def _report_write(report_log, message):
    """Write a message to the report log.

    Report output goes both to stdout, and to a given report
    file.

    @param report_log  Write the message here and to stdout.
    @param message     Write this message.
    """
    report_log.write(message)
    sys.stdout.write(message)


def _get_omaha_build(board):
    """Get the currently preferred Beta channel build for `board`.

    Open and read through the JSON file provided by GoldenEye that
    describes what version Omaha is currently serving for all boards
    on all channels. Find the entry for `board` on the Beta channel,
    and return that version string.

    @param board  The board to look up from GoldenEye.

    @return Returns a Chrome OS version string in standard form
            R##-####.#.#. Will return `None` if no Beta channel
            entry is found.
    """
    omaha_board = board.replace('_', '-')
    sp = subprocess.Popen(['gsutil', 'cat', _OMAHA_STATUS],
                          stdout=subprocess.PIPE)
    # communicate() reads the pipe to EOF and then waits for `gsutil`
    # to exit, so the child process is reaped and the pipe is closed
    # instead of being left behind.
    status_text, _ = sp.communicate()
    omaha_status = json.loads(status_text)
    for e in omaha_status['omaha_data']:
        if (e['channel'] == 'beta' and
                e['board']['public_codename'] == omaha_board):
            milestone = e['chrome_version'].split('.')[0]
            build = e['chrome_os_version']
            return 'R%s-%s' % (milestone, build)
    return None


def _update_build(afe, report_log, arguments):
    """Update the stable_test_versions table.

    This calls the `set_stable_version` RPC call to set the stable
    test version selected by this run of the command. The
    version is selected from three possible versions:
      * The stable test version currently in the AFE database.
      * The version Omaha is currently serving as the Beta channel
        build.
      * The version supplied by the user.
    The actual version selected will be whichever of these three is
    the most up-to-date version.

    This function will log information about the available versions
    prior to selection.

    @param afe         AFE object for RPC calls.
    @param report_log  File-like object for logging report output.
    @param arguments   Command line arguments determining the
                       target board and user-specified build
                       (if any).
    @return Returns the version selected.
    """
    afe_version = afe.run('get_stable_version',
                          board=arguments.board)
    omaha_version = _get_omaha_build(arguments.board)
    _report_write(report_log, 'AFE version is %s.\n' % afe_version)
    _report_write(report_log, 'Omaha version is %s.\n' % omaha_version)
    if (omaha_version is not None and
            utils.compare_versions(afe_version, omaha_version) < 0):
        version = omaha_version
    else:
        version = afe_version
    if arguments.build:
        if utils.compare_versions(arguments.build, version) >= 0:
            version = arguments.build
        else:
            _report_write(report_log,
                          'Selected version %s is too old.\n' %
                          arguments.build)
    if version != afe_version and not arguments.nostable:
        afe.run('set_stable_version',
                version=version,
                board=arguments.board)
    return version


def _create_host(hostname, board):
    """Create a CrosHost object for a DUT to be installed.

    @param hostname  Hostname of the target DUT.
    @param board     Board name of the target DUT.
    """
    host = hosts.create_host(hostname, try_lab_servo=True)
    # Monkey patch our host object to think there's a board label
    # in the AFE. The horror! The horror!
    #
    # TODO(jrbarnette): This is wrong; we patch the method because
    # CrosHost._servo_repair_reinstall() calls it, but that means
    # we're coupled to the implementation of CrosHost. Alas, it's
    # hard to do better without either 1) copying large chunks of
    # _servo_repair_reinstall(), or 2) extensively refactoring
    # CrosHost.
    host._get_board_from_afe = lambda: board
    return host


def _check_servo(host):
    """Check that servo for the given host is working.

    Perform these steps:
      * Confirm that the servo host is reachable via SSH.
      * Stop `servod` on the servo host if it's running, and restart
        it with the host's designated board. We deliberately ignore
        any prior configuration.
      * Re-verify that the servo service on the servo host is
        working correctly.
      * Re-initialize the DUT host object with the correct servo
        object, since this won't have been done in the case that
        `servod` was down.
      * Re-initialize the servo settings, since restarting `servod`
        can change the actual settings from the expected defaults.
        (In particular, restarting `servod` leaves the USB stick
        plugged in to the servo host.)

    @param host  CrosHost object with the servo to be initialized.
    """
    if not host._servo_host:
        raise Exception('No answer to ping from Servo host')
    if not host._servo_host.is_up():
        raise Exception('No answer to ssh from Servo host')
    # Stop servod, ignoring failures, then restart with the proper
    # board.
    #
    # There's a lag between when `start servod` completes and when
    # servod is actually up and serving. The call to time.sleep()
    # below gives time to make sure that the verify() call won't
    # fail.
    servo_board = (
            servo_afe_board_map.map_afe_board_to_servo_board(
                    host._get_board_from_afe()))
    host._servo_host.run('stop servod || :')
    host._servo_host.run('start servod BOARD=%s' % servo_board)
    time.sleep(10)
    logging.debug('Starting servo host verification')
    host._servo_host.verify()
    host.servo = host._servo_host.get_servo()
    host.servo.initialize_dut()
    if not host.servo.probe_host_usb_dev():
        raise Exception('No USB stick detected on Servo host')


def _configure_install_logging(log_name):
    """Configure the logging module for `_install_dut()`.

    @param log_name  Name of the log file for all output.
    """
    # In some cases, autotest code that we call during install may
    # put stuff onto stdout with 'print' statements. Most notably,
    # the AFE frontend may print 'FAILED RPC CALL' (boo, hiss). We
    # want nothing from this subprocess going to the output we
    # inherited from our parent, so redirect stdout and stderr here,
    # before we make any AFE calls. Note that this does what we
    # want only because we're in a subprocess.
    sys.stdout = open(log_name, 'w')
    sys.stderr = sys.stdout
    handler = logging.StreamHandler(sys.stderr)
    formatter = logging.Formatter(_LOG_FORMAT, time_utils.TIME_FMT)
    handler.setFormatter(formatter)
    root_logger = logging.getLogger()
    # Iterate over a copy: removeHandler() mutates the very list we
    # would otherwise be walking, which causes handlers to be skipped
    # and left installed.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)


def _try_lock_host(afe_host):
    """Lock a host in the AFE, and report whether it succeeded.

    The lock action is logged regardless of success; failures are
    logged if they occur.

    @param afe_host  AFE Host instance to be locked.
    @return `True` on success, or `False` on failure.
    """
    try:
        logging.warning('Locking host now.')
        afe_host.modify(locked=True,
                        lock_reason='Running deployment_test')
    except Exception as e:
        logging.exception('Failed to lock: %s', e)
        return False
    return True


def _try_unlock_host(afe_host):
    """Unlock a host in the AFE, and report whether it succeeded.

    The unlock action is logged regardless of success; failures are
    logged if they occur.

    @param afe_host  AFE Host instance to be unlocked.
    @return `True` on success, or `False` on failure.
    """
    try:
        logging.warning('Unlocking host.')
        afe_host.modify(locked=False, lock_reason='')
    except Exception as e:
        logging.exception('Failed to unlock: %s', e)
        return False
    return True


def _install_firmware(host):
    """Install dev-signed firmware after removing write-protect.

    At start, it's assumed that hardware write-protect is disabled,
    the DUT is in dev mode, and the servo's USB stick already has a
    test image installed.

    The firmware is installed by powering on and typing ctrl+U on
    the keyboard in order to boot the test image from USB. Once
    the DUT is booted, we run a series of commands to install the
    read-only firmware from the test image. Then we clear debug
    mode, and shut down.

    @param host  Host instance to use for servo and ssh operations.
    """
    servo = host.servo
    # First power on. We sleep to allow the firmware plenty of time
    # to display the dev-mode screen; some boards take their time to
    # be ready for the ctrl+U after power on.
    servo.get_power_state_controller().power_off()
    servo.switch_usbkey('dut')
    servo.get_power_state_controller().power_on()
    time.sleep(10)
    # Dev mode screen should be up now: type ctrl+U and wait for
    # boot from USB to finish.
    servo.ctrl_u()
    if not host.wait_up(timeout=host.USB_BOOT_TIMEOUT):
        raise Exception('DUT failed to boot in dev mode for '
                        'firmware update')
    # Disable software-controlled write-protect for both FPROMs, and
    # install the RO firmware.
    for fprom in ['host', 'ec']:
        host.run('flashrom -p %s --wp-disable' % fprom,
                 ignore_status=True)
    host.run('chromeos-firmwareupdate --mode=factory')
    # Get us out of dev-mode and clear GBB flags. GBB flags are
    # non-zero because boot from USB was enabled.
    host.run('/usr/share/vboot/bin/set_gbb_flags.sh 0',
             ignore_status=True)
    host.run('crossystem disable_dev_request=1',
             ignore_status=True)
    host.halt()


def _install_test_image(hostname, arguments):
    """Install a test image to the DUT.

    Install a stable test image on the DUT using the full servo
    repair flow.

    @param hostname   Host name of the DUT to install on.
    @param arguments  Parsed results from
                      ArgumentParser.parse_args().
    """
    host = _create_host(hostname, arguments.board)
    # The outer try/finally guarantees the host object is closed even
    # when the servo check itself fails. The inner try keeps the
    # 'chromeos-install failed' translation limited to the install
    # steps, so servo-check errors are reported as themselves.
    try:
        _check_servo(host)
        try:
            if not arguments.noinstall:
                if not arguments.nostage:
                    host.servo.image_to_servo_usb(
                            host.stage_image_for_servo())
                if arguments.full_deploy:
                    _install_firmware(host)
                host.servo_install()
        except error.AutoservRunError as e:
            logging.exception('Failed to install: %s', e)
            raise Exception('chromeos-install failed')
    finally:
        host.close()


def _install_and_record(afe, hostname, arguments):
    """Perform all installation and AFE updates.

    First, lock the host if it exists and is unlocked. Then,
    install the test image on the DUT. At the end, unlock the
    DUT, unless the installation failed and the DUT was locked
    before we started.

    If installation succeeds, make sure the DUT is in the AFE,
    and make sure that it has basic labels.

    @param afe        AFE object for RPC calls.
    @param hostname   Host name of the DUT.
    @param arguments  Command line arguments with options.
    """
    hostlist = afe.get_hosts([hostname])
    unlock_on_failure = False
    if hostlist:
        afe_host = hostlist[0]
        if not afe_host.locked:
            if _try_lock_host(afe_host):
                unlock_on_failure = True
            else:
                raise Exception('Failed to lock host')
        if (afe_host.status != 'Ready' and
                afe_host.status != 'Repair Failed'):
            if unlock_on_failure and not _try_unlock_host(afe_host):
                raise Exception('Host is in use, and failed to unlock it')
            raise Exception('Host is in use by Autotest')
    else:
        afe_host = None

    try:
        _install_test_image(hostname, arguments)
    except Exception:
        # Only undo a lock that we took ourselves; a host that was
        # already locked before we started stays locked.
        if unlock_on_failure and not _try_unlock_host(afe_host):
            logging.error('Failed to unlock host!')
        raise

    if afe_host is not None:
        if not _try_unlock_host(afe_host):
            raise Exception('Failed to unlock after successful install')
    else:
        logging.debug('Creating host in AFE.')
        atest_path = os.path.join(
                os.path.dirname(os.path.abspath(sys.argv[0])),
                'atest')
        # Logging configuration reset sys.stdout to the log file,
        # but apparently subprocess.call() uses FD 1, which is
        # still our parent's stdout. So, explicitly redirect.
        status = subprocess.call(
                [atest_path, 'host', 'create', hostname],
                stdout=sys.stdout, stderr=subprocess.STDOUT)
        if status != 0:
            logging.error('Host creation failed, status = %d', status)
            raise Exception('Failed to add host to AFE')
    # Must re-query to get state changes, especially label changes.
    afe_host = afe.get_hosts([hostname])[0]
    have_board = any(label.startswith(Labels.BOARD_PREFIX)
                     for label in afe_host.labels)
    if not have_board:
        afe_host.delete()
        raise Exception('Failed to add labels to host')
    version = [label for label in afe_host.labels
               if label.startswith(VERSION_PREFIX)]
    if version:
        afe_host.remove_labels(version)


def _install_dut(arguments, hostname):
    """Deploy or repair a single DUT.

    Implementation note: This function is expected to run in a
    subprocess created by a multiprocessing Pool object. As such,
    it can't (shouldn't) write to shared files like `sys.stdout`.

    @param hostname   Host name of the DUT to install on.
    @param arguments  Parsed results from
                      ArgumentParser.parse_args().

    @return On success, return `None`. On failure, return a string
            with an error message.
    """
    _configure_install_logging(
            os.path.join(arguments.dir, hostname + '.log'))
    afe = frontend.AFE(server=arguments.web)
    try:
        _install_and_record(afe, hostname, arguments)
    except Exception as e:
        logging.exception('Original exception: %s', e)
        return str(e)
    return None


def _report_hosts(report_log, heading, host_results_list):
    """Report results for a list of hosts.

    To improve visibility, results are preceded by a header line,
    followed by a divider line. Then results are printed, one host
    per line.

    @param report_log         File-like object for logging report
                              output.
    @param heading            The header string to be printed before
                              results.
    @param host_results_list  A list of (hostname, message) tuples
                              to be printed one per line.
    """
    if not host_results_list:
        return
    _report_write(report_log, heading)
    _report_write(report_log, _DIVIDER)
    for t in host_results_list:
        _report_write(report_log, '%-30s %s\n' % t)
    _report_write(report_log, '\n')


def _report_results(afe, report_log, hostnames, results):
    """Gather and report a summary of results from installation.

    Segregate results into successes and failures, reporting
    each separately. At the end, report the total of successes
    and failures.

    @param afe         AFE object for RPC calls.
    @param report_log  File-like object for logging report output.
    @param hostnames   List of the hostnames that were tested.
    @param results     List of error messages, in the same order
                       as the hostnames. `None` means the
                       corresponding host succeeded.
    """
    success_hosts = []
    success_reports = []
    failure_reports = []
    for r, h in zip(results, hostnames):
        if r is None:
            success_hosts.append(h)
        else:
            failure_reports.append((h, r))
    if success_hosts:
        afe.reverify_hosts(hostnames=success_hosts)
        for h in afe.get_hosts(hostnames=success_hosts):
            # for/else: the `else` arm runs only when no pool label
            # was found, i.e. the loop finished without `break`.
            for label in h.labels:
                if label.startswith(Labels.POOL_PREFIX):
                    success_reports.append(
                            (h.hostname, 'Host already in %s' % label))
                    break
            else:
                h.add_labels([_DEFAULT_POOL])
                success_reports.append(
                        (h.hostname, 'Host added to %s' % _DEFAULT_POOL))
    _report_write(report_log, _DIVIDER)
    _report_hosts(report_log, 'Successes', success_reports)
    _report_hosts(report_log, 'Failures', failure_reports)
    _report_write(report_log,
                  'Installation complete: '
                  '%d successes, %d failures.\n' %
                  (len(success_reports), len(failure_reports)))


def install_duts(argv, full_deploy):
    """Install a test image on DUTs, and deploy them.

    This handles command line parsing for both the repair and
    deployment commands. The two operations are largely identical;
    the main difference is that full deployment includes flashing
    dev-signed firmware on the DUT prior to installing the test
    image.

    @param argv         Command line arguments to be parsed.
    @param full_deploy  If true, do the full deployment that includes
                        flashing dev-signed RO firmware onto the DUT.
    """
    # Override tempfile.tempdir. Some of the autotest code we call
    # will create temporary files that don't get cleaned up. So, we
    # put the temp files in a private directory of our own, so that
    # we can clean up everything in one fell swoop.
    tempfile.tempdir = tempfile.mkdtemp()
    try:
        arguments = commandline.parse_command(argv, full_deploy)
        if not arguments:
            sys.exit(1)
        sys.stderr.write('Installation output logs in %s\n' %
                         arguments.dir)
        report_path = os.path.join(arguments.dir, 'report.log')
        with open(report_path, 'w') as report_log:
            afe = frontend.AFE(server=arguments.web)
            current_build = _update_build(afe, report_log, arguments)
            _report_write(report_log, _DIVIDER)
            _report_write(report_log,
                          'Repair version for board %s is now %s.\n' %
                          (arguments.board, current_build))
            install_pool = multiprocessing.Pool(len(arguments.hostnames))
            try:
                install_function = functools.partial(_install_dut,
                                                     arguments)
                results_list = install_pool.map(install_function,
                                                arguments.hostnames)
            finally:
                # Release the worker processes once all installs have
                # reported back (or the map failed).
                install_pool.close()
                install_pool.join()
            _report_results(afe, report_log, arguments.hostnames,
                            results_list)
    finally:
        # MacDuff:
        #     [ ... ]
        #     Did you say all? O hell-kite! All?
        #     What, all my pretty chickens and their dam
        #     At one fell swoop?
        #
        # Runs on every exit path, including the sys.exit(1) above, so
        # the temporary directory is never left behind.
        shutil.rmtree(tempfile.tempdir)