1#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Install an initial test image on a set of DUTs.
7
8The methods in this module are meant for two nominally distinct use
9cases that share a great deal of code internally.  The first use
10case is for deployment of DUTs that have just been placed in the lab
11for the first time.  The second use case is for use after repairing
12a servo.
13
14Newly deployed DUTs may be in a somewhat anomalous state:
15  * The DUTs are running a production base image, not a test image.
16    By extension, the DUTs aren't reachable over SSH.
17  * The DUTs are not necessarily in the AFE database.  DUTs that
18    _are_ in the database should be locked.  Either way, the DUTs
19    cannot be scheduled to run tests.
20  * The servos for the DUTs need not be configured with the proper
21    board.
22
23More broadly, it's not expected that the DUT will be working at the
24start of this operation.  If the DUT isn't working at the end of the
25operation, an error will be reported.
26
27The script performs the following functions:
28  * Configure the servo for the target board, and test that the
29    servo is generally in good order.
30  * For the full deployment case, install dev-signed RO firmware
31    from the designated stable test image for the DUTs.
32  * For both cases, use servo to install the stable test image from
33    USB.
34  * If the DUT isn't in the AFE database, add it.
35
36The script imposes these preconditions:
37  * Every DUT has a properly connected servo.
38  * Every DUT and servo have proper DHCP and DNS configurations.
39  * Every servo host is up and running, and accessible via SSH.
40  * There is a known, working test image that can be staged and
41    installed on the target DUTs via servo.
42  * Every DUT has the same board.
43  * For the full deployment case, every DUT must be in dev mode,
44    and configured to allow boot from USB with ctrl+U.
45
46The implementation uses the `multiprocessing` module to run all
47installations in parallel, separate processes.
48
49"""
50
51import functools
52import json
53import logging
54import multiprocessing
55import os
56import shutil
57import subprocess
58import sys
59import tempfile
60import time
61
62import common
63from autotest_lib.client.common_lib import error
64from autotest_lib.client.common_lib import time_utils
65from autotest_lib.client.common_lib import utils
66from autotest_lib.client.common_lib.cros import servo_afe_board_map
67from autotest_lib.server import frontend
68from autotest_lib.server import hosts
69from autotest_lib.server.cros.dynamic_suite.constants import VERSION_PREFIX
70from autotest_lib.site_utils.deployment import commandline
71from autotest_lib.site_utils.suite_scheduler.constants import Labels
72
73
# Record layout used by the logging handler that
# `_configure_install_logging()` installs for each DUT's log file.
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pool label that `_report_results()` adds to a successfully
# installed host that doesn't already carry any pool label.
_DEFAULT_POOL = Labels.POOL_PREFIX + 'suites'

# Separator written between sections of the report output.
_DIVIDER = '\n============\n'

# Google Storage path of the JSON status file that
# `_get_omaha_build()` reads (via `gsutil cat`) to learn which
# versions Omaha is currently serving.
_OMAHA_STATUS = 'gs://chromeos-build-release-console/omaha_status.json'
81
82
def _report_write(report_log, message):
    """Emit a report message to the report file and to stdout.

    The same text is written, unmodified, first to `report_log`
    and then to `sys.stdout`.

    @param report_log   File-like object receiving the message in
                        addition to stdout.
    @param message      The text to be written.
    """
    for stream in (report_log, sys.stdout):
        stream.write(message)
94
95
def _get_omaha_build(board):
    """Get the currently preferred Beta channel build for `board`.

    Open and read through the JSON file provided by GoldenEye that
    describes what version Omaha is currently serving for all boards
    on all channels.  Find the entry for `board` on the Beta channel,
    and return that version string.

    The file is fetched by running `gsutil cat` in a child process.
    The child is always reaped and its pipe closed before returning,
    whether or not the output could be parsed.

    @param board  The board to look up from GoldenEye.  Underscores
                  in the name are converted to hyphens to match the
                  public codenames used in the status file.

    @return Returns a Chrome OS version string in standard form
            R##-####.#.#.  Will return `None` if no Beta channel
            entry is found.
    """
    omaha_board = board.replace('_', '-')
    sp = subprocess.Popen(['gsutil', 'cat', _OMAHA_STATUS],
                          stdout=subprocess.PIPE)
    try:
        omaha_status = json.load(sp.stdout)
    finally:
        # Close our end of the pipe and wait for the child, so that
        # we don't leak a file descriptor or leave a zombie process
        # behind, even if the JSON couldn't be parsed.
        sp.stdout.close()
        sp.wait()
    for e in omaha_status['omaha_data']:
        if (e['channel'] == 'beta' and
                e['board']['public_codename'] == omaha_board):
            # The milestone is the major number of the Chrome version.
            milestone = e['chrome_version'].split('.')[0]
            build = e['chrome_os_version']
            return 'R%s-%s' % (milestone, build)
    return None
121
122
def _update_build(afe, report_log, arguments):
    """Choose the stable test version, and record it in the AFE.

    Three candidate versions are considered:
      * The stable test version currently in the AFE database.
      * The version Omaha is currently serving as the Beta channel
        build.
      * The version supplied by the user, if any.
    The most up-to-date of the three is selected.  A user-supplied
    version that is older than the better of the other two is
    rejected with a message in the report.

    The AFE and Omaha versions are written to the report before the
    selection is made.  If the selected version differs from what
    the AFE has, and the user didn't ask to leave the stable version
    alone, the `set_stable_version` RPC is called to update it.

    @param afe          AFE object for RPC calls.
    @param report_log   File-like object for logging report output.
    @param arguments    Command line arguments determining the
                        target board and user-specified build
                        (if any).
    @return Returns the version selected.
    """
    board = arguments.board
    afe_version = afe.run('get_stable_version', board=board)
    omaha_version = _get_omaha_build(board)
    _report_write(report_log, 'AFE   version is %s.\n' % afe_version)
    _report_write(report_log, 'Omaha version is %s.\n' % omaha_version)

    # Start from whichever of the AFE and Omaha versions is newer.
    omaha_is_newer = (
            omaha_version is not None and
            utils.compare_versions(afe_version, omaha_version) < 0)
    version = omaha_version if omaha_is_newer else afe_version

    # A user-requested build wins only if it's at least as new.
    requested = arguments.build
    if requested:
        if utils.compare_versions(requested, version) < 0:
            _report_write(report_log,
                          'Selected version %s is too old.\n' %
                          requested)
        else:
            version = requested

    changed = version != afe_version
    if changed and not arguments.nostable:
        afe.run('set_stable_version', version=version, board=board)
    return version
168
169
def _create_host(hostname, board):
    """Build the host object for a DUT that is to be installed.

    The returned object has its `_get_board_from_afe()` method
    replaced so that it always reports `board`, whether or not the
    AFE has a board label for the DUT.

    @param hostname  Hostname of the target DUT.
    @param board     Board name of the target DUT.

    @return The host object created for `hostname`.
    """
    def _board_from_arguments():
        return board

    dut = hosts.create_host(hostname, try_lab_servo=True)
    # Override the board lookup on this one instance.
    #
    # TODO(jrbarnette):  This is wrong.  The override exists because
    # CrosHost._servo_repair_reinstall() calls the method, which
    # couples us to the implementation of CrosHost.  Doing better
    # would require either copying large chunks of
    # _servo_repair_reinstall(), or extensively refactoring CrosHost.
    dut._get_board_from_afe = _board_from_arguments
    return dut
188
189
def _check_servo(host):
    """Bring the servo for `host` into a known, working state.

    The sequence is:
      * Fail if the host has no servo host object, or if the servo
        host isn't up.
      * Stop `servod` on the servo host (ignoring failure), then
        start it again for the host's board, regardless of how it
        was configured before.
      * Verify the servo host.
      * Attach the servo host's servo object to the DUT host object.
      * Re-initialize the servo's DUT settings, since restarting
        `servod` can leave them different from the expected
        defaults.
      * Fail if no USB stick is detected on the servo host.

    @param host  CrosHost object with the servo to be initialized.
    """
    servo_host = host._servo_host
    if not servo_host:
        raise Exception('No answer to ping from Servo host')
    if not servo_host.is_up():
        raise Exception('No answer to ssh from Servo host')

    afe_board = host._get_board_from_afe()
    servo_board = servo_afe_board_map.map_afe_board_to_servo_board(
            afe_board)

    # Restart servod with the right board.  The `|| :` makes the
    # stop command succeed even when servod wasn't running.
    servo_host.run('stop servod || :')
    servo_host.run('start servod BOARD=%s' % servo_board)
    # `start servod` returns before servod is actually serving;
    # pause so that the verification below doesn't fail spuriously.
    time.sleep(10)

    logging.debug('Starting servo host verification')
    servo_host.verify()
    host.servo = servo_host.get_servo()
    host.servo.initialize_dut()
    usb_detected = host.servo.probe_host_usb_dev()
    if not usb_detected:
        raise Exception('No USB stick detected on Servo host')
233
234
def _configure_install_logging(log_name):
    """Configure the logging module for `_install_dut()`.

    Redirects `sys.stdout` and `sys.stderr` to the named log file,
    then replaces every handler on the root logger with a single
    stream handler that writes to that same file.

    @param log_name  Name of the log file for all output.
    """
    # In some cases, autotest code that we call during install may
    # put stuff onto stdout with 'print' statements.  Most notably,
    # the AFE frontend may print 'FAILED RPC CALL' (boo, hiss).  We
    # want nothing from this subprocess going to the output we
    # inherited from our parent, so redirect stdout and stderr here,
    # before we make any AFE calls.  Note that this does what we
    # want only because we're in a subprocess.
    sys.stdout = open(log_name, 'w')
    sys.stderr = sys.stdout
    handler = logging.StreamHandler(sys.stderr)
    formatter = logging.Formatter(_LOG_FORMAT, time_utils.TIME_FMT)
    handler.setFormatter(formatter)
    root_logger = logging.getLogger()
    # Iterate over a copy of the handler list:  removeHandler()
    # mutates `root_logger.handlers`, and removing from a list while
    # iterating over it skips every other element, which would leave
    # some of the inherited handlers installed.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)
256
257
def _try_lock_host(afe_host):
    """Attempt to lock a host in the AFE.

    A message is logged before the attempt.  If the attempt raises
    any exception, the exception is logged and swallowed.

    @param afe_host AFE Host instance to be locked.
    @return `True` if the host was locked, or `False` if the
            attempt raised an exception.
    """
    succeeded = True
    try:
        logging.warning('Locking host now.')
        afe_host.modify(locked=True,
                        lock_reason='Running deployment_test')
    except Exception as e:
        logging.exception('Failed to lock: %s', e)
        succeeded = False
    return succeeded
275
276
def _try_unlock_host(afe_host):
    """Attempt to unlock a host in the AFE.

    A message is logged before the attempt.  If the attempt raises
    any exception, the exception is logged and swallowed.

    @param afe_host AFE Host instance to be unlocked.
    @return `True` if the host was unlocked, or `False` if the
            attempt raised an exception.
    """
    succeeded = True
    try:
        logging.warning('Unlocking host.')
        afe_host.modify(locked=False, lock_reason='')
    except Exception as e:
        logging.exception('Failed to unlock: %s', e)
        succeeded = False
    return succeeded
293
294
def _install_firmware(host):
    """Install dev-signed firmware after removing write-protect.

    Preconditions assumed at entry:  hardware write-protect is
    disabled, the DUT is in dev mode, and the servo's USB stick
    already holds a test image.

    The DUT is power cycled with the USB stick switched to the DUT,
    and ctrl+U is typed to boot the test image from USB.  Once the
    DUT is up, software write-protect is disabled, the read-only
    firmware is installed from the test image, the GBB flags and
    dev mode are cleared, and the DUT is halted.

    @param host   Host instance to use for servo and ssh operations.
    """
    dut_servo = host.servo
    # Power cycle with the USB stick visible to the DUT.
    dut_servo.get_power_state_controller().power_off()
    dut_servo.switch_usbkey('dut')
    dut_servo.get_power_state_controller().power_on()
    # Some boards are slow to show the dev-mode screen and become
    # ready for ctrl+U, so give the firmware plenty of time.
    time.sleep(10)
    # The dev-mode screen should be showing; request boot from USB
    # and wait for the DUT to come up.
    dut_servo.ctrl_u()
    booted = host.wait_up(timeout=host.USB_BOOT_TIMEOUT)
    if not booted:
        raise Exception('DUT failed to boot in dev mode for '
                        'firmware update')
    # Turn off software-controlled write-protect on both FPROMs
    # (failures are tolerated), then install the RO firmware.
    for programmer in ('host', 'ec'):
        wp_command = 'flashrom -p %s --wp-disable' % programmer
        host.run(wp_command, ignore_status=True)
    host.run('chromeos-firmwareupdate --mode=factory')
    # Leave dev mode and zero the GBB flags; the flags are non-zero
    # because boot from USB was enabled.
    host.run('/usr/share/vboot/bin/set_gbb_flags.sh 0',
             ignore_status=True)
    host.run('crossystem disable_dev_request=1',
             ignore_status=True)
    host.halt()
337
338
def _install_test_image(hostname, arguments):
    """Install a test image to the DUT.

    Install a stable test image on the DUT using the full servo
    repair flow.  Unless `arguments.noinstall` is set, this stages
    the image onto the servo's USB stick (unless `arguments.nostage`
    is set), installs firmware for the full deployment case, and
    then installs the image via servo.

    The host object is always closed before returning, including
    when the initial servo check fails.

    @param hostname   Host name of the DUT to install on.
    @param arguments  Parsed results from
                      ArgumentParser.parse_args().

    @raises Exception if the servo check fails, or (with the
            message 'chromeos-install failed') if a command run
            during installation fails.
    """
    host = _create_host(hostname, arguments.board)
    try:
        # The servo check lives inside the outer `try` so that the
        # host is closed even if the check raises.  It stays outside
        # the inner `try` so that its failures are reported as-is,
        # rather than as install failures.
        _check_servo(host)
        try:
            if not arguments.noinstall:
                if not arguments.nostage:
                    host.servo.image_to_servo_usb(
                            host.stage_image_for_servo())
                if arguments.full_deploy:
                    _install_firmware(host)
                host.servo_install()
        except error.AutoservRunError as e:
            logging.exception('Failed to install: %s', e)
            raise Exception('chromeos-install failed')
    finally:
        host.close()
364
365
def _install_and_record(afe, hostname, arguments):
    """Install a DUT, and bring its AFE record up to date.

    If the host is already in the AFE, it's locked first (if not
    already locked), and it must be in 'Ready' or 'Repair Failed'
    status; otherwise it's considered in use and the operation is
    abandoned.  Then the test image is installed.  If installation
    fails, the host is unlocked again only if this function was the
    one that locked it.

    After a successful install, a host that was already in the AFE
    is unlocked; a host that wasn't is created with `atest`.
    Finally, the host must have a board label (if not, it's deleted
    from the AFE and an error is raised), and any version labels on
    the host are removed.

    @param afe          AFE object for RPC calls.
    @param hostname     Host name of the DUT.
    @param arguments    Command line arguments with options.
    """
    known_hosts = afe.get_hosts([hostname])
    afe_host = known_hosts[0] if known_hosts else None
    # True only when this function took the lock itself.
    unlock_on_failure = False
    if afe_host is not None:
        if not afe_host.locked:
            if not _try_lock_host(afe_host):
                raise Exception('Failed to lock host')
            unlock_on_failure = True
        if afe_host.status not in ('Ready', 'Repair Failed'):
            if unlock_on_failure and not _try_unlock_host(afe_host):
                raise Exception('Host is in use, and failed to unlock it')
            raise Exception('Host is in use by Autotest')

    try:
        _install_test_image(hostname, arguments)
    except Exception:
        if unlock_on_failure and not _try_unlock_host(afe_host):
            logging.error('Failed to unlock host!')
        raise

    if afe_host is None:
        logging.debug('Creating host in AFE.')
        script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        atest_path = os.path.join(script_dir, 'atest')
        # Logging configuration pointed sys.stdout at the log file,
        # but a child process would otherwise still inherit our
        # parent's stdout.  So, redirect the child explicitly.
        status = subprocess.call(
                [atest_path, 'host', 'create', hostname],
                stdout=sys.stdout, stderr=subprocess.STDOUT)
        if status != 0:
            logging.error('Host creation failed, status = %d', status)
            raise Exception('Failed to add host to AFE')
    elif not _try_unlock_host(afe_host):
        raise Exception('Failed to unlock after successful install')

    # Query again to pick up state changes, especially to labels.
    afe_host = afe.get_hosts([hostname])[0]
    board_labels = [label for label in afe_host.labels
                        if label.startswith(Labels.BOARD_PREFIX)]
    if not board_labels:
        afe_host.delete()
        raise Exception('Failed to add labels to host')
    version_labels = [label for label in afe_host.labels
                          if label.startswith(VERSION_PREFIX)]
    if version_labels:
        afe_host.remove_labels(version_labels)
433
434
def _install_dut(arguments, hostname):
    """Deploy or repair one DUT, reporting failure as a string.

    Implementation note: This function is expected to run in a
    subprocess created by a multiprocessing Pool object.  As such,
    it can't (shouldn't) write to shared files like `sys.stdout`;
    all output is sent to a per-host log file in `arguments.dir`.

    @param arguments  Parsed results from
                      ArgumentParser.parse_args().
    @param hostname   Host name of the DUT to install on.

    @return On success, return `None`.  On failure, return a string
            with an error message.
    """
    log_path = os.path.join(arguments.dir, hostname + '.log')
    _configure_install_logging(log_path)
    afe = frontend.AFE(server=arguments.web)
    failure = None
    try:
        _install_and_record(afe, hostname, arguments)
    except Exception as e:
        logging.exception('Original exception: %s', e)
        failure = str(e)
    return failure
458
459
def _report_hosts(report_log, heading, host_results_list):
    """Write one section of per-host results to the report.

    A section consists of the heading, a divider, one line for each
    host, and a trailing blank line.  Nothing at all is written
    when there are no results to report.

    @param report_log         File-like object for logging report
                              output.
    @param heading            The header string to be printed before
                              results.
    @param host_results_list  A list of (hostname, message) tuples
                              to be printed one per line.
    """
    if host_results_list:
        for preamble in (heading, _DIVIDER):
            _report_write(report_log, preamble)
        rows = ('%-30s %s\n' % entry for entry in host_results_list)
        for row in rows:
            _report_write(report_log, row)
        _report_write(report_log, '\n')
481
482
def _report_results(afe, report_log, hostnames, results):
    """Gather and report a summary of results from installation.

    Segregate results into successes and failures, reporting
    each separately.  At the end, report the total of successes
    and failures.

    Hosts that succeeded are scheduled for reverification.  Each
    such host that has no pool label is added to the default pool;
    the report says which pool every successful host ended up in.

    @param afe          AFE object for RPC calls.
    @param report_log   File-like object for logging report output.
    @param hostnames    List of the hostnames that were tested.
    @param results      List of error messages, in the same order
                        as the hostnames.  `None` means the
                        corresponding host succeeded.
    """
    success_hosts = []
    success_reports = []
    failure_reports = []
    for r, h in zip(results, hostnames):
        if r is None:
            success_hosts.append(h)
        else:
            failure_reports.append((h, r))
    if success_hosts:
        afe.reverify_hosts(hostnames=success_hosts)
        # Query the hosts once, after requesting reverification.
        for h in afe.get_hosts(hostnames=success_hosts):
            for label in h.labels:
                if label.startswith(Labels.POOL_PREFIX):
                    success_reports.append(
                            (h.hostname, 'Host already in %s' % label))
                    break
            else:
                # No pool label was found on this host.
                h.add_labels([_DEFAULT_POOL])
                success_reports.append(
                        (h.hostname, 'Host added to %s' % _DEFAULT_POOL))
    _report_write(report_log, _DIVIDER)
    _report_hosts(report_log, 'Successes', success_reports)
    _report_hosts(report_log, 'Failures', failure_reports)
    _report_write(report_log,
                  'Installation complete:  '
                  '%d successes, %d failures.\n' %
                  (len(success_reports), len(failure_reports)))
525
526
def install_duts(argv, full_deploy):
    """Install a test image on DUTs, and deploy them.

    This handles command line parsing for both the repair and
    deployment commands.  The two operations are largely identical;
    the main difference is that full deployment includes flashing
    dev-signed firmware on the DUT prior to installing the test
    image.

    Installations run in parallel, one subprocess per DUT.  The
    temporary directory created here is removed on every exit path,
    including a failed command line parse or an exception.

    @param argv         Command line arguments to be parsed.
    @param full_deploy  If true, do the full deployment that includes
                        flashing dev-signed RO firmware onto the DUT.
    """
    # Override tempfile.tempdir.  Some of the autotest code we call
    # will create temporary files that don't get cleaned up.  So, we
    # put all temp files under one directory of our own, so that we
    # can clean up everything in one fell swoop.
    temp_root = tempfile.mkdtemp()
    tempfile.tempdir = temp_root
    try:
        arguments = commandline.parse_command(argv, full_deploy)
        if not arguments:
            sys.exit(1)
        sys.stderr.write('Installation output logs in %s\n' %
                         arguments.dir)
        report_path = os.path.join(arguments.dir, 'report.log')
        # The `with` makes sure the report is flushed and closed.
        with open(report_path, 'w') as report_log:
            afe = frontend.AFE(server=arguments.web)
            current_build = _update_build(afe, report_log, arguments)
            _report_write(report_log, _DIVIDER)
            _report_write(report_log,
                          'Repair version for board %s is now %s.\n' %
                          (arguments.board, current_build))
            install_pool = multiprocessing.Pool(len(arguments.hostnames))
            install_function = functools.partial(_install_dut, arguments)
            results_list = install_pool.map(install_function,
                                            arguments.hostnames)
            # All work is done; let the workers exit and reap them.
            install_pool.close()
            install_pool.join()
            _report_results(afe, report_log, arguments.hostnames,
                            results_list)
    finally:
        # MacDuff:
        #   [ ... ]
        #   Did you say all? O hell-kite! All?
        #   What, all my pretty chickens and their dam
        #   At one fell swoop?
        shutil.rmtree(temp_root)
569