1# Lint as: python2, python3
2# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6from __future__ import print_function
7
8import logging
9import os
10import re
11import six
12import sys
13import six.moves.urllib.parse
14
15from autotest_lib.client.bin import utils
16from autotest_lib.client.common_lib import error
17from autotest_lib.client.common_lib.cros import dev_server
18from autotest_lib.client.common_lib.cros import kernel_utils
19from autotest_lib.server import autotest
20from autotest_lib.server.cros.dynamic_suite import constants as ds_constants
21from autotest_lib.server.cros.dynamic_suite import tools
22
23try:
24    from chromite.lib import metrics
25except ImportError:
26    metrics = utils.metrics_mock
27
28
29def _metric_name(base_name):
30    return 'chromeos/autotest/provision/' + base_name
31
32
# Name of the provisioning script.  This is the name handed to
# `ChromiumOSProvisioner._get_remote_script()`, which looks for it under
# /usr/local/bin on the DUT or downloads it from the devserver.
_QUICK_PROVISION_SCRIPT = 'quick-provision'

# PROVISION_FAILED - A flag file to indicate provision failures.  The
# file is created at the start of any AU procedure (see
# `ChromiumOSProvisioner._prepare_host()`).  The file's location in
# stateful means that on successful update it will be removed.  Thus, if
# this file exists, it indicates that we've tried and failed in a
# previous attempt to update.
PROVISION_FAILED = '/var/tmp/provision_failed'

# A flag file used to enable special handling in lab DUTs.  Some
# parts of the system in Chromium OS test images will behave in ways
# convenient to the test lab when this file is present.  Generally,
# we create this immediately after any update completes.
_LAB_MACHINE_FILE = '/mnt/stateful_partition/.labmachine'

# _TARGET_VERSION - A file containing the new version to which we plan
# to update.  This file is used by the CrOS shutdown code to detect and
# handle certain version downgrade cases.  Specifically:  Downgrading
# may trigger an unwanted powerwash in the target build when the
# following conditions are met:
#  * Source build is a v4.4 kernel with R69-10756.0.0 or later.
#  * Target build predates the R69-10756.0.0 cutoff.
# When this file is present and indicates a downgrade, the OS shutdown
# code on the DUT knows how to prevent the powerwash.
_TARGET_VERSION = '/run/update_target_version'

# _REBOOT_FAILURE_MESSAGE - This is the standard message text returned
# when the Host.reboot() method fails.  The source of this text comes
# from `wait_for_restart()` in client/common_lib/hosts/base_classes.py.
# It is used below as a classifier pattern in `HostUpdateError`.

_REBOOT_FAILURE_MESSAGE = 'Host did not return from reboot'

# Port strings used to derive the GsCache server address: occurrences of
# DEVSERVER_PORT in the devserver name are replaced with GS_CACHE_PORT
# (see `ChromiumOSProvisioner._quick_provision_with_gs_cache()`).
DEVSERVER_PORT = '8082'
GS_CACHE_PORT = '8888'
68
69
class _AttributedUpdateError(error.TestFail):
    """Base class for update failures that carry an attributed cause.

    This class reads `self._SUMMARY` (a short summary string) and
    `self._CLASSIFIERS` (a sequence of `(pattern, classification)`
    pairs) but does not define them; the subclasses in this file supply
    both.
    """

    def __init__(self, attribution, msg):
        """Initialize the error.

        @param attribution: Text describing where the blame lies; it is
                            prepended to `msg` in the exception text.
        @param msg: The underlying failure message.  It is retained
                    unmodified for later classification.
        """
        full_text = '%s: %s' % (attribution, msg)
        super(_AttributedUpdateError, self).__init__(full_text)
        self._message = msg

    def _classify(self):
        """Classify the stored failure message.

        Each pattern in `_CLASSIFIERS` is applied with `re.match()`,
        i.e. anchored at the start of the message.

        @return The classification paired with the first matching
                pattern, or `None` when no pattern matches.
        """
        matching = (label for pattern, label in self._CLASSIFIERS
                    if re.match(pattern, self._message))
        return next(matching, None)

    @property
    def failure_summary(self):
        """Summarize this error for metrics reporting."""
        label = self._classify()
        if not label:
            return self._SUMMARY
        return '%s: %s' % (self._SUMMARY, label)
92
93
class HostUpdateError(_AttributedUpdateError):
    """Update failure blamed on the DUT itself.

    Raise this when the most likely cause of the failure is a condition
    that existed on the DUT before the update began, such as a hardware
    problem or a bug in the software already installed on the DUT.
    """

    # Message used when the DUT is not reachable at the start of the
    # update; it doubles as its own classification.
    DUT_DOWN = 'No answer to ssh'

    _SUMMARY = 'DUT failed prior to update'
    # (pattern, classification) pairs consulted by `_classify()`.
    _CLASSIFIERS = [
            (DUT_DOWN, DUT_DOWN),
            (_REBOOT_FAILURE_MESSAGE, 'Reboot failed'),
    ]

    def __init__(self, hostname, msg):
        """Initialize the error.

        @param hostname: Name of the DUT on which the failure occurred.
        @param msg: The underlying failure message.
        """
        attribution = 'Error on %s prior to update' % hostname
        super(HostUpdateError, self).__init__(attribution, msg)
113
114
class ImageInstallError(_AttributedUpdateError):
    """Update failure while installing the image from the devserver.

    Raise this when the target DUT fails to download and install the
    target image from the devserver, and either the devserver or the
    DUT might be at fault.
    """

    _SUMMARY = 'Image failed to download and install'
    # No finer-grained classifications are defined for this error.
    _CLASSIFIERS = []

    def __init__(self, hostname, devserver, msg):
        """Initialize the error.

        @param hostname: Name of the DUT being updated.
        @param devserver: Name of the devserver serving the image.
        @param msg: The underlying failure message.
        """
        attribution = ('Download and install failed from %s onto %s' %
                       (devserver, hostname))
        super(ImageInstallError, self).__init__(attribution, msg)
130
131
class NewBuildUpdateError(_AttributedUpdateError):
    """Update failure blamed on the target build.

    Raise this when updating to a new build fails and the most likely
    cause is a bug in the newly installed target build.
    """

    CHROME_FAILURE = 'Chrome failed to reach login screen'
    ROLLBACK_FAILURE = 'System rolled back to previous build'

    _SUMMARY = 'New build failed'
    # (pattern, classification) pairs consulted by `_classify()`.
    _CLASSIFIERS = [
            (CHROME_FAILURE, 'Chrome did not start'),
            (ROLLBACK_FAILURE, ROLLBACK_FAILURE),
    ]

    def __init__(self, update_version, msg):
        """Initialize the error.

        @param update_version: Version of the build that was installed.
        @param msg: The underlying failure message.
        """
        attribution = 'Failure in build %s' % update_version
        super(NewBuildUpdateError, self).__init__(attribution, msg)

    @property
    def failure_summary(self):
        """Summarize this error for metrics reporting.

        This override returns one fixed string; `_SUMMARY` and
        `_CLASSIFIERS` are not consulted here.
        """
        return 'Build failed to work after installing'
157
158
def _url_to_version(update_url):
    """Extract the version from an update URL.

    @param update_url: url to the image to update to.

    @return The last component of the URL path, with surrounding
            whitespace removed, after any '/au/...' suffix is dropped.
    """
    url_path = six.moves.urllib.parse.urlparse(update_url).path
    # The Chrome OS version is normally the final path element.  Delta
    # update URLs are the exception: they are rooted under the version,
    # e.g. http://.../update/.../0.14.755.0/au/0.14.754.0, so the 'au'
    # section has to be removed before the version is read.
    versioned_path = re.sub('/au/.*', '', url_path)
    return versioned_path.rsplit('/', 1)[-1].strip()
172
173
def url_to_image_name(update_url):
    """Extract the image name from an update URL.

    For example, given
        http://172.22.50.205:8082/update/lumpy-release/R27-3837.0.0
    the result is lumpy-release/R27-3837.0.0

    Note that the leading `len('/update/')` characters of the URL path
    are removed unconditionally; the path is not checked for actually
    starting with '/update/'.

    @param update_url: url to the image to update to.
    @returns a string representing the image name in the update_url.

    """
    prefix_length = len('/update/')
    url_path = six.moves.urllib.parse.urlparse(update_url).path
    return url_path[prefix_length:]
186
187
def get_update_failure_reason(exception):
    """Map an exception to a failure reason string for metrics.

    `exception` is expected to be one raised by a failure of
    `ChromiumOSProvisioner.run_provision`.  When it is not a truish
    value, the result is `None`.

    The set of possible return strings is deliberately small, so that
    the string can be used safely in Monarch metrics without concern
    for the cardinality of its range of values: attributed update
    errors report their `failure_summary`, and anything else is
    reported as 'Unknown Error: ' followed by the exception class name.

    @param exception  Exception to be converted to a failure reason.

    @return A string suitable for use in Monarch metrics, or `None`.
    """
    if not exception:
        return None
    if not isinstance(exception, _AttributedUpdateError):
        return 'Unknown Error: %s' % type(exception).__name__
    return exception.failure_summary
211
212
213class ChromiumOSProvisioner(object):
214    """Chromium OS specific DUT update functionality."""
215
216    def __init__(self,
217                 update_url,
218                 host=None,
219                 interactive=True,
220                 is_release_bucket=None,
221                 is_servohost=False):
222        """Initializes the object.
223
224        @param update_url: The URL we want the update to use.
225        @param host: A client.common_lib.hosts.Host implementation.
226        @param interactive: Bool whether we are doing an interactive update.
227        @param is_release_bucket: If True, use release bucket
228            gs://chromeos-releases.
229        @param is_servohost: Bool whether the update target is a servohost.
230        """
231        self.update_url = update_url
232        self.host = host
233        self.interactive = interactive
234        self.update_version = _url_to_version(update_url)
235        self._is_release_bucket = is_release_bucket
236        self._is_servohost = is_servohost
237
238    def _run(self, cmd, *args, **kwargs):
239        """Abbreviated form of self.host.run(...)"""
240        return self.host.run(cmd, *args, **kwargs)
241
242    def _rootdev(self, options=''):
243        """Returns the stripped output of rootdev <options>.
244
245        @param options: options to run rootdev.
246
247        """
248        return self._run('rootdev %s' % options).stdout.strip()
249
250    def _reset_update_engine(self):
251        """Resets the host to prepare for a clean update regardless of state."""
252        self._run('stop ui || true')
253        self._run('stop update-engine || true; start update-engine')
254
255    def _reset_stateful_partition(self):
256        """Clear any pending stateful update request."""
257        cmd = ['rm', '-rf']
258        for f in ('var_new', 'dev_image_new', '.update_available'):
259            cmd += [os.path.join('/mnt/stateful_partition', f)]
260        # TODO(b/165024723): This is a temporary measure until we figure out the
261        # root cause of this bug.
262        for f in ('dev_image/share/tast/data', 'dev_image/libexec/tast',
263                  'dev_image/tmp/tast'):
264            cmd += [os.path.join('/mnt/stateful_partition', f)]
265        cmd += [_TARGET_VERSION, '2>&1']
266        self._run(cmd)
267
268    def _set_target_version(self):
269        """Set the "target version" for the update."""
270        # Version strings that come from release buckets do not have RXX- at the
271        # beginning. So remove this prefix only if the version has it.
272        version_number = (self.update_version.split('-')[1] if
273                          '-' in self.update_version else self.update_version)
274        self._run('echo %s > %s' % (version_number, _TARGET_VERSION))
275
276    def _revert_boot_partition(self):
277        """Revert the boot partition."""
278        part = self._rootdev('-s')
279        logging.warning('Reverting update; Boot partition will be %s', part)
280        return self._run('/postinst %s 2>&1' % part)
281
282    def _get_remote_script(self, script_name):
283        """Ensure that `script_name` is present on the DUT.
284
285        The given script (e.g. `quick-provision`) may be present in the
286        stateful partition under /usr/local/bin, or we may have to
287        download it from the devserver.
288
289        Determine whether the script is present or must be downloaded
290        and download if necessary.  Then, return a command fragment
291        sufficient to run the script from whereever it now lives on the
292        DUT.
293
294        @param script_name  The name of the script as expected in
295                            /usr/local/bin and on the devserver.
296        @return A string with the command (minus arguments) that will
297                run the target script.
298        """
299        remote_script = '/usr/local/bin/%s' % script_name
300        if self.host.path_exists(remote_script):
301            return remote_script
302        self.host.run('mkdir -p -m 1777 /usr/local/tmp')
303        remote_tmp_script = '/usr/local/tmp/%s' % script_name
304        server_name = six.moves.urllib.parse.urlparse(self.update_url)[1]
305        script_url = 'http://%s/static/%s' % (server_name, script_name)
306        fetch_script = 'curl -Ss -o %s %s && head -1 %s' % (
307                remote_tmp_script, script_url, remote_tmp_script)
308
309        first_line = self._run(fetch_script).stdout.strip()
310
311        if first_line and first_line.startswith('#!'):
312            script_interpreter = first_line.lstrip('#!')
313            if script_interpreter:
314                return '%s %s' % (script_interpreter, remote_tmp_script)
315        return None
316
317    def _prepare_host(self):
318        """Make sure the target DUT is working and ready for update.
319
320        Initially, the target DUT's state is unknown.  The DUT is
321        expected to be online, but we strive to be forgiving if Chrome
322        and/or the update engine aren't fully functional.
323        """
324        # Summary of work, and the rationale:
325        #  1. Reboot, because it's a good way to clear out problems.
326        #  2. Touch the PROVISION_FAILED file, to allow repair to detect
327        #     failure later.
328        #  3. Run the hook for host class specific preparation.
329        #  4. Stop Chrome, because the system is designed to eventually
330        #     reboot if Chrome is stuck in a crash loop.
331        #  5. Force `update-engine` to start, because if Chrome failed
332        #     to start properly, the status of the `update-engine` job
333        #     will be uncertain.
334        if not self.host.is_up():
335            raise HostUpdateError(self.host.hostname, HostUpdateError.DUT_DOWN)
336        self._reset_stateful_partition()
337        # Servohost reboot logic is handled by themselves.
338        if not self._is_servohost:
339            self.host.reboot(timeout=self.host.REBOOT_TIMEOUT)
340            self._run('touch %s' % PROVISION_FAILED)
341        self.host.prepare_for_update()
342        # Servohost will only update via quick provision.
343        if not self._is_servohost:
344            self._reset_update_engine()
345        logging.info('Updating from version %s to %s.',
346                     self.host.get_release_version(), self.update_version)
347
348    def _quick_provision_with_gs_cache(self, provision_command, devserver_name,
349                                       image_name):
350        """Run quick_provision using GsCache server.
351
352        @param provision_command: The path of quick_provision command.
353        @param devserver_name: The devserver name and port (optional).
354        @param image_name: The image to be installed.
355        """
356        logging.info('Try quick provision with gs_cache.')
357        # If enabled, GsCache server listion on different port on the
358        # devserver.
359        gs_cache_server = devserver_name.replace(DEVSERVER_PORT, GS_CACHE_PORT)
360        gs_cache_url = (
361                'http://%s/download/%s' %
362                (gs_cache_server, 'chromeos-releases'
363                 if self._is_release_bucket else 'chromeos-image-archive'))
364
365        # Check if GS_Cache server is enabled on the server.
366        self._run('curl -s -o /dev/null %s' % gs_cache_url)
367
368        command = '%s --noreboot %s %s' % (provision_command, image_name,
369                                           gs_cache_url)
370        self._run(command)
371        metrics.Counter(
372                _metric_name('quick_provision')).increment(fields={
373                        'devserver': devserver_name,
374                        'gs_cache': True
375                })
376
377    def _quick_provision_with_devserver(self, provision_command,
378                                        devserver_name, image_name):
379        """Run quick_provision using legacy devserver.
380
381        @param provision_command: The path of quick_provision command.
382        @param devserver_name: The devserver name and port (optional).
383        @param image_name: The image to be installed.
384        """
385        logging.info('Try quick provision with devserver.')
386        ds = dev_server.ImageServer('http://%s' % devserver_name)
387        archive_url = ('gs://chromeos-releases/%s' %
388                       image_name if self._is_release_bucket else None)
389        try:
390            ds.stage_artifacts(
391                    image_name,
392                    ['quick_provision', 'stateful', 'autotest_packages'],
393                    archive_url=archive_url)
394        except dev_server.DevServerException as e:
395            six.reraise(error.TestFail, str(e), sys.exc_info()[2])
396
397        static_url = 'http://%s/static' % devserver_name
398        command = '%s --noreboot %s %s' % (provision_command, image_name,
399                                           static_url)
400        self._run(command)
401        metrics.Counter(
402                _metric_name('quick_provision')).increment(fields={
403                        'devserver': devserver_name,
404                        'gs_cache': False
405                })
406
407    def _install_update(self):
408        """Install an updating using the `quick-provision` script.
409
410        This uses the `quick-provision` script to download and install
411        a root FS, kernel and stateful filesystem content.
412
413        @return The kernel expected to be booted next.
414        """
415        logging.info('Installing image at %s onto %s', self.update_url,
416                     self.host.hostname)
417        server_name = six.moves.urllib.parse.urlparse(self.update_url)[1]
418        image_name = url_to_image_name(self.update_url)
419
420        logging.info('Installing image using quick-provision.')
421        provision_command = self._get_remote_script(_QUICK_PROVISION_SCRIPT)
422        try:
423            try:
424                self._quick_provision_with_gs_cache(provision_command,
425                                                    server_name, image_name)
426            except Exception as e:
427                logging.error(
428                        'Failed to quick-provision with gscache with '
429                        'error %s', e)
430                self._quick_provision_with_devserver(provision_command,
431                                                     server_name, image_name)
432
433            self._set_target_version()
434            return kernel_utils.verify_kernel_state_after_update(self.host)
435        except Exception:
436            # N.B.  We handle only `Exception` here.  Non-Exception
437            # classes (such as KeyboardInterrupt) are handled by our
438            # caller.
439            logging.exception('quick-provision script failed;')
440            self._revert_boot_partition()
441            self._reset_stateful_partition()
442            self._reset_update_engine()
443            return None
444
445    def _complete_update(self, expected_kernel):
446        """Finish the update, and confirm that it succeeded.
447
448        Initial condition is that the target build has been downloaded
449        and installed on the DUT, but has not yet been booted.  This
450        function is responsible for rebooting the DUT, and checking that
451        the new build is running successfully.
452
453        @param expected_kernel: kernel expected to be active after reboot.
454        """
455        # Regarding the 'crossystem' command below: In some cases,
456        # the update flow puts the TPM into a state such that it
457        # fails verification.  We don't know why.  However, this
458        # call papers over the problem by clearing the TPM during
459        # the reboot.
460        #
461        # We ignore failures from 'crossystem'.  Although failure
462        # here is unexpected, and could signal a bug, the point of
463        # the exercise is to paper over problems; allowing this to
464        # fail would defeat the purpose.
465        self._run('crossystem clear_tpm_owner_request=1', ignore_status=True)
466        self.host.reboot(timeout=self.host.REBOOT_TIMEOUT)
467
468        # Touch the lab machine file to leave a marker that
469        # distinguishes this image from other test images.
470        # Afterwards, we must re-run the autoreboot script because
471        # it depends on the _LAB_MACHINE_FILE.
472        autoreboot_cmd = ('FILE="%s" ; [ -f "$FILE" ] || '
473                          '( touch "$FILE" ; start autoreboot )')
474        self._run(autoreboot_cmd % _LAB_MACHINE_FILE)
475        try:
476            kernel_utils.verify_boot_expectations(
477                    expected_kernel, NewBuildUpdateError.ROLLBACK_FAILURE,
478                    self.host)
479        except Exception:
480            # When the system is rolled back, the provision_failed file is
481            # removed. So add it back here and re-raise the exception.
482            self._run('touch %s' % PROVISION_FAILED)
483            raise
484
485        logging.debug('Cleaning up old autotest directories.')
486        try:
487            installed_autodir = autotest.Autotest.get_installed_autodir(
488                    self.host)
489            self._run('rm -rf ' + installed_autodir)
490        except autotest.AutodirNotFoundError:
491            logging.debug('No autotest installed directory found.')
492
493    def run_provision(self):
494        """Perform a full provision of a DUT in the test lab.
495
496        This downloads and installs the root FS and stateful partition
497        content needed for the update specified in `self.host` and
498        `self.update_url`.  The provision is performed according to the
499        requirements for provisioning a DUT for testing the requested
500        build.
501
502        At the end of the procedure, metrics are reported describing the
503        outcome of the operation.
504
505        @returns A tuple of the form `(image_name, attributes)`, where
506            `image_name` is the name of the image installed, and
507            `attributes` is new attributes to be applied to the DUT.
508        """
509        server_name = dev_server.get_resolved_hostname(self.update_url)
510        metrics.Counter(_metric_name('install')).increment(
511                fields={'devserver': server_name})
512
513        try:
514            self._prepare_host()
515        except _AttributedUpdateError:
516            raise
517        except Exception as e:
518            logging.exception('Failure preparing host prior to update.')
519            raise HostUpdateError(self.host.hostname, str(e))
520
521        try:
522            expected_kernel = self._install_update()
523        except _AttributedUpdateError:
524            raise
525        except Exception as e:
526            logging.exception('Failure during download and install.')
527            raise ImageInstallError(self.host.hostname, server_name, str(e))
528
529        # Servohost will handle post update process themselves.
530        if not self._is_servohost:
531            try:
532                self._complete_update(expected_kernel)
533            except _AttributedUpdateError:
534                raise
535            except Exception as e:
536                logging.exception('Failure from build after update.')
537                raise NewBuildUpdateError(self.update_version, str(e))
538
539        image_name = url_to_image_name(self.update_url)
540        # update_url is different from devserver url needed to stage autotest
541        # packages, therefore, resolve a new devserver url here.
542        devserver_url = dev_server.ImageServer.resolve(
543                image_name, self.host.hostname).url()
544        repo_url = tools.get_package_url(devserver_url, image_name)
545        return image_name, {ds_constants.JOB_REPO_URL: repo_url}
546