1# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""
6Framework for host verification and repair in Autotest.
7
8The framework provides implementation code in support of `Host.verify()`
9and `Host.repair()` used in Verify and Repair special tasks.
10
11The framework consists of these classes:
12  * `Verifier`: A class representing a single verification check.
13  * `RepairAction`: A class representing a repair operation that can fix
14    a failed verification check.
15  * `RepairStrategy`:  A class for organizing a collection of `Verifier`
16    and `RepairAction` instances, and invoking them in order.
17
18Individual operations during verification and repair are handled by
19instances of `Verifier` and `RepairAction`.  `Verifier` objects are
20meant to test for specific conditions that may cause tests to fail.
21`RepairAction` objects provide operations designed to fix one or
22more failures identified by a `Verifier` object.
23"""
24
25import collections
26import logging
27
28import common
29from autotest_lib.client.common_lib import error
30
31
class AutoservVerifyError(error.AutoservError):
    """
    Catch-all exception for a failed `Verifier` check.

    A `verify()` implementation may raise an instance of this class
    when it detects a problem and no more specific exception type is
    available.
    """
40
41
# Record of one failed dependency:  `dependency` is the description of
# the verifier that failed, and `error` is the text of the exception
# that it raised.
_DependencyFailure = collections.namedtuple(
        '_DependencyFailure', ['dependency', 'error'])
44
45
class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception raised for failures in dependencies.

    This exception separates an original failure from a failure that
    is merely being passed back up from a verification dependency.
    That is, if 'B' depends on 'A', and 'A' fails, then 'B' raises
    this exception to signal that the original failure lies further
    down the dependency chain.

    The `failures` argument to the constructor is a collection of
    `_DependencyFailure` instances, one for each failed dependency:
      * The `dependency` attribute of each failure is the description
        of the failed dependency.
      * The `error` attribute of each failure is the string value of
        the exception from the failed dependency.

    The text of this exception is the `error` strings of all of the
    failures, joined one per line.

    Multiple methods in this module recognize and handle this exception
    specially.

    @property failures  Collection of failures passed to the
                        constructor.
    @property _node     Instance of `_DependencyNode` reporting the
                        failed dependencies.
    """
    def __init__(self, node, failures):
        """
        Constructor for `AutoservVerifyDependencyError`.

        @param node       Instance of _DependencyNode reporting the
                          failed dependencies.
        @param failures   Collection of `_DependencyFailure` tuples,
                          as described above.
        """
        message = '\n'.join(f.error for f in failures)
        super(AutoservVerifyDependencyError, self).__init__(message)
        self._node = node
        self.failures = failures


    def log_dependencies(self, action, deps):
        """
        Log an `AutoservVerifyDependencyError`.

        Write a short summary of the dependency failures captured in
        this exception, using standard Python logging.

        The `action` string, followed by `self._node.description`, is
        logged at INFO level.  The `action` argument should introduce
        or describe an action relative to `self._node`.

        The `deps` string and the description of each failed
        dependency in `self` are logged at DEBUG level.  The `deps`
        argument introduces the list of failed dependencies.

        @param action   A string mentioning the action being logged
                        relative to `self._node`.
        @param deps     A string introducing the dependencies that
                        failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        for failed in self.failures:
            logging.debug('    %s', failed.dependency)
109
110
class AutoservRepairError(error.AutoservError):
    """
    Catch-all exception for a failed `RepairAction`.

    A `repair()` implementation may raise an instance of this class
    when the repair fails and no more specific exception type is
    available.
    """
119
120
class _DependencyNode(object):
    """
    Common base for objects that depend on verifiers.

    Verification and repair operations both have prerequisites:
    verifiers that must pass before the operation itself proceeds.
    This class holds the behavior shared by both kinds of operation.

    @property tag               Short identifier to be used in logging.
    @property description       Text summary of this node's action, to be
                                used in debug logs.
    @property _dependency_list  Dependency pre-requisites.
    """

    def __init__(self, tag, dependencies):
        """
        Remember the tag and the dependencies of this node.

        @param tag           Short identifier returned by the `tag`
                             property.
        @param dependencies  Verifiers to be checked by
                             `_verify_dependencies()`.
        """
        self._tag = tag
        self._dependency_list = dependencies


    def _record(self, host, silent, *record_args):
        """
        Log a status record for `host`.

        Forward `record_args` to `host.record()`, unless `silent`
        asks for the record to be suppressed.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param record_args  Arguments to pass to `host.record()`.
        """
        if silent:
            return
        host.record(*record_args)


    def _verify_list(self, host, verifiers, silent):
        """
        Test a list of verifiers against a given host.

        Call `_verify_host()` on every verifier in the list; a failure
        does not stop the remaining verifiers from running.  If any
        verifier in the transitive closure of dependencies in the list
        fails, raise an `AutoservVerifyDependencyError` holding the
        description of each failed verifier.  Only original failures
        are reported; verifiers that don't run due to a failed
        dependency are omitted.

        By design, original failures are logged once in `_verify_host()`
        when `verify()` originally fails.  The additional data gathered
        here is for the debug logs to indicate why a subsequent
        operation never ran.

        @param host       The host to be tested against the verifiers.
        @param verifiers  List of verifiers to be checked.
        @param silent     If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when at least
                        one verifier in the list has failed.
        """
        original_failures = set()
        for verifier in verifiers:
            try:
                verifier._verify_host(host, silent)
            except AutoservVerifyDependencyError as dep_error:
                # The verifier itself never ran; pass along the
                # original failures found below it.
                original_failures.update(dep_error.failures)
            except Exception as verify_error:
                # The verifier's own check failed; this is an
                # original failure.
                original_failures.add(
                        _DependencyFailure(verifier.description,
                                           str(verify_error)))
        if original_failures:
            raise AutoservVerifyDependencyError(self, original_failures)


    def _verify_dependencies(self, host, silent):
        """
        Verify that all of this node's dependencies pass for a host.

        If any dependency fails, the failed dependencies are logged,
        and the `AutoservVerifyDependencyError` is passed on to the
        caller.

        @param host     The host to be verified.
        @param silent   If true, don't log host status records.
        """
        try:
            self._verify_list(host, self._dependency_list, silent)
        except AutoservVerifyDependencyError as dep_error:
            dep_error.log_dependencies(
                    'Skipping this operation',
                    'The following dependencies failed')
            raise


    @property
    def tag(self):
        """
        Tag for use in logging status records.

        This is a property with a short string used to identify the node
        in the 'status.log' file and during node construction.  The tag
        should contain only letters, digits, and '_' characters.  This
        tag is not used alone, but is combined with other identifiers,
        based on the operation being logged.

        @return A short identifier-like string.
        """
        return self._tag


    @property
    def description(self):
        """
        Text description of this node for log messages.

        This string will be logged with failures, and should describe
        the condition required for success.

        N.B. Subclasses are required to override this method, but we
        _don't_ raise NotImplementedError here.  Various methods fail in
        inscrutable ways if this method raises any exception, so for
        debugging purposes, it's better to return a default value.

        @return A descriptive string.
        """
        class_name = type(self).__name__
        return 'Class %s fails to implement description().' % class_name
240
241
class Verifier(_DependencyNode):
    """
    Abstract class embodying one verification check.

    A concrete subclass of `Verifier` provides a simple check that can
    determine a host's fitness for testing.  Failure indicates that the
    check found a problem that can cause at least one test to fail.

    `Verifier` objects are organized in a DAG identifying dependencies
    among operations.  The DAG controls ordering and prevents wasted
    effort:  If verification operation V2 requires that verification
    operation V1 pass, then a) V1 will run before V2, and b) if V1
    fails, V2 won't run at all.  The `_verify_host()` method ensures
    that all dependencies run and pass before invoking the `verify()`
    method.

    A `Verifier` object caches its result the first time it calls
    `verify()`.  Subsequent calls return the cached result, without
    re-running the check code.  The `_reverify()` method clears the
    cached result in the current node, and in all dependencies.

    Subclasses must supply these properties and methods:
      * `verify()`: This is the method to perform the actual
        verification check.
      * `description`:  A one-line summary of the verification check for
        debug log messages.

    Subclasses must override all of the above attributes; subclasses
    should not override or extend any other attributes of this class.

    The description string should be a simple sentence explaining what
    must be true for the verifier to pass.  Do not include a terminating
    period.  For example:

        Host is available via ssh

    The base class manages the following private data:
      * `_result`:  The cached result of verification.
      * `_dependency_list`:  The list of dependencies.
    Subclasses should not use these attributes.

    @property _result           Cached result of verification.
    """

    def __init__(self, tag, dependencies):
        """
        Construct a verifier with no cached result.

        @param tag           Short identifier for this verifier; it is
                             prefixed with 'verify.' in status records.
        @param dependencies  Verifiers that must pass before this
                             verifier's check is run.
        """
        super(Verifier, self).__init__(tag, dependencies)
        self._verify_tag = 'verify.' + self.tag
        self._result = None


    def _reverify(self):
        """
        Discard cached verification results.

        Reset the cached verification result for this node, and for the
        transitive closure of all dependencies.  A node that holds no
        result is left alone, and its dependencies are not visited.
        """
        if self._result is None:
            return
        self._result = None
        for dependency in self._dependency_list:
            dependency._reverify()


    def _verify_host(self, host, silent):
        """
        Determine the result of verification, and log results.

        If this verifier does not have a cached verification result,
        check dependencies, and if they pass, run `verify()`.  Log
        informational messages regarding failed dependencies.  If we
        call `verify()`, log the result in `status.log`.

        If we already have a cached result, return that result without
        logging any message.

        @param host     The host to be tested for a problem.
        @param silent   If true, don't log host status records.
        """
        cached = self._result
        if isinstance(cached, Exception):
            raise cached        # cached failure
        if cached:
            return              # cached success
        # Mark this node as visited without deciding its outcome:  the
        # value is not `None`, so `_reverify()` will still descend into
        # the dependencies, but it is false, so a later call will try
        # the check again.
        self._result = False
        self._verify_dependencies(host, silent)
        logging.info('Verifying this condition: %s', self.description)
        try:
            self.verify(host)
            self._record(host, silent, 'GOOD', None, self._verify_tag)
        except Exception as failure:
            logging.exception('Failed: %s', self.description)
            self._result = failure
            self._record(host, silent, 'FAIL', None,
                         self._verify_tag, str(failure))
            raise
        else:
            self._result = True


    def verify(self, host):
        """
        Unconditionally perform a verification check.

        This method is responsible for testing for a single problem on a
        host.  Implementations should follow these guidelines:
          * The check should find a problem that will cause testing to
            fail.
          * Verification checks on a working system should run quickly
            and should be optimized for success; a check that passes
            should finish within seconds.
          * Verification checks are not expected to have side effects,
            but may apply trivial fixes if they will finish within the
            time constraints above.

        A verification check should normally trigger a single set of
        repair actions.  If two different failures can require two
        different repairs, ideally they should use two different
        subclasses of `Verifier`.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method finds no problems, it returns without raising any
        exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be tested for a problem.
        """
        class_name = type(self).__name__
        raise NotImplementedError(
                'Class %s does not implement verify()' % class_name)
376
377
class RepairAction(_DependencyNode):
    """
    Abstract class embodying one repair procedure.

    A `RepairAction` is responsible for fixing one or more failed
    `Verifier` checks, in order to make those checks pass.

    Each repair action includes one or more verifier triggers that
    determine when the repair action should run.  A repair action
    will call its `repair()` method if one or more of its triggers
    fails.  A repair action is successful if all of its triggers pass
    after calling `repair()`.

    A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    repair action's dependencies fail, the action does not check its
    triggers, and doesn't call `repair()`.

    Subclasses must supply these attributes:
      * `repair()`: This is the method to perform the necessary
        repair.  The method should avoid most logging actions, but
        can log DEBUG level messages if they provide significant
        information for diagnosing failures.
      * `description`:  A one-line summary of the repair action for
        debug log messages.

    Subclasses must override both of the above attributes and should
    not override any other attributes of this class.

    The description string should be a simple sentence explaining the
    operation that will be performed.  Do not include a terminating
    period.  For example:

        Re-install the stable build via AU

    @property _trigger_list   List of verification checks that will
                              trigger this repair when they fail.
    """

    def __init__(self, tag, dependencies, triggers):
        """
        Construct a repair action.

        @param tag           Short identifier for this action; it is
                             prefixed with 'repair.' in status records.
        @param dependencies  Verifiers that must pass before this
                             action checks its triggers.
        @param triggers      Verifiers whose failure causes `repair()`
                             to be called.
        """
        super(RepairAction, self).__init__(tag, dependencies)
        self._repair_tag = 'repair.' + self.tag
        self._trigger_list = triggers


    def _repair_host(self, host, silent):
        """
        Apply this repair action if any triggers fail.

        Repair is triggered when all dependencies are successful, and at
        least one trigger fails.

        If the `repair()` method triggers, the success or failure of
        this operation is logged in `status.log` bracketed by 'START'
        and 'END' records.  Details of whether or why `repair()`
        triggered are written to the debug logs.   If repair doesn't
        trigger, nothing is logged to `status.log`.

        @param host     The host to be repaired.
        @param silent   If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when a
                        dependency of this action fails.
        @raises AutoservRepairError   Raised when `repair()` returns
                        normally, but a trigger still fails.
        """
        self._verify_dependencies(host, silent)
        try:
            self._verify_list(host, self._trigger_list, silent)
        except AutoservVerifyDependencyError as trigger_failure:
            # At least one trigger failed, so the repair must run.
            trigger_failure.log_dependencies(
                    'Attempting this repair action',
                    'Repairing because these triggers failed')
            self._record(host, silent, 'START', None, self._repair_tag)
            try:
                self.repair(host)
            except Exception as repair_failure:
                logging.exception('Repair failed: %s', self.description)
                self._record(host, silent, 'FAIL', None,
                             self._repair_tag, str(repair_failure))
                self._record(host, silent, 'END FAIL', None,
                             self._repair_tag)
                raise
            try:
                # Drop cached results, so that the triggers are
                # re-tested against the repaired host.
                for trigger in self._trigger_list:
                    trigger._reverify()
                self._verify_list(host, self._trigger_list, silent)
                self._record(host, silent, 'END GOOD', None,
                             self._repair_tag)
            except AutoservVerifyDependencyError as still_failing:
                still_failing.log_dependencies(
                        'This repair action reported success',
                        'However, these triggers still fail')
                self._record(host, silent, 'END FAIL', None,
                             self._repair_tag)
                raise AutoservRepairError(
                        'Some verification checks still fail')
            except Exception:
                # The specification for `self._verify_list()` says
                # that this can't happen; this is a defensive
                # precaution.
                self._record(host, silent, 'END FAIL', None,
                             self._repair_tag,
                             'Internal error in repair')
                raise
        else:
            logging.info('No failed triggers, skipping repair:  %s',
                         self.description)


    def repair(self, host):
        """
        Apply this repair action to the given host.

        This method is responsible for applying changes to fix failures
        in one or more verification checks.  The repair is considered
        successful if the DUT passes the specific checks after this
        method completes.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method completes successfully, it returns without
        raising any exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be repaired.
        """
        class_name = type(self).__name__
        raise NotImplementedError(
                'Class %s does not implement repair()' % class_name)
507
508
class _RootVerifier(Verifier):
    """
    Utility class used by `RepairStrategy`.

    A node of this class performs no check of its own; whenever its
    dependencies allow it to run, it passes.  Its only purpose is to
    serve as the root of the DAG of dependencies in an instance of
    `RepairStrategy`.
    """

    def verify(self, host):
        """
        Do nothing; the root node has no check of its own.

        @param host   Unused.
        """


    @property
    def description(self):
        """
        Text description of the root node for log messages.

        @return A descriptive string.
        """
        return 'All host verification checks pass'
525
526
527
class RepairStrategy(object):
    # N.B. The docstring below is a raw string because the DAG diagram
    # in it contains backslashes.
    r"""
    A class for organizing `Verifier` and `RepairAction` objects.

    An instance of `RepairStrategy` is organized as a DAG of `Verifier`
    objects, plus a list of `RepairAction` objects.  The class provides
    methods for invoking those objects in the required order, when
    needed:
      * The `verify()` method walks the verifier DAG in dependency
        order.
      * The `repair()` method invokes the repair actions in list order.
        Each repair action will invoke its dependencies and triggers as
        needed.

    # The Verifier DAG
    The verifier DAG is constructed from the first argument passed to
    the `RepairStrategy` constructor.  That argument is an iterable
    consisting of three-element tuples in the form
    `(constructor, tag, deps)`:
      * The `constructor` value is a callable that creates a `Verifier`
        as for the interface of the class constructor.  For classes
        that inherit the default constructor from `Verifier`, this can
        be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        verifier.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier`
        dependency.

    The tag names of verifiers in the constructed DAG must all be
    unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
    reserved and may not be used by any verifier.

    In the input data for the constructor, dependencies must appear
    before the nodes that depend on them.  Thus:

        ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
        ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!

    Internally, the DAG of verifiers is given a unique root node.  So,
    given this input:

        ((C, 'c', ()),
         (A, 'a', ('c',)),
         (B, 'b', ('c',)))

    The following DAG is constructed:

          Root
          /  \
         A    B
          \  /
           C

    Since nothing depends on `A` or `B`, the root node guarantees that
    these two verifiers will both be called and properly logged.

    The root node is not directly accessible; however repair actions can
    trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
    node will be logged in `status.log` whenever `verify()` succeeds.

    # The Repair Actions List
    The list of repair actions is constructed from the second argument
    passed to the `RepairStrategy` constructor.  That argument is an
    iterable consisting of four-element tuples in the form
    `(constructor, tag, deps, triggers)`:
      * The `constructor` value is a callable that creates a
        `RepairAction` as for the interface of the class constructor.
        For classes that inherit the default constructor from
        `RepairAction`, this can be the class itself.
      * The `tag` value is the tag to be associated with the constructed
        repair action.
      * The `deps` value is an iterable (e.g. list or tuple) of strings.
        Each string corresponds to the `tag` member of a `Verifier` that
        the repair action depends on.
      * The `triggers` value is an iterable (e.g. list or tuple) of
        strings.  Each string corresponds to the `tag` member of a
        `Verifier` that can trigger the repair action.

    `RepairStrategy` deps and triggers can only refer to verifiers,
    not to other repair actions.
    """

    # This name is reserved; clients may not use it.
    ROOT_TAG = 'PASS'

    @staticmethod
    def _add_verifier(verifiers, constructor, tag, dep_tags):
        """
        Construct and remember a verifier.

        Create a `Verifier` using `constructor` and `tag`.  Dependencies
        for construction are found by looking up `dep_tags` in the
        `verifiers` dictionary.

        After construction, the new verifier is added to `verifiers`.

        @param verifiers    Dictionary of verifiers, indexed by tag.
        @param constructor  Verifier construction function.
        @param tag          Tag parameter for the construction function.
        @param dep_tags     Tags of dependencies for the constructor, to
                            be found in `verifiers`.

        @raises AssertionError  Raised when `tag` is already present
                                in `verifiers`.
        @raises KeyError        Raised when a tag in `dep_tags` is not
                                present in `verifiers`.
        """
        if tag in verifiers:
            # Raise explicitly instead of using an `assert` statement,
            # so that the check still happens under `python -O`.  The
            # exception type is the one `assert` would have raised.
            raise AssertionError('Duplicate verifier tag: %r' % (tag,))
        deps = [verifiers[d] for d in dep_tags]
        verifiers[tag] = constructor(tag, deps)


    def __init__(self, verifier_data, repair_data):
        """
        Construct a `RepairStrategy` from simplified DAG data.

        The input `verifier_data` object describes how to construct
        verify nodes and the dependencies that relate them, as detailed
        above.

        The input `repair_data` object describes how to construct repair
        actions and their dependencies and triggers, as detailed above.

        @param verifier_data  Iterable value with constructors for the
                              elements of the verification DAG and their
                              dependencies.
        @param repair_data    Iterable value with constructors for the
                              elements of the repair action list, and
                              their dependencies and triggers.
        """
        # We use the `all_tags` list to guarantee that our root
        # verifier will execute its dependencies in the order provided
        # to us by our caller.
        verifier_map = {}
        all_tags = []
        dependencies = set()
        for constructor, tag, deps in verifier_data:
            self._add_verifier(verifier_map, constructor, tag, deps)
            dependencies.update(deps)
            all_tags.append(tag)
        # Capture all the verifiers that have nothing depending on them.
        root_tags = [t for t in all_tags if t not in dependencies]
        self._add_verifier(verifier_map, _RootVerifier,
                           self.ROOT_TAG, root_tags)
        self._verify_root = verifier_map[self.ROOT_TAG]
        self._repair_actions = []
        for constructor, tag, deps, triggers in repair_data:
            r = constructor(tag,
                            [verifier_map[d] for d in deps],
                            [verifier_map[t] for t in triggers])
            self._repair_actions.append(r)


    def verify(self, host, silent=False):
        """
        Run the verifier DAG on the given host.

        Cached results are discarded first, so every check is run
        afresh.  Any exception raised by the root verifier is passed
        on to the caller.

        @param host     The target to be verified.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        self._verify_root._verify_host(host, silent)


    def repair(self, host, silent=False):
        """
        Run the repair list on the given host.

        Cached verification results are discarded, then every repair
        action is given a chance to run, in list order.  A failure in
        one repair action doesn't prevent the later actions from being
        tried.  Finally, the verifier DAG is run; any exception from
        that final verification is passed on to the caller.

        @param host     The target to be repaired.
        @param silent   If true, don't log host status records.
        """
        self._verify_root._reverify()
        for ra in self._repair_actions:
            try:
                ra._repair_host(host, silent)
            except Exception:
                # all logging and exception handling was done at
                # lower levels
                pass
        self._verify_root._verify_host(host, silent)
704