1# Copyright 2016 The Chromium OS Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""
6Framework for host verification and repair in Autotest.
7
8The framework provides implementation code in support of `Host.verify()`
9and `Host.repair()` used in Verify and Repair special tasks.
10
11The framework consists of these classes:
12  * `Verifier`: A class representing a single verification check.
13  * `RepairAction`: A class representing a repair operation that can fix
14    a failed verification check.
15  * `RepairStrategy`:  A class for organizing a collection of `Verifier`
16    and `RepairAction` instances, and invoking them in order.
17
18Individual operations during verification and repair are handled by
19instances of `Verifier` and `RepairAction`.  `Verifier` objects are
20meant to test for specific conditions that may cause tests to fail.
21`RepairAction` objects provide operations designed to fix one or
22more failures identified by a `Verifier` object.
23"""
24
25import collections
26import logging
27import re
28
29import common
30from autotest_lib.client.common_lib import error
31
32try:
33    from chromite.lib import metrics
34except ImportError:
35    from autotest_lib.client.bin.utils import metrics_mock as metrics
36
# Regular expression pattern used to filter out unwanted hostnames.
# NOTE(review): presumably consumed by `_filter_metrics_hostname()` (not
# visible in this section), with `_DISALLOWED_HOSTNAME` reported in place
# of any hostname that does not match the pattern -- confirm.
_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+'
_DISALLOWED_HOSTNAME = 'disallowed_hostname'
40
41
class AutoservVerifyError(error.AutoservError):
    """
    Generic exception for failures from `Verifier` objects.

    A `verify()` method can raise an instance of this class to report
    a failed check when no more specific exception type is available.
    """
50
51
# Record of one failed dependency:
#   * `dependency`:  description of the verifier that failed.
#   * `error`:  string value of the exception raised by that verifier.
#   * `tag`:  tag of the verifier that failed.
_DependencyFailure = collections.namedtuple(
        '_DependencyFailure', ['dependency', 'error', 'tag'])
54
55
class AutoservVerifyDependencyError(error.AutoservError):
    """
    Exception raised when dependencies fail.

    This exception distinguishes an original failure from a failure
    that is merely being passed back from a verification dependency.
    That is, if 'B' depends on 'A', and 'A' fails, 'B' raises this
    exception to signal that the original failure is further down the
    dependency chain.

    The `failures` argument to the constructor is a collection of
    `_DependencyFailure` instances, one for each failed dependency:
      * The `dependency` attribute is the description of the failed
        dependency.
      * The `error` attribute is the string value of the exception
        from the failed dependency.
      * The `tag` attribute is the tag of the failed dependency.

    The message of this exception consists of the `error` strings of
    all the failures, one per line.

    Multiple methods in this module recognize and handle this exception
    specially.

    @property failures  Failures passed to the constructor.
    @property _node     Instance of `_DependencyNode` reporting the
                        failed dependencies.
    """

    def __init__(self, node, failures):
        """
        Constructor for `AutoservVerifyDependencyError`.

        @param node       Instance of _DependencyNode reporting the
                          failed dependencies.
        @param failures   Collection of `_DependencyFailure` tuples,
                          as described in the class documentation.
        """
        message = '\n'.join(failure.error for failure in failures)
        super(AutoservVerifyDependencyError, self).__init__(message)
        self.failures = failures
        self._node = node

    def log_dependencies(self, action, deps):
        """
        Write a short summary of this exception to the logs.

        Two kinds of message are emitted with standard Python logging:
          * At INFO level, one line made of `action` followed by
            `self._node.description`.  The `action` string should
            introduce or describe an action relative to `self._node`.
          * At DEBUG level, the `deps` string as a heading, followed by
            the description of each failed dependency captured in this
            exception, one per line.

        @param action   A string mentioning the action being logged
                        relative to `self._node`.
        @param deps     A string introducing the dependencies that
                        failed.
        """
        logging.info('%s: %s', action, self._node.description)
        logging.debug('%s:', deps)
        for failed in self.failures:
            logging.debug('    %s', failed.dependency)
119
120
class AutoservRepairError(error.AutoservError):
    """
    Generic exception for failures from `RepairAction` objects.

    A `repair()` method can raise an instance of this class to report
    a failed repair when no more specific exception type is available.

    @property tag   Short identifier of the failure, used for metrics.
    """

    def __init__(self, description, tag):
        """
        Constructor for `AutoservRepairError`.

        @param description  Message describing the exception.
        @param tag          A short identifier used for metrics
                            purposes.
        """
        super(AutoservRepairError, self).__init__(description)
        self.tag = tag
135
136
class _DependencyNode(object):
    """
    Common base for objects that depend on verifiers.

    Verify and repair operations both have prerequisites:  a list of
    verifiers that must pass before the operation itself may proceed.
    This class holds the behavior the two kinds of operation share,
    namely checking dependencies and writing status records.

    @property tag               Short identifier to be used in logging.
    @property description       Text summary of this node's action, to be
                                used in debug logs.
    @property _dependency_list  Dependency pre-requisites.
    """

    def __init__(self, tag, record_type, dependencies):
        """
        Constructor for `_DependencyNode`.

        @param tag           Short identifier for this node.
        @param record_type   Prefix that is joined to `tag` with a '.'
                             to name this node in status records.
        @param dependencies  Verifiers that this node depends on.
        """
        self._tag = tag
        self._record_tag = '.'.join((record_type, tag))
        self._dependency_list = dependencies

    def _record(self, host, silent, status_code, *record_args):
        """
        Write a status record for `host`, unless silenced.

        When `silent` is false, this calls `host.record()` with the
        given `status_code`, a subdirectory of `None`, the operation
        name `self._record_tag`, and then any extra arguments given in
        `record_args`.  When `silent` is true, nothing happens.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status_code  Value for the `status_code` parameter to
                            `host.record()`.
        @param record_args  Additional arguments to pass to
                            `host.record()`.
        """
        if silent:
            return
        host.record(status_code, None, self._record_tag, *record_args)

    def _record_good(self, host, silent):
        """Write a 'GOOD' status record.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'GOOD')

    def _record_fail(self, host, silent, exc):
        """Write a 'FAIL' status record.

        The string value of `exc` is recorded as the failure reason.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param exc          Exception describing the cause of failure.
        """
        self._record(host, silent, 'FAIL', str(exc))

    def _verify_list(self, host, verifiers, silent):
        """
        Run every verifier in a list against a host.

        Each verifier's `_verify_host()` is called in turn; a failure
        does not stop the remaining verifiers from running.  Failures
        are collected as `_DependencyFailure` tuples:
          * A verifier that raises `AutoservVerifyDependencyError` was
            blocked by its own dependencies; the original failures it
            carries are merged in, and the blocked verifier itself is
            not reported.
          * A verifier that raises any other exception is an original
            failure, and is reported with its description, the string
            value of the exception, and its tag.

        By design, original failures are logged once in `_verify_host()`
        when `verify()` originally fails.  The data gathered here is for
        the debug logs, to show why a subsequent operation never ran.

        @param host       The host to be tested against the verifiers.
        @param verifiers  List of verifiers to be checked.
        @param silent     If true, don't log host status records.

        @raises AutoservVerifyDependencyError   Raised when at least
                        one verifier in the list has failed.
        """
        failed = set()
        for verifier in verifiers:
            try:
                verifier._verify_host(host, silent)
            except AutoservVerifyDependencyError as blocked:
                failed.update(blocked.failures)
            except Exception as original:
                failed.add(_DependencyFailure(
                        verifier.description, str(original), verifier.tag))
        if failed:
            raise AutoservVerifyDependencyError(self, failed)

    def _verify_dependencies(self, host, silent):
        """
        Check that every dependency of this node passes for a host.

        If any dependency fails, the failed dependencies are logged,
        and the `AutoservVerifyDependencyError` is passed on to the
        caller.

        @param host     The host to be verified.
        @param silent   If true, don't log host status records.
        """
        try:
            self._verify_list(host, self._dependency_list, silent)
        except AutoservVerifyDependencyError as blocked:
            blocked.log_dependencies('Skipping this operation',
                                     'The following dependencies failed')
            raise

    @property
    def tag(self):
        """
        Tag for use in logging status records.

        This is a short string that identifies the node in the
        'status.log' file and during node construction.  The tag should
        contain only letters, digits, and '_' characters.  The tag is
        not used alone; it is combined with other identifiers, based on
        the operation being logged.

        @return A short identifier-like string.
        """
        return self._tag

    @property
    def description(self):
        """
        Text description of this node for log messages.

        The string is logged with failures, and should describe the
        condition required for success.

        N.B. Subclasses are required to override this property, but we
        _don't_ raise NotImplementedError here.  Various methods fail in
        inscrutable ways if this property raises any exception, so for
        debugging purposes, it's better to return a default value.

        @return A descriptive string.
        """
        class_name = type(self).__name__
        return 'Class %s fails to implement description().' % class_name
276
277
class Verifier(_DependencyNode):
    """
    Abstract class embodying one verification check.

    A concrete subclass of `Verifier` provides a simple check that can
    determine a host's fitness for testing.  Failure indicates that the
    check found a problem that can cause at least one test to fail.

    `Verifier` objects are organized in a DAG identifying dependencies
    among operations.  The DAG controls ordering and prevents wasted
    effort:  If verification operation V2 requires that verification
    operation V1 pass, then a) V1 will run before V2, and b) if V1
    fails, V2 won't run at all.  The `_verify_host()` method ensures
    that all dependencies run and pass before invoking the `verify()`
    method.

    A `Verifier` object caches the outcome of its own `verify()` call.
    Later calls to `_verify_host()` reuse the cached outcome without
    re-running the check code.  The `_reverify()` method clears the
    cached result in the current node and in its dependencies.

    Subclasses must supply these properties and methods:
      * `verify()`: This is the method to perform the actual
        verification check.
      * `description`:  A one-line summary of the verification check for
        debug log messages.

    Subclasses must override all of the above attributes; subclasses
    should not override or extend any other attributes of this class.

    The description string should be a simple sentence explaining what
    must be true for the verifier to pass.  Do not include a terminating
    period.  For example:

        Host is available via ssh

    The base class manages the following private data:
      * `_result`:  The cached result of verification.
      * `_dependency_list`:  The list of dependencies.
    Subclasses should not use these attributes.

    @property _result           Cached result of verification:  `None`
                                when nothing is cached, `True` after
                                `verify()` passed, the raised exception
                                after `verify()` failed, and `False`
                                when verification started but `verify()`
                                has not completed.
    """

    def __init__(self, tag, dependencies):
        """
        Constructor for `Verifier`.

        @param tag           Short identifier for this verifier.
        @param dependencies  Verifiers that must pass before this one
                             runs.
        """
        super(Verifier, self).__init__(tag, 'verify', dependencies)
        self._result = None

    def _reverify(self):
        """
        Discard cached verification results.

        If this node holds a cached result, clear it, and then do the
        same for each of its dependencies.  A node with nothing cached
        is left alone, and its dependencies are not visited.
        """
        if self._result is None:
            return
        self._result = None
        for dependency in self._dependency_list:
            dependency._reverify()

    def _verify_host(self, host, silent):
        """
        Determine the result of verification, and log results.

        If `verify()` has already passed or failed for this verifier,
        reproduce that outcome without logging anything:  return for a
        cached success, or re-raise the cached exception.

        Otherwise, check the dependencies, and if they all pass, run
        `verify()`.  Informational messages are logged regarding failed
        dependencies.  When `verify()` is called, its outcome is logged
        in `status.log` and cached.

        @param host     The host to be tested for a problem.
        @param silent   If true, don't log host status records.
        """
        cached = self._result
        if isinstance(cached, Exception):
            raise cached    # cached failure
        if cached:
            return          # cached success
        # Mark verification as started.  If a dependency fails below,
        # the resulting exception leaves this marker in place.
        self._result = False
        self._verify_dependencies(host, silent)
        logging.info('Verifying this condition: %s', self.description)
        try:
            self.verify(host)
            self._record_good(host, silent)
        except Exception as failure:
            logging.exception('Failed: %s', self.description)
            self._result = failure
            self._record_fail(host, silent, failure)
            raise
        else:
            self._result = True

    def verify(self, host):
        """
        Unconditionally perform a verification check.

        This method is responsible for testing for a single problem on a
        host.  Implementations should follow these guidelines:
          * The check should find a problem that will cause testing to
            fail.
          * Verification checks on a working system should run quickly
            and should be optimized for success; a check that passes
            should finish within seconds.
          * Verification checks are not expected to have side effects,
            but may apply trivial fixes if they will finish within the
            time constraints above.

        A verification check should normally trigger a single set of
        repair actions.  If two different failures can require two
        different repairs, ideally they should use two different
        subclasses of `Verifier`.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method finds no problems, it returns without raising any
        exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be tested for a problem.
        """
        class_name = type(self).__name__
        raise NotImplementedError(
                'Class %s does not implement verify()' % class_name)
407
408
class RepairAction(_DependencyNode):
    """
    Abstract class embodying one repair procedure.

    A `RepairAction` is responsible for fixing one or more failed
    `Verifier` checks, in order to make those checks pass.

    Each repair action includes one or more verifier triggers that
    determine when the repair action should run.  A repair action
    will call its `repair()` method if one or more of its triggers
    fails.  A repair action is successful if all of its triggers pass
    after calling `repair()`.

    A `RepairAction` is a subclass of `_DependencyNode`; if any of a
    repair action's dependencies fail, the action does not check its
    triggers, and doesn't call `repair()`.

    Subclasses must supply these attributes:
      * `repair()`: This is the method to perform the necessary
        repair.  The method should avoid most logging actions, but
        can log DEBUG level messages if they provide significant
        information for diagnosing failures.
      * `description`:  A one-line summary of the repair action for
        debug log messages.

    Subclasses must override both of the above attributes and should
    not override any other attributes of this class.

    The description string should be a simple sentence explaining the
    operation that will be performed.  Do not include a terminating
    period.  For example:

        Re-install the stable build via AU

    @property _trigger_list   List of verification checks that will
                              trigger this repair when they fail.
    @property host_class      A string identifier that will be
                              used as a field to send repair metrics.
    @property status          Outcome of the most recent call to
                              `_repair_host()`.  The attribute is not
                              set until `_repair_host()` has been
                              called.  Values assigned in this class
                              are 'blocked', 'skipped', 'repaired',
                              'repair_failure', 'verify_failure', and
                              'unknown'.
    """

    def __init__(self, tag, dependencies, triggers, host_class):
        """
        Constructor for `RepairAction`.

        @param tag           Short identifier for this repair action.
        @param dependencies  Verifiers that must pass before this
                             action may run.
        @param triggers      Verifiers that trigger this action when
                             they fail.
        @param host_class    String identifier sent as a field with
                             repair metrics.
        """
        super(RepairAction, self).__init__(tag, 'repair', dependencies)
        self._trigger_list = triggers
        self._failure_modes_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_modes')
        self._failure_detail_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_detail')
        self.host_class = host_class

    def _record_start(self, host, silent):
        """Log a 'START' status line.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'START')

    def _record_end_good(self, host, silent):
        """Log an 'END GOOD' status line, and set status to 'repaired'.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        """
        self._record(host, silent, 'END GOOD')
        self.status = 'repaired'

    def _record_end_fail(self, host, silent, status, *args):
        """Log an 'END FAIL' status line, and set `self.status`.

        @param host         Host which will record the status record.
        @param silent       Don't record the event if this is a true
                            value.
        @param status       Value to be assigned to `self.status`.
        @param args         Extra arguments to `self._record()`
        """
        self._record(host, silent, 'END FAIL', *args)
        self.status = status

    def _send_failure_metrics(self, host, error, stage):
        """Send failure mode metrics to monarch

        For an `AutoservVerifyDependencyError`, the failure modes
        counter is incremented once for each failure it carries, with
        the tag of the failed verifier.  For any other exception, the
        counter is incremented once with a verifier tag of 'unknown'.
        If `stage` is 'repair', failure detail is sent as well.

        N.B. Inside this method the `error` parameter hides the `error`
        module imported by this file.

        @param host         Host which this RepairAction targeted to.
        @param error        An exception that caught in _repair_host.
        @param stage        In which stage we caught above exception.
                            Can be one of below value:
                                'dep'    during verify dependencies
                                'pre'    during pre-repair trigger verification
                                'repair' during repair() process itself
                                'post'   during post-repair trigger verification
        """

        def get_fields(vf_tag):
            # Metric fields shared by every increment; only the
            # verifier tag varies.
            fields = {
                'ra_tag': self.tag,
                'vf_tag': vf_tag,
                'hostname': _filter_metrics_hostname(host),
                'stage': stage,
                'host_class': self.host_class
            }
            return fields

        if isinstance(error, AutoservVerifyDependencyError):
            # We'll catch all failure tags here for a dependencies error
            for f in error.failures:
                self._failure_modes_counter.increment(fields=get_fields(f.tag))
        else:
            # When there is failure during repair or unknown failure. there
            # will be no Verifier, so vf_tag set to 'unknown'.
            self._failure_modes_counter.increment(fields=get_fields('unknown'))

        if stage == 'repair':
            self._send_failure_detail(error)

    def _send_failure_detail(self, error):
        """Send reason of failure inside repair() to monarch.

        The failure tag is `error.tag` for an `AutoservRepairError`,
        and 'unknown' for any other exception.

        @param error    The exception caught inside repair().
        """
        tag = error.tag if isinstance(error, AutoservRepairError) else 'unknown'
        fields = {'repair_action_tag': self.tag, 'repair_failure_tag': tag}
        self._failure_detail_counter.increment(fields=fields)

    def _repair_host(self, host, silent):
        """
        Apply this repair action if any triggers fail.

        Repair is triggered when all dependencies are successful, and at
        least one trigger fails.

        If the `repair()` method triggers, the success or failure of
        this operation is logged in `status.log` bracketed by 'START'
        and 'END' records.  Details of whether or why `repair()`
        triggered are written to the debug logs.   If repair doesn't
        trigger, nothing is logged to `status.log`.

        On return or exception, `self.status` describes the outcome:
          * 'blocked':  a dependency failed.
          * 'skipped':  no trigger failed, so `repair()` wasn't called.
          * 'repaired':  `repair()` ran, and all triggers then passed.
          * 'repair_failure':  `repair()` raised an exception.
          * 'verify_failure':  `repair()` returned, but at least one
            trigger still fails.
          * 'unknown':  an unexpected internal error occurred.

        @param host     The host to be repaired.
        @param silent   If true, don't log host status records.

        @raises AutoservRepairError   Raised when `repair()` reported
                        success, but triggers still fail afterward.
        @raises Exception   Any exception raised by a failed dependency
                        check or by `repair()` is passed on to the
                        caller.
        """
        # Note:  Every exit path from the method must set `self.status`.
        # There's a lot of exit paths, so be careful.
        #
        # N.B. Each handler below binds its exception to a distinct
        # name.  The handlers are nested, so reusing one name makes it
        # easy to report a stale exception from an enclosing handler.
        #
        # If we're blocked by a failed dependency, we exit with an
        # exception.  So set status to 'blocked' first.
        self.status = 'blocked'
        try:
            self._verify_dependencies(host, silent)
        except Exception as dep_error:
            self._send_failure_metrics(host, dep_error, 'dep')
            raise
        # This is a defensive action.  Every path below should overwrite
        # this setting, but if it doesn't, we want our status to reflect
        # a coding error.
        self.status = 'unknown'
        try:
            self._verify_list(host, self._trigger_list, silent)
        except AutoservVerifyDependencyError as trigger_error:
            trigger_error.log_dependencies(
                    'Attempting this repair action',
                    'Repairing because these triggers failed')
            self._send_failure_metrics(host, trigger_error, 'pre')
            self._record_start(host, silent)
            try:
                self.repair(host)
            except Exception as repair_error:
                logging.exception('Repair failed: %s', self.description)
                self._record_fail(host, silent, repair_error)
                self._record_end_fail(host, silent, 'repair_failure')
                self._send_failure_metrics(host, repair_error, 'repair')
                raise
            try:
                # Discard cached results, so that the triggers are
                # actually re-tested after the repair.
                for v in self._trigger_list:
                    v._reverify()
                self._verify_list(host, self._trigger_list, silent)
                self._record_end_good(host, silent)
            except AutoservVerifyDependencyError as post_error:
                post_error.log_dependencies(
                        'This repair action reported success',
                        'However, these triggers still fail')
                self._record_end_fail(host, silent, 'verify_failure')
                self._send_failure_metrics(host, post_error, 'post')
                raise AutoservRepairError(
                        'Some verification checks still fail', 'post_verify')
            except Exception as internal_error:
                # The specification for `self._verify_list()` says
                # that this can't happen; this is a defensive
                # precaution.
                #
                # Report the exception caught here, not the pre-repair
                # trigger failure that is still in scope from the
                # enclosing handler.
                self._record_end_fail(host, silent, 'unknown',
                                      'Internal error in repair')
                self._send_failure_metrics(host, internal_error, 'post')
                raise
        else:
            self.status = 'skipped'
            logging.info('No failed triggers, skipping repair:  %s',
                         self.description)

    def repair(self, host):
        """
        Apply this repair action to the given host.

        This method is responsible for applying changes to fix failures
        in one or more verification checks.  The repair is considered
        successful if the DUT passes the specific checks after this
        method completes.

        Implementations indicate failure by raising an exception.  The
        exception text should be a short, 1-line summary of the error.
        The text should be concise and diagnostic, as it will appear in
        `status.log` files.

        If this method completes successfully, it returns without
        raising any exception.

        Implementations should avoid most logging actions, but can log
        DEBUG level messages if they provide significant information for
        diagnosing failures.

        @param host   The host to be repaired.
        """
        raise NotImplementedError('Class %s does not implement '
                                  'repair()' % type(self).__name__)
630
631
class _RootVerifier(Verifier):
    """
    Utility class used by `RepairStrategy`.

    A node of this class has no check of its own; whenever it is able
    to run, it passes.  The class exists merely to be the root of a
    DAG of dependencies in an instance of `RepairStrategy`.
    """

    def verify(self, host):
        """Pass unconditionally.

        @param host   The host being verified; not used.
        """

    @property
    def description(self):
        """Summary of the condition represented by the root node."""
        return 'All host verification checks pass'
647
648
649class RepairStrategy(object):
650    """
651    A class for organizing `Verifier` and `RepairAction` objects.
652
653    An instance of `RepairStrategy` is organized as a DAG of `Verifier`
654    objects, plus a list of `RepairAction` objects.  The class provides
655    methods for invoking those objects in the required order, when
656    needed:
657      * The `verify()` method walks the verifier DAG in dependency
658        order.
659      * The `repair()` method invokes the repair actions in list order.
660        Each repair action will invoke its dependencies and triggers as
661        needed.
662
663    # The Verifier DAG
    The verifier DAG is constructed from the first argument passed to
    the `RepairStrategy` constructor.  That argument is an
666    iterable consisting of three-element tuples in the form
667    `(constructor, tag, deps)`:
668      * The `constructor` value is a callable that creates a `Verifier`
669        as for the interface of the class constructor.  For classes
670        that inherit the default constructor from `Verifier`, this can
671        be the class itself.
672      * The `tag` value is the tag to be associated with the constructed
673        verifier.
674      * The `deps` value is an iterable (e.g. list or tuple) of strings.
675        Each string corresponds to the `tag` member of a `Verifier`
676        dependency.
677
678    The tag names of verifiers in the constructed DAG must all be
679    unique.  The tag name defined by `RepairStrategy.ROOT_TAG` is
680    reserved and may not be used by any verifier.
681
682    In the input data for the constructor, dependencies must appear
683    before the nodes that depend on them.  Thus:
684
685        ((A, 'a', ()), (B, 'b', ('a',)))     # This is valid
686        ((B, 'b', ('a',)), (A, 'a', ()))     # This will fail!
687
    Internally, the DAG of verifiers is given a unique root node.  So,
689    given this input:
690
691        ((C, 'c', ()),
692         (A, 'a', ('c',)),
693         (B, 'b', ('c',)))
694
695    The following DAG is constructed:
696
697          Root
698          /  \
699         A    B
700          \  /
701           C
702
703    Since nothing depends on `A` or `B`, the root node guarantees that
704    these two verifiers will both be called and properly logged.
705
706    The root node is not directly accessible; however repair actions can
707    trigger on it by using `RepairStrategy.ROOT_TAG`.  Additionally, the
708    node will be logged in `status.log` whenever `verify()` succeeds.
709
710    # The Repair Actions List
    The list of repair actions is constructed from the second argument
    passed to the `RepairStrategy` constructor.  That
713    argument is an iterable consisting of four-element tuples in the
714    form `(constructor, tag, deps, triggers)`:
715      * The `constructor` value is a callable that creates a
716        `RepairAction` as for the interface of the class constructor.
717        For classes that inherit the default constructor from
718        `RepairAction`, this can be the class itself.
719      * The `tag` value is the tag to be associated with the constructed
720        repair action.
721      * The `deps` value is an iterable (e.g. list or tuple) of strings.
722        Each string corresponds to the `tag` member of a `Verifier` that
723        the repair action depends on.
724      * The `triggers` value is an iterable (e.g. list or tuple) of
725        strings.  Each string corresponds to the `tag` member of a
726        `Verifier` that can trigger the repair action.
727
728    `RepairStrategy` deps and triggers can only refer to verifiers,
729    not to other repair actions.
730    """
731
732    # This name is reserved; clients may not use it.
733    ROOT_TAG = 'PASS'
734
735    @staticmethod
736    def _add_verifier(verifiers, constructor, tag, dep_tags):
737        """
738        Construct and remember a verifier.
739
740        Create a `Verifier` using `constructor` and `tag`.  Dependencies
741        for construction are found by looking up `dep_tags` in the
742        `verifiers` dictionary.
743
744        After construction, the new verifier is added to `verifiers`.
745
746        @param verifiers    Dictionary of verifiers, indexed by tag.
747        @param constructor  Verifier construction function.
748        @param tag          Tag parameter for the construction function.
749        @param dep_tags     Tags of dependencies for the constructor, to
750                            be found in `verifiers`.
751        """
752        assert tag not in verifiers
753        deps = [verifiers[d] for d in dep_tags]
754        verifiers[tag] = constructor(tag, deps)
755
756    def __init__(self, verifier_data, repair_data, host_class):
757        """
758        Construct a `RepairStrategy` from simplified DAG data.
759
760        The input `verifier_data` object describes how to construct
761        verify nodes and the dependencies that relate them, as detailed
762        above.
763
764        The input `repair_data` object describes how to construct repair
765        actions and their dependencies and triggers, as detailed above.
766
767        @param verifier_data  Iterable value with constructors for the
768                              elements of the verification DAG and their
769                              dependencies.
770        @param repair_data    Iterable value with constructors for the
771                              elements of the repair action list, and
772                              their dependencies and triggers.
773        @property host_class  A string identifier that identify what
774                              class of host this repair strategy target
775                              on, will be used as a field to send repair
776                              metrics.
777        """
778        # Metrics - we report on 'actions' for every repair action
779        # we execute; we report on 'strategy' for every complete
780        # repair operation.
781        self._strategy_counter = metrics.Counter(
782            'chromeos/autotest/repair/repair_strategy_v2')
783        self._actions_counter = metrics.Counter(
784            'chromeos/autotest/repair/repair_actions')
785        self.host_class = host_class
786        # We use the `all_verifiers` list to guarantee that our root
787        # verifier will execute its dependencies in the order provided
788        # to us by our caller.
789        verifier_map = {}
790        all_tags = []
791        dependencies = set()
792        for constructor, tag, deps in verifier_data:
793            self._add_verifier(verifier_map, constructor, tag, deps)
794            dependencies.update(deps)
795            all_tags.append(tag)
796        # Capture all the verifiers that have nothing depending on them.
797        root_tags = [t for t in all_tags if t not in dependencies]
798        self._add_verifier(verifier_map, _RootVerifier,
799                           self.ROOT_TAG, root_tags)
800        self._verify_root = verifier_map[self.ROOT_TAG]
801        self._repair_actions = []
802        for constructor, tag, deps, triggers in repair_data:
803            r = constructor(tag,
804                            [verifier_map[d] for d in deps],
805                            [verifier_map[t] for t in triggers],
806                            self.host_class)
807            self._repair_actions.append(r)
808
809    def _send_strategy_metrics(self, host, result):
810        """Send repair strategy metrics to monarch
811
812        @param host     The target to be repaired.
813        @param result   A String that describe a final result for the
814                        RepairStrategy.
815        """
816        info = host.host_info_store.get()
817        board = info.board if info.board else 'unknown'
818        model = info.model if info.model else 'unknown'
819        fields = {
820            'board': board,
821            'host_class': self.host_class,
822            'hostname': _filter_metrics_hostname(host),
823            'model': model,
824            'result': result,
825        }
826        self._strategy_counter.increment(fields=fields)
827
828    def _send_action_metrics(self, host, ra):
829        """Send repair action metrics to monarch
830
831        @param host     The target to be repaired.
832        @param ra       an RepairAction instance.
833        """
834        fields = {
835            'tag': ra.tag,
836            'status': ra.status,
837            'hostname': _filter_metrics_hostname(host),
838            'host_class': self.host_class
839        }
840        self._actions_counter.increment(fields=fields)
841
842    def verify(self, host, silent=False):
843        """
844        Run the verifier DAG on the given host.
845
846        @param host     The target to be verified.
847        @param silent   If true, don't log host status records.
848        """
849        self._verify_root._reverify()
850        self._verify_root._verify_host(host, silent)
851
852    def repair(self, host, silent=False):
853        """
854        Run the repair list on the given host.
855
856        @param host     The target to be repaired.
857        @param silent   If true, don't log host status records.
858        """
859        self._verify_root._reverify()
860        attempted = False
861        for ra in self._repair_actions:
862            try:
863                ra._repair_host(host, silent)
864            except Exception as e:
865                # all logging and exception handling was done at
866                # lower levels
867                pass
868            finally:
869                self._send_action_metrics(host, ra)
870                if ra.status not in ('skipped', 'blocked'):
871                    attempted = True
872
873        result = 'failure'
874        try:
875            self._verify_root._verify_host(host, silent)
876            result = 'success' if attempted else 'not_attempted'
877        except:
878            if not attempted:
879                result = 'attempt_blocked'
880            raise
881        finally:
882            self._send_strategy_metrics(host, result)
883
884
def _filter_metrics_hostname(host):
    """
       Restrict the format of hostnames we'll send to monarch.

       The host's name is reported as-is only when it matches
       `_HOSTNAME_PATTERN` at its start; any other name is replaced
       by the `_DISALLOWED_HOSTNAME` placeholder.

       @param host    A host instance (i.e. ServoHost, CrosHost).

       @return The hostname, or the placeholder for disallowed names.
    """
    hostname = host.hostname
    if re.match(_HOSTNAME_PATTERN, hostname) is None:
        return _DISALLOWED_HOSTNAME
    return hostname
895
896