1#!/usr/bin/env python2
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Adjust pool balances to cover DUT shortfalls.
7
8This command takes all broken DUTs in a specific pool for specific
9models and swaps them with working DUTs taken from a selected pool
10of spares.  The command is meant primarily for replacing broken DUTs
11in critical pools like BVT or CQ, but it can also be used to adjust
12pool sizes, or to create or remove pools.
13
14usage:  balance_pool.py [ options ] POOL MODEL [ MODEL ... ]
15
16positional arguments:
17  POOL                  Name of the pool to balance
18  MODEL                 Names of models to balance
19
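optional arguments:
  -h, --help            show this help message and exit
  -w WEB, --web WEB     AFE host to use. Default comes from shadow_config.
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every MODEL
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        MODEL
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every MODEL
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  -p PHASE, --phase PHASE
                        Phase to restrict the balance pool operation to
  --sku SKU             The specific SKU we intend to swap with
  -n, --dry-run         Report actions to take in the form of shell commands
  -v, --verbose         Print more detail about calculations for debug
                        purposes
  -m COUNT, --max-broken COUNT
                        Only rebalance a pool if it has at most COUNT broken
                        DUTs
  -f, --force-rebalance
                        Forcefully rebalance all DUTs in a pool, even if it
                        has a large number of broken DUTs
  --production          Treat this as a production run. This will collect
                        metrics
  --all-models          Rebalance all managed models
  --max-broken-models COUNT
                        Only rebalance all models if number of models with
                        broken DUTs in the specified pool is less than COUNT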
38
39
40The command attempts to remove all broken DUTs from the target POOL
41for every MODEL, and replace them with enough working DUTs taken
42from the spare pool to bring the strength of POOL to the requested
43total COUNT.
44
45If no COUNT options are supplied (i.e. there are no --total, --grow,
46or --shrink options), the command will maintain the current totals of
47DUTs for every MODEL in the target POOL.
48
49If not enough working spares are available, broken DUTs may be left
50in the pool to keep the pool at the target COUNT.
51
52When reducing pool size, working DUTs will be returned after broken
53DUTs, if it's necessary to achieve the target COUNT.
54
55"""
56
57
58import argparse
59import os
60import re
61import sys
62import time
63
64import common
65from autotest_lib.server import constants
66from autotest_lib.server import site_utils
67from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
68from autotest_lib.server.lib import status_history
69from autotest_lib.site_utils import lab_inventory
70from autotest_lib.utils import labellib
71from chromite.lib import metrics
72from chromite.lib import parallel
73
74#This must be imported after chromite.lib.metrics
75from infra_libs import ts_mon
76
# Prefix that marks an AFE label as a pool label.  It is used both to
# build a pool's own label and to find the pool labels on a host.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all models we should calculate the default max
# number of broken models against.  It seemed like the best choice that
# was neither too strict nor lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

# Special value of the POOL argument that selects every pool listed in
# `lab_inventory.CRITICAL_POOLS` instead of a single named pool.
_ALL_CRITICAL_POOLS = 'all_critical_pools'
# Default value of the --spare option.
_SPARE_DEFAULT = lab_inventory.SPARE_POOL
85
86
# _VALID_POOL_PATTERN - Regular expression matching pool names that will
# be accepted on the command line.
#
# Note: This pattern was selected merely to recognize all existing pool
# names; there's no underlying technical restriction motivating this
# pattern.  No reasonable request to add more special characters to the
# allowed set should be refused.
#
# The upper-case range must be spelled `A-Z`.  The range `A-z` also
# spans the ASCII punctuation that sits between 'Z' and 'a' (the
# characters [ \ ] ^ _ and backtick), which would let names such as
# "a[b]" through.

_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-]+$')
96
97
def _log_message(message, *args):
    """Write one unadorned line to stdout.

    When format arguments are given, `message` is treated as a
    %-style format string and expanded with them; otherwise it is
    written exactly as supplied.  A newline is always appended.

    @param message  Text (or format string) to be logged.
    @param args     Optional format arguments for `message`.

    """
    line = message % args if args else message
    sys.stdout.write('%s\n' % line)
115
116
def _log_info(dry_run, message, *args):
    """Write one informational line to stdout, dry-run aware.

    In a dry run the output is meant to be usable as a shell script,
    so the line is emitted as a shell comment (prefixed with '# ').
    Otherwise the line is emitted unchanged.

    When format arguments are given, `message` is expanded with
    them before being written.

    @param dry_run  Whether the message is being logged for a dry run.
    @param message  Text (or format string) to be logged.
    @param args     Optional format arguments for `message`.

    """
    line = ('# ' + message) if dry_run else message
    _log_message(line, *args)
135
136
def _log_error(message, *args):
    """Write one line to stderr, marked as an error.

    The line is prefixed with 'ERROR: ' and terminated with a
    newline.  When format arguments are given, `message` is treated
    as a %-style format string and expanded with them first.

    @param message  Text (or format string) to be logged.
    @param args     Optional format arguments for `message`.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
154
155
class _DUTPool(object):
    """Inventory of the DUTs in one pool that match a set of labels.

    On construction, the DUTs in the pool that match the labels are
    looked up and sorted into three lists:
      + Working - the last diagnosis says the DUT is working, and the
            DUT is not locked.
      + Broken - any other DUT that carries exactly one pool: label.
      + Ineligible - the DUT may not be moved out of this pool.  The
            DUT may be either working or broken.

    A DUT is ineligible when the number of pool: labels it carries is
    not exactly one.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    The same class describes both the main pool that needs working
    DUTs and the spare pool that supplies them.

    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property labels              Labels that constrain the DUTs to consider.
    @property working_hosts       The list of this pool's working DUTs.
    @property broken_hosts        The list of this pool's broken DUTs.
    @property ineligible_hosts    The list of this pool's ineligible DUTs.
    @property pool_labels         A list of labels that identify a DUT as part
                                  of this pool.
    @property total_hosts         The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        """Look up and classify the DUTs of `pool` matching `labels`.

        @param afe         AFE object used to query host histories.
        @param pool        Name of the pool (without the pool prefix).
        @param labels      Labels restricting which DUTs are considered.
        @param start_time  Start of the history window to examine.
        @param end_time    End of the history window to examine.

        """
        self.pool = pool
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + pool]

        # The three category lists must exist before _get_hosts()
        # runs, because it fills them in.
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Fill in the host category lists; return the host count."""
        histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in histories:
            dut = history.host
            pool_label_count = sum(
                    1 for label in dut.labels
                    if label.startswith(_POOL_PREFIX))
            if pool_label_count != 1:
                self.ineligible_hosts.append(dut)
                continue
            diagnosis = history.last_diagnosis()[0]
            is_working = (diagnosis == status_history.WORKING and
                          not dut.locked)
            if is_working:
                self.working_hosts.append(dut)
            else:
                self.broken_hosts.append(dut)
        return len(histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        These are the labels to remove in order to take a DUT out of
        the pool, or to add in order to put a DUT into it.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed for the pool to
        reach `target_total` DUTs with every DUT working.

        A positive result is the number of spares needed to replace
        broken DUTs and reach the target.  A negative result means
        no spares are needed, and that many working DUTs can be
        returned.

        Ineligible DUTs can never be exchanged, so they set a floor
        on the pool size.  If `target_total` is below that floor, an
        error is logged and the floor is used as the target instead.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        min_total = len(self.ineligible_hosts)
        target_is_feasible = target_total >= min_total
        model = self.labels.get('model', '')
        exhausted_metric = metrics.Boolean(
            'chromeos/autotest/balance_pools/exhausted_pools',
            'True for each pool/model which requests more DUTs than supplied',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need
            # to leave it here until we do the extra work Monarch
            # requires to delete a field.
            field_spec=[
                    ts_mon.StringField('pool'),
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
            ])
        exhausted_metric.set(
                not target_is_feasible,
                fields={
                        'pool': self.pool,
                        'board': model,
                        'model': model,
                },
        )
        if target_is_feasible:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         model, self.pool, target_total)
        else:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, min_total,
            )
            _log_error('Adjusting target to %d DUTs.', min_total)
            target_total = min_total
        size_change = target_total - self.total_hosts
        return len(self.broken_hosts) + size_change

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return the DUTs that should leave this pool in order to
        reduce its supply.  Broken DUTs are given up before working
        ones.

        `num_broken` is the number of broken DUTs that are to stay
        in the pool.  If it is at least the number of broken DUTs
        actually present, the result is empty.  If it is negative,
        its magnitude is the number of working DUTs to give up in
        addition to every broken one.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken < 0:
            extra_working = self.working_hosts[:-num_broken]
            return self.broken_hosts + extra_working
        return self.broken_hosts[num_broken:]
314
315
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    Each host has the pool labels of `spare_pool` removed and the
    pool labels of `target_pool` added; that is, the hosts move out
    of `spare_pool` and into `target_pool`.

    The number of hosts is logged and added to the `duts_moved`
    counter before anything else, even when the list is empty.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object to which the hosts
                         will be added.
    @param spare_pool    The `_DUTPool` object from which the hosts
                         are drawn.

    """
    num_hosts = len(hosts)
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              num_hosts, spare_pool.pool, target_pool.pool)
    model = target_pool.labels.get('model', '')
    moved_counter = metrics.Counter(
        'chromeos/autotest/balance_pools/duts_moved',
        'DUTs transferred between pools',
        # TODO(jrbarnette) The 'board' field is a legacy.  We need to
        # leave it here until we do the extra work Monarch requires to
        # delete a field.
        field_spec=[
                ts_mon.StringField('board'),
                ts_mon.StringField('model'),
                ts_mon.StringField('source_pool'),
                ts_mon.StringField('target_pool'),
        ]
    )
    moved_counter.increment_by(
            num_hosts,
            fields={
                    'board': model,
                    'model': model,
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    if dry_run:
        # Only print the equivalent shell commands.
        removal_args = ' '.join(removals)
        addition_args = ' '.join(additions)
        for host in hosts:
            _log_message('atest label remove -m %s %s',
                         host.hostname, removal_args)
            _log_message('atest label add -m %s %s',
                         host.hostname, addition_args)
        return

    for host in hosts:
        _log_message('Updating host: %s.', host.hostname)
        host.remove_labels(removals)
        host.add_labels(additions)
372
373
def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    Takes inventory of the main pool and the spare pool for the
    given labels, works out how many working spares should move into
    the main pool and which DUTs should move out of it, logs the
    plan, and then exchanges the pool labels (or, in a dry run,
    prints the commands that would do so).

    If the main pool has more than `arguments.max_broken` broken
    DUTs and `arguments.force_rebalance` is not set, an error is
    logged and no DUTs are exchanged.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    # With no COUNT option the pool keeps its current size.
    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Number of wanted spares the spare pool could not supply; that
        # many broken DUTs have to stay in the main pool.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: the magnitude is the number of working DUTs
        # to hand back in addition to the broken ones.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts), len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    # Safety valve: a pool with many broken DUTs is left alone unless
    # the caller explicitly forces the rebalance.
    if (len(main_pool.broken_hosts) > arguments.max_broken and
        not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        spare_duts = []
        surplus_duts = []

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(arguments.dry_run, 'No exchange required.')

    # Both exchanges run even when their lists are empty, so that
    # _exchange_labels() still logs and reports a transfer of 0 DUTs.
    # Surplus DUTs go from the main pool to the spare pool; spares go
    # the other way.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)
470
471
def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    After parsing, the arguments are cross-checked.  Any of the
    following is reported with `parser.error()`, which prints a usage
    message and exits:
      + MODEL names given together with --all-models.
      + A non-default --spare pool when POOL selects all critical
        pools.
      + A POOL or --spare name not matching `_VALID_POOL_PATTERN`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    # At most one of --total, --grow and --shrink may be given.
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models.  This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is less than COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance.  Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('-p', '--phase', metavar='PHASE',
                        help='Optional phase label to restrict balance '
                        'operation to.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    # This check fires when the spare pool is NOT the default, so the
    # message names the default as the only acceptable choice.
    if (arguments.pool == _ALL_CRITICAL_POOLS and
        arguments.spare != _SPARE_DEFAULT):
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for p in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(p):
            parser.error('Invalid pool name: %s' % p)
    return arguments
574
575
def _build_target_labels(model, sku, phase):
    """Build the label list that selects the DUTs of one model.

    @param model  Model name; always part of the result.
    @param sku    SKU name, or a false value for no SKU restriction.
    @param phase  Phase name, or a false value for no phase restriction.

    @return The result of `LabelsMapping.getlabels()` for these keys.

    """
    labels = labellib.LabelsMapping()
    labels['model'] = model
    if sku:
        labels['sku'] = sku
    if phase:
        labels['phase'] = phase
    return labels.getlabels()


def infer_balancer_targets(afe, arguments, pools):
    """Take some arguments and translate them to a list of models to balance

    With `arguments.all_models`, every model that the lab inventory
    reports for a pool is targeted, and `arguments.sku` is not
    applied.  Otherwise the models named in `arguments.models` are
    targeted for every pool, restricted by `arguments.sku` if given.
    In both cases `arguments.phase`, if given, restricts the targets.

    The lab inventory is fetched at most once and shared by all
    pools, rather than being fetched again for each pool.

    Args:
    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced

    """
    balancer_targets = []
    inventory = None

    for pool in pools:
        if arguments.all_models:
            # Fetched lazily so that nothing is queried when there is
            # no pool to balance.
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            models = inventory.get_pool_models(pool)
            sku = None
        else:
            models = arguments.models
            sku = arguments.sku
        for model in models:
            balancer_targets.append(
                    (pool,
                     _build_target_labels(model, sku, arguments.phase)))
    return balancer_targets
608
609
def main(argv):
    """Standard main routine.

    Parses the command line, works out the (pool, labels) targets
    to balance, and balances them in a pool of worker processes.
    Metrics are only set up for real when --production is given.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    if not arguments.production:
        metrics_manager = site_utils.TrivialContextManager()
    else:
        metrics_manager = site_utils.SetupTsMonGlobalState(
                'balance_pools', indirect=True)

    run_counter_name = 'chromeos/autotest/balance_pools/runs'
    with metrics_manager, metrics.SuccessCounter(run_counter_name):
        # DUT status is judged over the 24 hours leading up to now.
        history_window_secs = 24 * 60 * 60
        end_time = time.time()
        start_time = end_time - history_window_secs
        afe = frontend_wrappers.RetryingAFE(server=arguments.web)

        def balancer(pool, labels):
            """Balance the specified model.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels,
                           start_time, end_time)
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]
        balancer_targets = infer_balancer_targets(afe, arguments, pools)
        try:
            parallel.RunTasksInProcessPool(
                    balancer,
                    balancer_targets,
                    processes=8,
            )
        except KeyboardInterrupt:
            # An interrupt from the keyboard ends the run quietly.
            pass
652
653
# Script entry point: hand the full argument vector, including the
# program name in argv[0], to main().
if __name__ == '__main__':
    main(sys.argv)
656