1#!/usr/bin/env python
2# Copyright 2015 The Chromium OS Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Adjust pool balances to cover DUT shortfalls.
7
8This command takes all broken DUTs in a specific pool for specific
9boards and swaps them with working DUTs taken from a selected pool
10of spares.  The command is meant primarily for replacing broken DUTs
11in critical pools like BVT or CQ, but it can also be used to adjust
12pool sizes, or to create or remove pools.
13
14usage:  balance_pool.py [ options ] POOL BOARD [ BOARD ... ]
15
16positional arguments:
17  POOL                  Name of the pool to balance
18  BOARD                 Names of boards to balance
19
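optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every BOARD
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        BOARD
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every BOARD
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  -n, --dry-run         Report actions to take in the form of shell commands
  -v, --verbose         Print more detail about calculations for debug
                        purposes
  -m COUNT, --max-broken COUNT
                        Only rebalance a pool if it has at most COUNT broken
                        DUTs (default: 2)
  -f, --force-rebalance
                        Rebalance a pool even if it has more than the maximum
                        number of broken DUTs
  --all-boards          Rebalance all boards; may not be combined with
                        explicit BOARD arguments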
35
36
37The command attempts to remove all broken DUTs from the target POOL
38for every BOARD, and replace them with enough working DUTs taken
39from the spare pool to bring the strength of POOL to the requested
40total COUNT.
41
42If no COUNT options are supplied (i.e. there are no --total, --grow,
43or --shrink options), the command will maintain the current totals of
44DUTs for every BOARD in the target POOL.
45
46If not enough working spares are available, broken DUTs may be left
47in the pool to keep the pool at the target COUNT.
48
49When reducing pool size, working DUTs will be returned after broken
50DUTs, if it's necessary to achieve the target COUNT.
51
52If the selected target POOL is for a Freon board, *and* the selected
53spare pool has no DUTs (in any state), *and* the corresponding
54non-Freon spare pool is populated, then the non-Freon pool will
55be used for the Freon board.  A similar rule applies to balancing
56non-Freon boards when there is an available Freon spare pool.
57
58"""
59
60
61import argparse
62import sys
63import time
64
65import common
66from autotest_lib.server import frontend
67from autotest_lib.site_utils import host_label_utils
68from autotest_lib.site_utils import status_history
69from autotest_lib.site_utils.suite_scheduler import constants
70
71from chromite.lib import parallel
72
73
# Label prefixes, taken from the suite scheduler's label constants.
# They are prepended to a pool or board name to form the label that
# marks a DUT as belonging to that pool or board.
_POOL_PREFIX = constants.Labels.POOL_PREFIX
_BOARD_PREFIX = constants.Labels.BOARD_PREFIX

# Trailing tag that identifies a board name as a freon build.
_FREON_BOARD_TAG = 'freon'
78
79
def _log_message(message, *args):
    """Write one unadorned line of text to stdout.

    When format arguments are supplied, `message` is treated as a
    %-style format string and expanded with them before output.
    With no arguments, `message` is written as is.  A trailing
    newline is always appended.

    @param message  Text (or %-style format string) to be logged.
    @param args     Optional format arguments for `message`.

    """
    line = message % args if args else message
    sys.stdout.write('%s\n' % line)
97
98
def _log_info(dry_run, message, *args):
    """Log an informational line, adapted for dry-run output.

    The line is written to stdout via `_log_message()`.  For a dry
    run the text is prefixed with '# ', so that it reads as a shell
    comment alongside the shell commands printed in that mode.
    Otherwise the text is logged without adornment.

    @param dry_run  Whether the logging is for a dry run or for
                    actual execution.
    @param message  Text (or %-style format string) to be logged.
    @param args     Optional format arguments for `message`.

    """
    text = '# ' + message if dry_run else message
    _log_message(text, *args)
117
118
def _log_error(message, *args):
    """Write one line to stderr, flagged as an error.

    When format arguments are supplied, `message` is treated as a
    %-style format string and expanded with them first.  The
    resulting text is prefixed with 'ERROR: ' and terminated with
    a newline.

    @param message  Text (or %-style format string) to be logged.
    @param args     Optional format arguments for `message`.

    """
    text = message % args if args else message
    sys.stderr.write('ERROR: %s\n' % text)
136
137
class _DUTPool(object):
    """Information about a pool of DUTs for a given board.

    This class collects information about all DUTs for a given
    board and pool pair, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from
          this pool.  The DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property board               Name of the board associated with
                                  this pool of DUTs.
    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property working_hosts       The list of this pool's working
                                  DUTs.
    @property broken_hosts        The list of this pool's broken
                                  DUTs.
    @property ineligible_hosts    The list of this pool's ineligible DUTs.
    @property labels              The set of labels that identify a DUT
                                  as part of this pool.
    @property total_hosts         The total number of hosts in pool.

    """


    @staticmethod
    def _get_platform_label(board):
        """Return the platform label associated with `board`.

        When swapping between freon and non-freon boards, the
        platform label must also change (because wmatrix reports
        build results against platform labels, not boards).  So, we
        must be able to get the platform label from the board name.

        For non-freon boards, the platform label is based on a name
        assigned by the firmware, which in some cases is different
        from the board name.  For freon boards, the platform label
        is always the board name.

        @param board The board name to convert to a platform label.
        @return The platform label for the given board name.

        """
        if board.endswith(_FREON_BOARD_TAG):
            return board
        if board.startswith('x86-'):
            return board[len('x86-'):]
        # Boards whose platform name differs from the board name.
        platform_map = {
            'daisy': 'snow',
            'daisy_spring': 'spring',
            'daisy_skate': 'skate',
            'parrot_ivb': 'parrot_2',
            'falco_li': 'falco',
        }
        return platform_map.get(board, board)


    @staticmethod
    def _freon_board_toggle(board):
        """Toggle a board name between freon and non-freon.

        For boards naming a freon build, return the name of the
        associated non-freon board.  For boards naming non-freon
        builds, return the name of the associated freon board.

        @param board The board name to be toggled.
        @return A new board name, toggled for freon.

        """
        if board.endswith(_FREON_BOARD_TAG):
            # The actual board name ends with either "-freon" or
            # "_freon", so we have to strip off one extra character.
            return board[: -len(_FREON_BOARD_TAG) - 1]
        else:
            # The actual board name will end with either "-freon" or
            # "_freon"; we have to figure out which one to use.
            # Names that already contain "_" are joined with "-".
            joiner = '_'
            if joiner in board:
                joiner = '-'
            return joiner.join([board, _FREON_BOARD_TAG])


    def __init__(self, afe, board, pool, start_time, end_time,
                 use_freon=False):
        """Collect and categorize the DUTs for `board` in `pool`.

        @param afe         AFE object used to look up host histories.
        @param board       Name of the board for this pool.
        @param pool        Name of the pool.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.
        @param use_freon   If true, and no DUTs are found for `board`,
                           fall back to the board's freon/non-freon
                           counterpart.

        """
        self.board = board
        self.pool = pool
        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        self.total_hosts = self._get_hosts(
                afe, start_time, end_time, use_freon)
        # The labels must be calculated after the call to
        # `_get_hosts()`, because that call may change `self.board`.
        self.labels = set([_BOARD_PREFIX + self.board,
                           self._get_platform_label(self.board),
                           _POOL_PREFIX + self.pool])


    def _get_hosts(self, afe, start_time, end_time, use_freon):
        """Look up this pool's DUTs and sort them into categories.

        Fills in `working_hosts`, `broken_hosts`, and
        `ineligible_hosts` from the host histories for this board
        and pool.  If no histories are found and `use_freon` is
        true, the lookup is retried with the freon-toggled board
        name; if that finds any histories, `self.board` is changed
        to the toggled name.

        @param afe         AFE object used to look up host histories.
        @param start_time  Start time for the HostJobHistory objects.
        @param end_time    End time for the HostJobHistory objects.
        @param use_freon   Whether to fall back to the freon-toggled
                           board when no DUTs are found.

        @return The total number of DUTs found.

        """
        all_histories = (
            status_history.HostJobHistory.get_multiple_histories(
                    afe, start_time, end_time,
                    board=self.board, pool=self.pool))
        if not all_histories and use_freon:
            alternate_board = self._freon_board_toggle(self.board)
            alternate_histories = (
                status_history.HostJobHistory.get_multiple_histories(
                        afe, start_time, end_time,
                        board=alternate_board, pool=self.pool))
            if alternate_histories:
                self.board = alternate_board
                all_histories = alternate_histories
        for history in all_histories:
            host = history.host
            host_pools = [label for label in host.labels
                          if label.startswith(_POOL_PREFIX)]
            if len(host_pools) != 1:
                # Hosts in more than one pool must not be exchanged.
                self.ineligible_hosts.append(host)
            else:
                diag = history.last_diagnosis()[0]
                if (diag == status_history.WORKING and
                        not host.locked):
                    self.working_hosts.append(host)
                else:
                    self.broken_hosts.append(host)
        return len(all_histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        A new set is returned on every access, so callers are free
        to modify the result without changing this pool's labels.

        @return A set of AFE label names; convert it to a list for
                use with `add_labels()` or `remove_labels()`.

        """
        return set(self.labels)

    def calculate_spares_needed(self, target_total):
        """Calculate the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self.ineligible_hosts)
        if target_total < num_ineligible:
            _log_error('%s %s pool: Target of %d is below '
                       'minimum of %d DUTs.',
                       self.board, self.pool,
                       target_total, num_ineligible)
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        adjustment = target_total - self.total_hosts
        return len(self.broken_hosts) + adjustment

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones; no more than the available
        working DUTs are returned.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken >= 0:
            return self.broken_hosts[num_broken:]
        return self.broken_hosts + self.working_hosts[:-num_broken]
348
349
def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.  Labels
    common to both pools are left untouched.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    The label sets of the two pool objects are not modified.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object to which the hosts
                         will be added.
    @param spare_pool    The `_DUTPool` object from which the hosts
                         are drawn.

    """
    if not hosts:
        return
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    # Work on private copies, and use non-mutating set difference:
    # the pools' own label sets must survive for later exchanges.
    target_labels = set(target_pool.pool_labels)
    spare_labels = set(spare_pool.pool_labels)
    # Sorted so that the logged commands are deterministic.
    additions = sorted(target_labels - spare_labels)
    removals = sorted(spare_labels - target_labels)
    for host in hosts:
        if not dry_run:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(list(removals))
            host.add_labels(list(additions))
        else:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))
387
388
def _balance_board(arguments, afe, board, start_time, end_time):
    """Balance one board as requested by command line arguments.

    Looks up the main pool and the spare pool for `board`, works
    out the target pool size from the `--total`, `--grow`, or
    `--shrink` options (defaulting to the current size), and then
    exchanges broken or surplus DUTs in the main pool for working
    DUTs from the spare pool.

    No changes are made if the main pool has more broken DUTs than
    `arguments.max_broken`, unless `arguments.force_rebalance` is
    set.  Whether changes are applied or merely reported as shell
    commands is determined by `arguments.dry_run`.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param board         Board to be balanced.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, board, arguments.spare,
                          start_time, end_time, use_freon=True)
    main_pool = _DUTPool(afe, board, arguments.pool,
                         start_time, end_time)

    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Number of broken DUTs that cannot be replaced, and so
        # must stay in the main pool.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: the number of working DUTs to give
        # back, in addition to all the broken ones.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', board, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts), len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available.',
                  board, main_pool.pool, len(spare_pool.working_hosts))

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      board, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            # Report what will actually be returned: the pool may
            # hold fewer working DUTs than the shortfall asks for.
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      board, main_pool.pool,
                      len(surplus_duts),
                      len(surplus_duts) - len(main_pool.broken_hosts))

    if (len(main_pool.broken_hosts) > arguments.max_broken and
        not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   board, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this board to see if there is a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        return

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(arguments.dry_run, 'No exchange required.')
        return

    # First hand surplus DUTs over to the spare pool, then pull the
    # working replacements into the main pool.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)
486
487
def _parse_command(argv):
    """Parse the command line arguments.

    Build the `ArgumentParser` describing this command's syntax,
    apply it to the command line, and check that boards were
    selected in exactly one way: either by naming them, or with
    `--all-boards`.  A usage error terminates the command by way
    of `ArgumentParser.error()`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    cmd_parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Balance pool shortages from spares on reserve')

    # The three pool size options share everything except their
    # names and help text, and at most one of them may be given.
    size_options = (
        (('-t', '--total'),
         'Set the number of DUTs in the pool to the specified count '
         'for every BOARD'),
        (('-a', '--grow'),
         'Add the specified number of DUTs to the pool for every '
         'BOARD'),
        (('-d', '--shrink'),
         'Remove the specified number of DUTs from the pool for '
         'every BOARD'),
    )
    size_group = cmd_parser.add_mutually_exclusive_group()
    for flags, help_text in size_options:
        size_group.add_argument(*flags, type=int, metavar='COUNT',
                                default=None, help=help_text)

    cmd_parser.add_argument(
            '-s', '--spare', default='suites', metavar='POOL',
            help='Pool from which to draw replacement spares '
                 '(default: pool:suites)')
    cmd_parser.add_argument(
            '-n', '--dry-run', action='store_true',
            help='Report actions to take in the form of shell commands')
    cmd_parser.add_argument(
            '-v', '--verbose', action='store_true',
            help='Print more detail about calculations for debug '
                 'purposes.')

    cmd_parser.add_argument(
            '-m', '--max-broken', default=2, type=int, metavar='COUNT',
            help='Only rebalance a pool if it has at most COUNT broken '
                 'DUTs.')
    cmd_parser.add_argument(
            '-f', '--force-rebalance', action='store_true',
            help='Forcefully rebalance all DUTs in a pool, even if it '
                 'has a large number of broken DUTs. Before doing this, '
                 'please investigate whether there is a bug that is '
                 'bricking devices in the lab.')

    cmd_parser.add_argument(
            '--all-boards', action='store_true',
            help='Rebalance all boards.')

    cmd_parser.add_argument(
            'pool', metavar='POOL',
            help='Name of the pool to balance.')
    cmd_parser.add_argument(
            'boards', nargs='*', metavar='BOARD',
            help='Names of boards to balance.')

    arguments = cmd_parser.parse_args(argv[1:])

    # Boards must be named explicitly or selected with --all-boards,
    # but not both.
    if arguments.all_boards:
        if arguments.boards:
            cmd_parser.error('Cannot specify boards with --all-boards.')
    elif not arguments.boards:
        cmd_parser.error('No boards specified. To balance all boards, use '
                         '--all-boards')

    return arguments
562
563
def main(argv):
    """Standard main routine.

    Parses the command line, selects the boards to work on (the
    ones named on the command line, or every board found for the
    target pool when `--all-boards` is given), and balances them
    in a pool of parallel processes.  Host histories are examined
    over the 24 hours leading up to the time of the call.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    one_day = 24 * 60 * 60
    end_time = time.time()
    start_time = end_time - one_day
    afe = frontend.AFE(server=None)

    def balancer(i, board):
        """Balance the specified board.

        @param i The index of the board.
        @param board The board name.
        """
        # Separate the output of consecutive boards with a blank line.
        if i > 0:
            _log_message('')
        _balance_board(arguments, afe, board, start_time, end_time)

    if arguments.all_boards:
        boards = host_label_utils.get_all_boards(
            labels=[_POOL_PREFIX + arguments.pool])
    else:
        boards = arguments.boards
    board_args = [(index, name) for index, name in enumerate(boards)]
    try:
        parallel.RunTasksInProcessPool(balancer, board_args, processes=8)
    except KeyboardInterrupt:
        # An interrupt from the user ends the run without a traceback.
        pass
593
594
# Run the command only when this file is executed as a script, so
# that importing the module has no side effects.
if __name__ == '__main__':
    main(sys.argv)
597