#!/usr/bin/env python2
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Adjust pool balances to cover DUT shortfalls.

This command takes all broken DUTs in a specific pool for specific
models and swaps them with working DUTs taken from a selected pool
of spares.  The command is meant primarily for replacing broken DUTs
in critical pools like BVT or CQ, but it can also be used to adjust
pool sizes, or to create or remove pools.

usage:  balance_pool.py [ options ] POOL MODEL [ MODEL ... ]

positional arguments:
  POOL                  Name of the pool to balance
  MODEL                 Names of models to balance

optional arguments:
  -h, --help            show this help message and exit
  -t COUNT, --total COUNT
                        Set the number of DUTs in the pool to the specified
                        count for every MODEL
  -a COUNT, --grow COUNT
                        Add the specified number of DUTs to the pool for every
                        MODEL
  -d COUNT, --shrink COUNT
                        Remove the specified number of DUTs from the pool for
                        every MODEL
  -s POOL, --spare POOL
                        Pool from which to draw replacement spares (default:
                        pool:suites)
  -p PHASE, --phase PHASE
                        Phase to restrict the balance pool operation to
  --sku SKU             The specific SKU we intend to swap with
  -n, --dry-run         Report actions to take in the form of shell commands


The command attempts to remove all broken DUTs from the target POOL
for every MODEL, and replace them with enough working DUTs taken
from the spare pool to bring the strength of POOL to the requested
total COUNT.

If no COUNT options are supplied (i.e. there are no --total, --grow,
or --shrink options), the command will maintain the current totals of
DUTs for every MODEL in the target POOL.

If not enough working spares are available, broken DUTs may be left
in the pool to keep the pool at the target COUNT.

When reducing pool size, working DUTs will be returned after broken
DUTs, if it's necessary to achieve the target COUNT.

"""


import argparse
import os
import re
import sys
import time

import common
from autotest_lib.server import constants
from autotest_lib.server import site_utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.lib import status_history
from autotest_lib.site_utils import lab_inventory
from autotest_lib.utils import labellib
from chromite.lib import metrics
from chromite.lib import parallel

# This must be imported after chromite.lib.metrics
from infra_libs import ts_mon

_POOL_PREFIX = constants.Labels.POOL_PREFIX
# This is the ratio of all models we should calculate the default max
# number of broken models against.  It seemed like the best choice that
# was neither too strict nor lax.
_MAX_BROKEN_DEFAULT_RATIO = 3.0 / 8.0

_ALL_CRITICAL_POOLS = 'all_critical_pools'
_SPARE_DEFAULT = lab_inventory.SPARE_POOL


# _VALID_POOL_PATTERN - Regular expression matching pool names that will
# be accepted on the command line.
#
# Note: This pattern was selected merely to recognize all existing pool
# names; there's no underlying technical restriction motivating this
# pattern.  No reasonable request to add more special characters to the
# allowed set should be refused.
#
# The upper-case range must be `A-Z`.  A range of `A-z` also covers the
# ASCII punctuation between 'Z' and 'a' ('[', '\', ']', '^', '`'),
# which would let invalid pool names through.

_VALID_POOL_PATTERN = re.compile(r'^[a-zA-Z0-9_\-]+$')


def _log_message(message, *args):
    """Log a message with optional format arguments to stdout.

    This function logs a single line to stdout, with formatting
    if necessary, and without adornments.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stdout.write('%s\n' % message)


def _log_info(dry_run, message, *args):
    """Log information in a dry-run dependent fashion.

    This function logs a single line to stdout, with formatting
    if necessary.  When logging for a dry run, the message is
    printed as a shell comment, rather than as unadorned text.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param dry_run  Whether the message is logged for a dry run; if
                    true, the message is prefixed with '# '.
    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if dry_run:
        message = '# ' + message
    _log_message(message, *args)


def _log_error(message, *args):
    """Log an error to stderr, with optional format arguments.

    This function logs a single line to stderr, prefixed to indicate
    that it is an error message.

    If `*args` are supplied, the message will be formatted using
    the arguments.

    @param message  Message to be logged, possibly after formatting.
    @param args     Format arguments.  If empty, the message is logged
                    without formatting.

    """
    if args:
        message = message % args
    sys.stderr.write('ERROR: %s\n' % message)


class _DUTPool(object):
    """Information about a pool of DUTs matching given labels.

    This class collects information about all DUTs for a given pool and matching
    the given labels, and divides them into three categories:
      + Working - the DUT is working for testing, and not locked.
      + Broken - the DUT is unable to run tests, or it is locked.
      + Ineligible - the DUT is not available to be removed from this pool.  The
            DUT may be either working or broken.

    DUTs with more than one pool: label are ineligible for exchange
    during balancing.  This is done for the sake of chameleon hosts,
    which must always be assigned to pool:suites.  These DUTs are
    always marked with pool:chameleon to prevent their reassignment.

    TODO(jrbarnette):  The use of `pool:chameleon` (instead of just
    the `chameleon` label) is a hack that should be eliminated.

    _DUTPool instances are used to track both main pools that need
    to be resupplied with working DUTs and spare pools that supply
    those DUTs.

    @property pool                Name of the pool associated with
                                  this pool of DUTs.
    @property labels              Labels that constrain the DUTs to consider.
    @property working_hosts       The list of this pool's working DUTs.
    @property broken_hosts        The list of this pool's broken DUTs.
    @property ineligible_hosts    The list of this pool's ineligible DUTs.
    @property pool_labels         A list of labels that identify a DUT as part
                                  of this pool.
    @property total_hosts         The total number of hosts in pool.

    """

    def __init__(self, afe, pool, labels, start_time, end_time):
        self.pool = pool
        self.labels = labellib.LabelsMapping(labels)
        self.labels['pool'] = pool
        self._pool_labels = [_POOL_PREFIX + self.pool]

        self.working_hosts = []
        self.broken_hosts = []
        self.ineligible_hosts = []
        # _get_hosts() fills in the three lists above as a side effect.
        self.total_hosts = self._get_hosts(afe, start_time, end_time)


    def _get_hosts(self, afe, start_time, end_time):
        """Classify every DUT matching this pool's labels.

        Each host is appended to exactly one of `ineligible_hosts`,
        `working_hosts` or `broken_hosts`.

        @param afe          AFE object used to query host histories.
        @param start_time   Start time for the HostJobHistory query.
        @param end_time     End time for the HostJobHistory query.

        @return The total number of hosts found.

        """
        all_histories = status_history.HostJobHistory.get_multiple_histories(
                afe, start_time, end_time, self.labels.getlabels())
        for history in all_histories:
            host = history.host
            host_pools = [label for label in host.labels
                          if label.startswith(_POOL_PREFIX)]
            if len(host_pools) != 1:
                # Hosts in more than one pool must never be exchanged.
                self.ineligible_hosts.append(host)
                continue
            diag = history.last_diagnosis()[0]
            if diag == status_history.WORKING and not host.locked:
                self.working_hosts.append(host)
            else:
                self.broken_hosts.append(host)
        return len(all_histories)


    @property
    def pool_labels(self):
        """Return the AFE labels that identify this pool.

        The returned labels are the labels that must be removed
        to remove a DUT from the pool, or added to add a DUT.

        @return A list of AFE labels suitable for AFE.add_labels()
                or AFE.remove_labels().

        """
        return self._pool_labels

    def calculate_spares_needed(self, target_total):
        """Calculate and log the spares needed to achieve a target.

        Return how many working spares are needed to achieve the
        given `target_total` with all DUTs working.

        The spares count may be positive or negative.  Positive
        values indicate spares are needed to replace broken DUTs in
        order to reach the target; negative numbers indicate that
        no spares are needed, and that a corresponding number of
        working devices can be returned.

        If the new target total would require returning ineligible
        DUTs, an error is logged, and the target total is adjusted
        so that those DUTs are not exchanged.

        @param target_total  The new target pool size.

        @return The number of spares needed.

        """
        num_ineligible = len(self.ineligible_hosts)
        # The pool can never shrink below its ineligible DUTs, so a
        # target below that count cannot be met.
        target_is_feasible = target_total >= num_ineligible
        metrics.Boolean(
            'chromeos/autotest/balance_pools/exhausted_pools',
            'True for each pool/model which requests more DUTs than supplied',
            # TODO(jrbarnette) The 'board' field is a legacy.  We need
            # to leave it here until we do the extra work Monarch
            # requires to delete a field.
            field_spec=[
                    ts_mon.StringField('pool'),
                    ts_mon.StringField('board'),
                    ts_mon.StringField('model'),
            ]).set(
                    not target_is_feasible,
                    fields={
                            'pool': self.pool,
                            'board': self.labels.get('model', ''),
                            'model': self.labels.get('model', ''),
                    },
        )
        if not target_is_feasible:
            _log_error(
                    '%s pool (%s): Target of %d is below minimum of %d DUTs.',
                    self.pool, self.labels, target_total, num_ineligible,
            )
            _log_error('Adjusting target to %d DUTs.', num_ineligible)
            target_total = num_ineligible
        else:
            _log_message('%s %s pool: Target of %d is above minimum.',
                         self.labels.get('model', ''), self.pool, target_total)
        adjustment = target_total - self.total_hosts
        return len(self.broken_hosts) + adjustment

    def allocate_surplus(self, num_broken):
        """Allocate a list of DUTs that can be returned as surplus.

        Return a list of devices that can be returned in order to
        reduce this pool's supply.  Broken DUTs will be preferred
        over working ones.

        The `num_broken` parameter indicates the number of broken
        DUTs to be left in the pool.  If this number exceeds the
        number of broken DUTs actually in the pool, the returned
        list will be empty.  If this number is negative, it
        indicates a number of working DUTs to be returned in
        addition to all broken ones.

        @param num_broken    Total number of broken DUTs to be left in
                             this pool.

        @return A list of DUTs to be returned as surplus.

        """
        if num_broken >= 0:
            return self.broken_hosts[num_broken:]
        # Negative: give back every broken DUT plus `-num_broken`
        # working ones.
        return self.broken_hosts + self.working_hosts[:-num_broken]


def _exchange_labels(dry_run, hosts, target_pool, spare_pool):
    """Reassign a list of DUTs from one pool to another.

    For all the given hosts, remove all labels associated with
    `spare_pool`, and add the labels for `target_pool`.

    If `dry_run` is true, perform no changes, but log the `atest`
    commands needed to accomplish the necessary label changes.

    The `duts_moved` metric is incremented by `len(hosts)` before the
    empty-list check, so a transfer of zero DUTs is still reported.

    @param dry_run       Whether the logging is for a dry run or
                         for actual execution.
    @param hosts         List of DUTs (AFE hosts) to be reassigned.
    @param target_pool   The `_DUTPool` object to which the hosts
                         will be added.
    @param spare_pool    The `_DUTPool` object from which the hosts
                         are drawn.

    """
    _log_info(dry_run, 'Transferring %d DUTs from %s to %s.',
              len(hosts), spare_pool.pool, target_pool.pool)
    metrics.Counter(
        'chromeos/autotest/balance_pools/duts_moved',
        'DUTs transferred between pools',
        # TODO(jrbarnette) The 'board' field is a legacy.  We need to
        # leave it here until we do the extra work Monarch requires to
        # delete a field.
        field_spec=[
                ts_mon.StringField('board'),
                ts_mon.StringField('model'),
                ts_mon.StringField('source_pool'),
                ts_mon.StringField('target_pool'),
        ]
    ).increment_by(
            len(hosts),
            fields={
                    'board': target_pool.labels.get('model', ''),
                    'model': target_pool.labels.get('model', ''),
                    'source_pool': spare_pool.pool,
                    'target_pool': target_pool.pool,
            },
    )
    if not hosts:
        return

    additions = target_pool.pool_labels
    removals = spare_pool.pool_labels
    for host in hosts:
        if dry_run:
            _log_message('atest label remove -m %s %s',
                         host.hostname, ' '.join(removals))
            _log_message('atest label add -m %s %s',
                         host.hostname, ' '.join(additions))
        else:
            _log_message('Updating host: %s.', host.hostname)
            host.remove_labels(removals)
            host.add_labels(additions)


def _balance_model(arguments, afe, pool, labels, start_time, end_time):
    """Balance one model as requested by command line arguments.

    @param arguments     Parsed command line arguments.
    @param afe           AFE object to be used for the changes.
    @param pool          Pool of the model to be balanced.
    @param labels        Restrict the balancing operation within DUTs
                         that have these labels.
    @param start_time    Start time for HostJobHistory objects in
                         the DUT pools.
    @param end_time      End time for HostJobHistory objects in the
                         DUT pools.

    """
    spare_pool = _DUTPool(afe, arguments.spare, labels, start_time, end_time)
    main_pool = _DUTPool(afe, pool, labels, start_time, end_time)

    # With no COUNT option the pool keeps its current size.
    target_total = main_pool.total_hosts
    if arguments.total is not None:
        target_total = arguments.total
    elif arguments.grow:
        target_total += arguments.grow
    elif arguments.shrink:
        target_total -= arguments.shrink

    spares_needed = main_pool.calculate_spares_needed(target_total)
    if spares_needed > 0:
        spare_duts = spare_pool.working_hosts[:spares_needed]
        # Number of broken DUTs that cannot be replaced and so must
        # stay in the pool.
        shortfall = spares_needed - len(spare_duts)
    else:
        spare_duts = []
        # Zero or negative: the magnitude is the number of working
        # DUTs to give back in addition to the broken ones.
        shortfall = spares_needed

    surplus_duts = main_pool.allocate_surplus(shortfall)

    if spares_needed or surplus_duts or arguments.verbose:
        dry_run = arguments.dry_run
        _log_message('')

        _log_info(dry_run, 'Balancing %s %s pool:', labels, main_pool.pool)
        _log_info(dry_run,
                  'Total %d DUTs, %d working, %d broken, %d reserved.',
                  main_pool.total_hosts, len(main_pool.working_hosts),
                  len(main_pool.broken_hosts), len(main_pool.ineligible_hosts))

        if spares_needed > 0:
            add_msg = 'grow pool by %d DUTs' % spares_needed
        elif spares_needed < 0:
            add_msg = 'shrink pool by %d DUTs' % -spares_needed
        else:
            add_msg = 'no change to pool size'
        _log_info(dry_run, 'Target is %d working DUTs; %s.',
                  target_total, add_msg)

        _log_info(dry_run,
                  '%s %s pool has %d spares available for balancing pool %s',
                  labels, spare_pool.pool, len(spare_pool.working_hosts),
                  main_pool.pool)

        if spares_needed > len(spare_duts):
            _log_error('Not enough spares: need %d, only have %d.',
                       spares_needed, len(spare_duts))
        elif shortfall >= 0:
            _log_info(dry_run,
                      '%s %s pool will return %d broken DUTs, '
                      'leaving %d still in the pool.',
                      labels, main_pool.pool,
                      len(surplus_duts),
                      len(main_pool.broken_hosts) - len(surplus_duts))
        else:
            _log_info(dry_run,
                      '%s %s pool will return %d surplus DUTs, '
                      'including %d working DUTs.',
                      labels, main_pool.pool,
                      len(main_pool.broken_hosts) - shortfall,
                      -shortfall)

    # Safety valve: too many broken DUTs suggests a systemic problem,
    # so refuse to swap anything unless explicitly forced.
    if (len(main_pool.broken_hosts) > arguments.max_broken and
        not arguments.force_rebalance):
        _log_error('%s %s pool: Refusing to act on pool with %d broken DUTs.',
                   labels, main_pool.pool, len(main_pool.broken_hosts))
        _log_error('Please investigate this model to for a bug ')
        _log_error('that is bricking devices. Once you have finished your ')
        _log_error('investigation, you can force a rebalance with ')
        _log_error('--force-rebalance')
        spare_duts = []
        surplus_duts = []

    if not spare_duts and not surplus_duts:
        if arguments.verbose:
            _log_info(arguments.dry_run, 'No exchange required.')

    # Both exchanges are invoked even with empty lists; see
    # _exchange_labels(), which reports its metric before returning.
    _exchange_labels(arguments.dry_run, surplus_duts,
                     spare_pool, main_pool)
    _exchange_labels(arguments.dry_run, spare_duts,
                     main_pool, spare_pool)


def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the `ArgumentParser`
    `parse_args()` method.

    Invalid combinations of arguments are reported through
    `parser.error()`.

    @param argv Standard command line argument vector; `argv[0]` is
                assumed to be the command name.

    @return Result returned by `ArgumentParser.parse_args()`.

    """
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='Balance pool shortages from spares on reserve')

    parser.add_argument(
        '-w', '--web', type=str, default=None,
        help='AFE host to use. Default comes from shadow_config.',
    )
    count_group = parser.add_mutually_exclusive_group()
    count_group.add_argument('-t', '--total', type=int,
                             metavar='COUNT', default=None,
                             help='Set the number of DUTs in the '
                                  'pool to the specified count for '
                                  'every MODEL')
    count_group.add_argument('-a', '--grow', type=int,
                             metavar='COUNT', default=None,
                             help='Add the specified number of DUTs '
                                  'to the pool for every MODEL')
    count_group.add_argument('-d', '--shrink', type=int,
                             metavar='COUNT', default=None,
                             help='Remove the specified number of DUTs '
                                  'from the pool for every MODEL')

    parser.add_argument('-s', '--spare', default=_SPARE_DEFAULT,
                        metavar='POOL',
                        help='Pool from which to draw replacement '
                             'spares (default: pool:%s)' % _SPARE_DEFAULT)
    parser.add_argument('-n', '--dry-run', action='store_true',
                        help='Report actions to take in the form of '
                             'shell commands')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print more detail about calculations for debug '
                             'purposes.')

    parser.add_argument('-m', '--max-broken', default=2, type=int,
                        metavar='COUNT',
                        help='Only rebalance a pool if it has at most '
                             'COUNT broken DUTs.')
    parser.add_argument('-f', '--force-rebalance', action='store_true',
                        help='Forcefully rebalance all DUTs in a pool, even '
                             'if it has a large number of broken DUTs. '
                             'Before doing this, please investigate whether '
                             'there is a bug that is bricking devices in the '
                             'lab.')
    parser.add_argument('--production', action='store_true',
                        help='Treat this as a production run. This will '
                             'collect metrics.')

    parser.add_argument(
            '--all-models',
            action='store_true',
            help='Rebalance all managed models. This will do a very expensive '
                 'check to see how many models have at least one broken DUT. '
                 'To bypass that check, set --max-broken-models to 0.',
    )
    parser.add_argument(
            '--max-broken-models', default=None, type=int, metavar='COUNT',
            help='Only rebalance all models if number of models with broken '
                 'DUTs in the specified pool is less than COUNT.',
    )

    parser.add_argument('pool',
                        metavar='POOL',
                        help='Name of the pool to balance. Use %s to balance '
                             'all critical pools' % _ALL_CRITICAL_POOLS)
    parser.add_argument('models', nargs='*', metavar='MODEL',
                        help='Names of models to balance.')

    parser.add_argument('-p', '--phase', metavar='PHASE',
                        help='Optional phase label to restrict balance '
                             'operation to.')

    parser.add_argument('--sku', type=str,
                        help='Optional name of sku to restrict to.')

    arguments = parser.parse_args(argv[1:])

    # Error-check arguments.
    if arguments.models and arguments.all_models:
        parser.error('Cannot specify individual models on the command line '
                     'when using --all-models.')
    if (arguments.pool == _ALL_CRITICAL_POOLS and
        arguments.spare != _SPARE_DEFAULT):
        # Only the default spare pool is accepted in this mode.
        parser.error('Cannot specify a --spare pool other than %s when '
                     'balancing all critical pools.' % _SPARE_DEFAULT)
    for pool_name in (arguments.spare, arguments.pool):
        if not _VALID_POOL_PATTERN.match(pool_name):
            parser.error('Invalid pool name: %s' % pool_name)
    return arguments


def infer_balancer_targets(afe, arguments, pools):
    """Take some arguments and translate them to a list of models to balance

    With `--all-models`, every model the lab inventory reports for a
    pool is targeted, restricted by phase only.  Otherwise the models
    named on the command line are targeted, restricted by the optional
    sku and phase.

    Args:
    @param afe           AFE object to be used for taking inventory.
    @param arguments     Parsed command line arguments.
    @param pools         The list of pools to balance.

    @returns    a list of (pool, labels) tuples to be balanced

    """
    balancer_targets = []
    # Taking inventory is expensive, so it is fetched lazily, at most
    # once, and reused for every pool.
    inventory = None

    for pool in pools:
        if arguments.all_models:
            if inventory is None:
                inventory = lab_inventory.get_inventory(afe)
            for model in inventory.get_pool_models(pool):
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.phase:
                    labels['phase'] = arguments.phase
                balancer_targets.append((pool, labels.getlabels()))
        else:
            for model in arguments.models:
                labels = labellib.LabelsMapping()
                labels['model'] = model
                if arguments.sku:
                    labels['sku'] = arguments.sku
                if arguments.phase:
                    labels['phase'] = arguments.phase
                balancer_targets.append((pool, labels.getlabels()))
    return balancer_targets


def main(argv):
    """Standard main routine.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    if arguments.production:
        metrics_manager = site_utils.SetupTsMonGlobalState('balance_pools',
                                                           indirect=True)
    else:
        metrics_manager = site_utils.TrivialContextManager()

    with metrics_manager:
        with metrics.SuccessCounter('chromeos/autotest/balance_pools/runs'):
            # Host status is judged from the last 24 hours of history.
            end_time = time.time()
            start_time = end_time - 24 * 60 * 60
            afe = frontend_wrappers.RetryingAFE(server=arguments.web)

            def balancer(pool, labels):
                """Balance the specified model.

                @param pool: The pool to rebalance for the model.
                @param labels: labels to restrict to balancing operations
                        within.
                """
                _balance_model(arguments, afe, pool, labels,
                               start_time, end_time)
                _log_message('')

            pools = (lab_inventory.CRITICAL_POOLS
                     if arguments.pool == _ALL_CRITICAL_POOLS
                     else [arguments.pool])
            balancer_targets = infer_balancer_targets(afe, arguments, pools)
            try:
                parallel.RunTasksInProcessPool(
                        balancer,
                        balancer_targets,
                        processes=8,
                )
            except KeyboardInterrupt:
                # Deliberate: an interrupted run exits without a
                # traceback.
                pass


if __name__ == '__main__':
    main(sys.argv)