#!/usr/bin/env python
# Copyright 2015 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Create e-mail reports of the Lab's DUT inventory.

Gathers a list of all DUTs of interest in the Lab, segregated by
board and pool, and determines whether each DUT is working or
broken. Then, send one or more e-mail reports summarizing the
status to e-mail addresses provided on the command line.

usage: lab_inventory.py [ options ] [ board ... ]

Options:
--duration / -d <hours>
    How far back in time to search job history to determine DUT
    status.

--board-notify <address>[,<address>]
    Send the "board status" e-mail to all the specified e-mail
    addresses.

--pool-notify <address>[,<address>]
    Send the "pool status" e-mail to all the specified e-mail
    addresses.

--recommend <number>
    When generating the "board status" e-mail, include a list of
    <number> specific DUTs to be recommended for repair.

--logdir <directory>
    Log progress and actions in a file under this directory. Text
    of any e-mail sent will also be logged in a timestamped file in
    this directory.

--debug
    Suppress writing log files and sending e-mail. Instead, write
    the output that would be generated onto stdout.

<board> arguments:
    With no arguments, gathers the status for all boards in the lab.
    With one or more named boards on the command line, restricts
    reporting to just those boards.

"""


import argparse
import logging
import logging.handlers
import os
import re
import sys
import time

import common
from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import time_utils
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
from autotest_lib.server.hosts import servo_host
from autotest_lib.site_utils import gmail_lib
from autotest_lib.site_utils import status_history
from autotest_lib.site_utils.suite_scheduler import constants


# The pools in the Lab that are actually of interest.
#
# These are general purpose pools of DUTs that are considered
# identical for purposes of testing. That is, a device in one of
# these pools can be shifted to another pool at will for purposes
# of supplying test demand.
#
# Devices in these pools are not allowed to have special-purpose
# attachments, or to be part of any kind of custom fixture.
# Devices in these pools are also required to reside in areas
# managed by the Platforms team (i.e. at the time of this writing,
# only in "Atlantis" or "Destiny").
#
# _CRITICAL_POOLS - Pools that must be kept fully supplied in order
#     to guarantee timely completion of tests from builders.
# _SPARE_POOL - A low priority pool that is allowed to provide
#     spares to replace broken devices in the critical pools.
# _MANAGED_POOLS - The set of all the general purpose pools
#     monitored by this script.

_CRITICAL_POOLS = ['bvt', 'cq', 'continuous']
_SPARE_POOL = 'suites'
_MANAGED_POOLS = _CRITICAL_POOLS + [_SPARE_POOL]

# _DEFAULT_DURATION:
#     Default value used for the --duration command line option.
#     Specifies how far back in time to search in order to determine
#     DUT status.

_DEFAULT_DURATION = 24

# _LOGDIR:
#     Relative path used in the calculation of the default setting
#     for the --logdir option. The full path is relative to
#     the root of the autotest directory, as determined from
#     sys.argv[0].
# _LOGFILE:
#     Basename of a file to which general log information will be
#     written.
# _LOG_FORMAT:
#     Format string for log messages.

_LOGDIR = os.path.join('logs', 'dut-data')
_LOGFILE = 'lab-inventory.log'
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'

# Pattern describing location-based host names in the Chrome OS test
# labs. Each DUT hostname designates the DUT's location:
#   * A lab (room) that's physically separated from other labs
#     (i.e. there's a door).
#   * A row (or aisle) of DUTs within the lab.
#   * A vertical rack of shelves on the row.
#   * A specific host on one shelf of the rack.

_HOSTNAME_PATTERN = re.compile(
        r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')


class _PoolCounts(object):
    """Maintains a set of `HostJobHistory` objects for a pool.

    The collected history objects are nominally all part of a single
    scheduling pool of DUTs. The collection maintains a list of
    working DUTs, a list of broken DUTs, and a list of all DUTs.

    Performance note: Certain methods in this class are potentially
    expensive:
      * `get_working()`
      * `get_working_list()`
      * `get_broken()`
      * `get_broken_list()`
    The first time any one of these methods is called, it causes
    multiple RPC calls with a relatively expensive set of database
    queries. However, the results of the queries are cached in the
    individual `HostJobHistory` objects, so only the first call
    actually pays the full cost.

    Additionally, `get_working_list()` and `get_broken_list()` both
    cache their return values to avoid recalculating lists at every
    call; this caching is separate from the caching of RPC results
    described above.

    This class is deliberately constructed to delay the RPC cost
    until the accessor methods are called (rather than to query in
    `record_host()`) so that it's possible to construct a complete
    `_LabInventory` without making the expensive queries at creation
    time. `_populate_board_counts()`, below, assumes this behavior.

    """

    def __init__(self):
        self._histories = []
        self._working_list = None
        self._broken_list = None


    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        Adding a host invalidates the cached working and broken
        lists, so they're recalculated on the next request.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        self._working_list = None
        self._broken_list = None
        self._histories.append(host_history)


    def get_working_list(self):
        """Return a list of all working DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is `WORKING`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is None:
            self._working_list = [
                    h for h in self._histories
                    if h.last_diagnosis()[0] == status_history.WORKING]
        return self._working_list


    def get_working(self):
        """Return the number of working DUTs in the pool."""
        return len(self.get_working_list())


    def get_broken_list(self):
        """Return a list of all broken DUTs in the pool.

        Filter `self._histories` for histories where the last
        diagnosis is not `WORKING`.

        Cache the result so that we only calculate it once.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is None:
            self._broken_list = [
                    h for h in self._histories
                    if h.last_diagnosis()[0] != status_history.WORKING]
        return self._broken_list


    def get_broken(self):
        """Return the number of broken DUTs in the pool."""
        return len(self.get_broken_list())


    def get_total(self):
        """Return the total number of DUTs in the pool."""
        return len(self._histories)


class _BoardCounts(object):
    """Maintains a set of `HostJobHistory` objects for a board.

    The collected history objects are nominally all of the same
    board. The collection maintains a count of working DUTs, a
    count of broken DUTs, and a total count. The counts can be
    obtained either for a single pool, or as a total across all
    pools.

    DUTs in the collection must be assigned to one of the pools
    in `_MANAGED_POOLS`.

    The `get_working()` and `get_broken()` methods rely on the
    methods of the same name in _PoolCounts, so the performance
    note in _PoolCounts applies here as well.

    """

    def __init__(self):
        self._pools = {
            pool: _PoolCounts() for pool in _MANAGED_POOLS
        }

    def record_host(self, host_history):
        """Add one `HostJobHistory` object to the collection.

        The host is filed under the pool named by its `host_pool`
        attribute; a pool outside `_MANAGED_POOLS` raises `KeyError`.

        @param host_history The `HostJobHistory` object to be
                            remembered.

        """
        pool = host_history.host_pool
        self._pools[pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Internal helper to count hosts in a given pool.

        The `get_pool_count` parameter is a function to calculate
        the exact count of interest for the pool.

        @param get_pool_count Function to return a count from a
                              _PoolCount object.
        @param pool           The pool to be counted. If `None`,
                              return the total across all pools.

        @return The count for the selected pool(s).

        """
        if pool is None:
            return sum(get_pool_count(counts)
                       for counts in self._pools.values())
        else:
            return get_pool_count(self._pools[pool])


    def get_working_list(self):
        """Return a list of all working DUTs for the board.

        Go through all HostJobHistory objects in the board's pools,
        selecting the ones where the last diagnosis is `WORKING`.

        @return A list of HostJobHistory objects.

        """
        l = []
        for p in self._pools.values():
            l.extend(p.get_working_list())
        return l


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool The pool to be counted. If `None`, return the
                    total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_PoolCounts.get_working, pool)


    def get_broken_list(self):
        """Return a list of all broken DUTs for the board.

        Go through all HostJobHistory objects in the board's pools,
        selecting the ones where the last diagnosis is not
        `WORKING`.

        @return A list of HostJobHistory objects.

        """
        l = []
        for p in self._pools.values():
            l.extend(p.get_broken_list())
        return l


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool The pool to be counted. If `None`, return the
                    total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_PoolCounts.get_broken, pool)


    def get_spares_buffer(self):
        """Return the nominal number of working spares.

        Calculates and returns how many working spares there would
        be in the spares pool if all broken DUTs were in the spares
        pool. This number may be negative, indicating a shortfall
        in the critical pools.

        @return The total number DUTs in the spares pool, less the total
                number of broken DUTs in all pools.
        """
        return self.get_total(_SPARE_POOL) - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool The pool to be counted. If `None`, return the
                    total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_PoolCounts.get_total, pool)


class _LabInventory(dict):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    The collection is indexed by board. Indexing returns the
    _BoardCounts object associated with the board.

    The collection is also iterable. The iterator returns all the
    boards in the inventory, in unspecified order.

    """

    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all DUTs in the `_MANAGED_POOLS` list. If `boardlist`
        is supplied, the inventory will be restricted to only the
        given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include. If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + l
                      for l in _MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # Keep only the hosts carrying one of the requested
            # board labels, grouped in the order the boards were
            # named.
            boardhosts = []
            for board in boardlist:
                board_label = constants.Labels.BOARD_PREFIX + board
                boardhosts.extend(
                        h for h in afehosts if board_label in h.labels)
            afehosts = boardhosts
        return cls([status_history.HostJobHistory(
                            afe, host, start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label. In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label. We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        boards = {h.host_board for h in histories}
        initval = {board: _BoardCounts() for board in boards}
        super(_LabInventory, self).__init__(initval)
        self._dut_count = len(histories)
        self._managed_boards = None
        for h in histories:
            self[h.host_board].record_host(h)


    def get_managed_boards(self):
        """Return the set of "managed" boards.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports. That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        The result is calculated on the first call and cached.

        @return A set of all the boards that have both spare and
                non-spare pools.
        """
        if self._managed_boards is None:
            self._managed_boards = set()
            for board, counts in self.items():
                spares = counts.get_total(_SPARE_POOL)
                total = counts.get_total()
                if spares != 0 and spares != total:
                    self._managed_boards.add(board)
        return self._managed_boards


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self)


def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location. The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` are left
    out of the result.

    Implementation note: host locations are sorted by converting
    each location into a base 100 number. If row, rack or
    host numbers exceed the range [0..99], then sorting will
    break down.

    @param inventory_list A list of `HostJobHistory` objects.
    @return A list of sorted lists of DUTs.

    """
    BASE = 100
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location:
            lab = location.group(1)
            key = 0
            for idx in location.group(2, 3, 4):
                key = BASE * key + int(idx)
            lab_lists.setdefault(lab, []).append((key, history))
    return_list = []
    for dut_list in lab_lists.values():
        dut_list.sort(key=lambda t: t[0])
        return_list.append([t[1] for t in dut_list])
    return return_list


def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    `buffer_counts` is a dictionary mapping board names to the
    size of the board's spares buffer.

    `repair_list` is a list of DUTs to be repaired.

    This function calculates the new set of buffer counts that would
    result from the proposed repairs, and scores the new set using
    two numbers:
      * Worst case buffer count for any board (higher is better).
        This is the more significant number for comparison.
      * Number of boards at the worst case (lower is better). This
        is the less significant number.

    Implementation note: The score could fail to reflect the
    intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts A dictionary mapping boards to buffer counts.
                         Must not be empty; `ValueError` is raised
                         if it is.
    @param repair_list   A list of `HostJobHistory` objects for the
                         DUTs to be repaired.
    @return A numeric score.

    """
    # Go through `buffer_counts`, and create a list of new counts
    # that records the buffer count for each board after repair.
    # The new list of counts discards the board names, as they don't
    # contribute to the final score.
    _NBOARDS = 1000
    repair_inventory = _LabInventory(repair_list)
    new_counts = []
    for b, c in buffer_counts.items():
        if b in repair_inventory:
            newcount = repair_inventory[b].get_total()
        else:
            newcount = 0
        new_counts.append(c + newcount)
    # Find the worst available spares count, and count how many
    # times that worst case occurs.
    worst_count = min(new_counts)
    num_worst = new_counts.count(worst_count)
    # Return the calculated score
    return _NBOARDS * worst_count - num_worst


def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired. The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    The algorithm used will guarantee that at least one DUT from a
    board with the smallest spares buffer will be recommended. If
    the worst spares buffer number is shared by more than one board,
    the algorithm will tend to prefer repair sets that include more
    of those boards over sets that cover fewer boards.

    If there are no broken DUTs to choose from, the message
    contains only the header lines.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        # Slide a window of `num_recommend` adjacent DUTs along the
        # lab's sorted list, keeping the best-scoring window.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    if recommendation is None:
        # Nothing is broken (or no broken DUT has a location-based
        # hostname), so there's nothing to list.
        recommendation = []
    message = ['Repair recommendations:\n',
               '%-30s %-16s %s' % (
                       'Hostname', 'Board', 'Servo instructions')]
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        if utils.host_is_in_lab_zone(servo_name):
            servo_message = 'Repair servo first'
        else:
            servo_message = 'No servo present'
        line = '%-30s %-16s %s' % (
                h.host.hostname, h.host_board, servo_message)
        message.append(line)
    return '\n'.join(message)


def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The board inventory is a list by board summarizing the number
    of working and broken DUTs, and the total shortfall or surplus
    of working devices relative to the minimum critical pool
    requirement.

    The report omits boards with no DUTs in the spare pool or with
    no DUTs in a critical pool.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    nworking = 0
    nbroken = 0
    nbroken_boards = 0
    summaries = []
    for board in inventory.get_managed_boards():
        logging.debug('Counting board inventory for %s', board)
        counts = inventory[board]
        # Summary elements laid out in the same order as the text
        # headers:
        #     Board Avail   Bad  Good Spare Total
        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]
        element = (board,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_working(),
                   counts.get_total(_SPARE_POOL),
                   counts.get_total())
        summaries.append(element)
        nbroken += element[2]
        nworking += element[3]
        if element[2]:
            nbroken_boards += 1
    ntotal = nworking + nbroken
    # Smallest spares buffer first; ties broken by most broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        working_percent = 100 - broken_percent
    else:
        # No managed DUTs at all; avoid dividing by zero.
        broken_percent = 0
        working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %6s' % ('Bad', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Boards with failures: %d' % nbroken_boards,
               'Boards in inventory: %d' % len(summaries),
               '', '',
               'Full board inventory:\n',
               '%-22s %5s %5s %5s %5s %5s' % (
                   'Board', 'Avail', 'Bad', 'Good',
                   'Spare', 'Total')]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)


_POOL_INVENTORY_HEADER = '''\
Notice to Infrastructure deputies: All boards shown below are at
less than full strength, please take action to resolve the issues.
Once you're satisified that failures won't recur, failed DUTs can
be replaced with spares by running `balance_pool`. Detailed
instructions can be found here:
    http://go/cros-manage-duts
'''


def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    The pool inventory is a list by pool and board summarizing the
    number of working and broken DUTs in the pool. Only boards with
    at least one broken DUT are included in the list.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    message = [_POOL_INVENTORY_HEADER]
    newline = ''
    for pool in _CRITICAL_POOLS:
        message.append(
            '%sStatus for pool:%s, by board:' % (newline, pool))
        message.append(
            '%-20s %5s %5s %5s' % (
                'Board', 'Bad', 'Good', 'Total'))
        data_list = []
        for board, counts in inventory.items():
            logging.debug('Counting inventory for %s, %s',
                          board, pool)
            broken = counts.get_broken(pool)
            if broken == 0:
                continue
            working = counts.get_working(pool)
            total = counts.get_total(pool)
            data_list.append((board, broken, working, total))
        if data_list:
            # Boards with the most broken DUTs come first.
            data_list = sorted(data_list, key=lambda d: -d[1])
            message.extend(
                ['%-20s %5d %5d %5d' % t for t in data_list])
        else:
            message.append('(All boards at full strength)')
        # Every pool section after the first is set off by a
        # blank line.
        newline = '\n'
    return '\n'.join(message)


def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message is logged in the selected log directory using `tag`
    for the file name.

    If the --debug option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    Failures to write the log file or to send the e-mail are
    logged, but not raised.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        print(report_body)
    else:
        filename = os.path.join(arguments.logdir, tag)
        try:
            with open(filename, 'w') as report_file:
                report_file.write(report_body)
        except EnvironmentError as e:
            logging.error('Failed to write %s: %s', filename, e)
        try:
            gmail_lib.send_email(all_recipients, subject, body)
        except Exception as e:
            logging.error('Failed to send e-mail to %s: %s',
                          all_recipients, e)


def _separate_email_addresses(address_list):
    """Parse a list of comma-separated lists of e-mail addresses.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.

    """
    newlist = []
    for arg in address_list:
        newlist.extend([email.strip() for email in arg.split(',')])
    return newlist


def _verify_arguments(arguments):
    """Validate command-line arguments.

    Join comma separated e-mail addresses for `--board-notify` and
    `--pool-notify` in separate option arguments into a single list.

    For non-debug uses, require that notification be requested for
    at least one report. For debug, if notification isn't specified,
    treat it as "run all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if not arguments.board_notify and not arguments.pool_notify:
        if not arguments.debug:
            sys.stderr.write('Must specify at least one of '
                             '--board-notify or --pool-notify\n')
            return False
        else:
            # We want to run all the reports. An empty notify list
            # will cause a report to be skipped, so make sure the
            # lists are non-empty.
            arguments.board_notify = ['']
            arguments.pool_notify = ['']
    return True


def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The default log directory is based on the parent directory
    containing this script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    basedir = os.path.dirname(os.path.abspath(script))
    basedir = os.path.dirname(basedir)
    return os.path.join(basedir, _LOGDIR)


def _parse_command(argv):
    """Parse the command line arguments.

    Create an argument parser for this command's syntax, parse the
    command line, and return the result of the ArgumentParser
    parse_args() method.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or
            `None` if the arguments fail `_verify_arguments()`.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')
    parser.add_argument('-d', '--duration', type=int,
                        default=_DEFAULT_DURATION, metavar='HOURS',
                        help='number of hours back to search for status'
                             ' (default: %d)' % _DEFAULT_DURATION)
    parser.add_argument('--board-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help='Generate board inventory message, '
                        'and send it to the given e-mail address(es)')
    parser.add_argument('--pool-notify', action='append',
                        default=[], metavar='ADDRESS',
                        help='Generate pool inventory message, '
                             'and send it to the given address(es)')
    parser.add_argument('-r', '--recommend', type=int, default=None,
                        help=('Specify how many DUTs should be '
                              'recommended for repair (default: no '
                              'recommendation)'))
    parser.add_argument('--debug', action='store_true',
                        help='Print e-mail messages on stdout '
                             'without sending them.')
    parser.add_argument('--logdir', default=_get_logdir(argv[0]),
                        help='Directory where logs will be written.')
    parser.add_argument('boardnames', nargs='*',
                        metavar='BOARD',
                        help='names of boards to report on '
                             '(default: all boards)')
    arguments = parser.parse_args(argv[1:])
    if not _verify_arguments(arguments):
        return None
    return arguments


def _configure_logging(arguments):
    """Configure the `logging` module for our needs.

    How we log depends on whether the `--debug` option was
    provided on the command line. Without the option, we log all
    messages at DEBUG level or above, and write them to a file in
    the directory specified by the `--logdir` option. With the
    option, we write log messages to stdout; messages below INFO
    level are discarded.

    The log file is configured to rotate once a week on Friday
    evening, preserving ~3 months worth of history.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`

    """
    root_logger = logging.getLogger()
    if arguments.debug:
        root_logger.setLevel(logging.INFO)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter())
    else:
        root_logger.setLevel(logging.DEBUG)
        logfile = os.path.join(arguments.logdir, _LOGFILE)
        handler = logging.handlers.TimedRotatingFileHandler(
                logfile, when='W4', backupCount=13)
        formatter = logging.Formatter(_LOG_FORMAT,
                                      time_utils.TIME_FMT)
        handler.setFormatter(formatter)
    # TODO(jrbarnette) This is gross. Importing client.bin.utils
    # implicitly imported logging_config, which calls
    # logging.basicConfig() *at module level*. That gives us an
    # extra logging handler that we don't want. So, clear out all
    # the handlers here.
    #
    # Iterate over a copy: removeHandler() mutates the handler list,
    # and removing from a list while walking it skips entries.
    for h in list(root_logger.handlers):
        root_logger.removeHandler(h)
    root_logger.addHandler(handler)


def _populate_board_counts(inventory):
    """Gather board counts while providing interactive feedback.

    Gathering the status of all individual DUTs in the lab can take
    considerable time (~30 minutes at the time of this writing).

    Normally, we pay that cost by querying as we go. However, with
    the `--debug` option, a human being may be watching the
    progress. So, we force the first (expensive) queries to happen
    up front, and provide a small ASCII progress bar to give an
    indicator of how many boards have been processed.

    @param inventory  _LabInventory object with the inventory to
                      be gathered.

    """
    n = 0
    total_broken = 0
    for counts in inventory.values():
        n += 1
        # Progress bar: '.' per board, '+' at every 5th, and the
        # tens digit at every 10th.
        if n % 10 == 5:
            c = '+'
        elif n % 10 == 0:
            c = '%d' % ((n // 10) % 10)
        else:
            c = '.'
        sys.stdout.write(c)
        sys.stdout.flush()
        # This next call is where all the time goes - it forces all
        # of a board's HostJobHistory objects to query the database
        # and cache their results.
        total_broken += counts.get_broken()
    sys.stdout.write('\n')
    sys.stdout.write('Found %d broken DUTs\n' % total_broken)


def main(argv):
    """Standard main routine.

    Exits with status 1 if the command line is invalid. Exceptions
    raised while gathering or reporting the inventory are logged
    rather than propagated; a keyboard interrupt ends the run
    quietly.

    @param argv  Command line arguments including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        end_time = int(time.time())
        start_time = end_time - arguments.duration * 60 * 60
        timestamp = time.strftime('%Y-%m-%d.%H',
                                  time.localtime(end_time))
        logging.debug('Starting lab inventory for %s', timestamp)
        if arguments.board_notify:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, start_time, end_time, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.debug:
            _populate_board_counts(inventory)

        if arguments.board_notify:
            if arguments.recommend:
                recommend_message = _generate_repair_recommendation(
                        inventory, arguments.recommend) + '\n\n\n'
            else:
                recommend_message = ''
            board_message = _generate_board_inventory_message(inventory)
            _send_email(arguments,
                        'boards-%s.txt' % timestamp,
                        'DUT board inventory %s' % timestamp,
                        arguments.board_notify,
                        recommend_message + board_message)

        if arguments.pool_notify:
            _send_email(arguments,
                        'pools-%s.txt' % timestamp,
                        'DUT pool inventory %s' % timestamp,
                        arguments.pool_notify,
                        _generate_pool_inventory_message(inventory))
    except KeyboardInterrupt:
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)


def get_managed_boards(afe):
    """Return the set of "managed" boards currently in the lab.

    Builds an inventory of all boards covering the last
    `_DEFAULT_DURATION` hours, and returns the boards that have
    DUTs in both the spare pool and a non-spare pool.

    @param afe  AFE object for constructing the `HostJobHistory`
                objects.
    @return A set of board names.
    """
    end_time = int(time.time())
    start_time = end_time - _DEFAULT_DURATION * 60 * 60
    inventory = _LabInventory.create_inventory(
            afe, start_time, end_time)
    return inventory.get_managed_boards()


if __name__ == '__main__':
    main(sys.argv)