1#!/usr/bin/python 2# 3# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 4# Use of this source code is governed by a BSD-style license that can be 5# found in the LICENSE file. 6 7 8"""Tool for running suites of tests and waiting for completion. 9 10The desired test suite will be scheduled with autotest. By default, 11this tool will block until the job is complete, printing a summary 12at the end. Error conditions result in exceptions. 13 14This is intended for use only with Chrome OS test suits that leverage the 15dynamic suite infrastructure in server/cros/dynamic_suite.py. 16 17This script exits with one of the following codes: 180 - OK: Suite finished successfully 191 - ERROR: Test(s) failed, or hits its own timeout 202 - WARNING: Test(s) raised a warning or passed on retry, none failed/timed out. 213 - INFRA_FAILURE: Infrastructure related issues, e.g. 22 * Lab is down 23 * Too many duts (defined as a constant) in repair failed status 24 * Suite job issues, like bug in dynamic suite, 25 user aborted the suite, lose a drone/all devservers/rpc server, 26 0 tests ran, etc. 27 * provision failed 28 TODO(fdeng): crbug.com/413918, reexamine treating all provision 29 failures as INFRA failures. 304 - SUITE_TIMEOUT: Suite timed out, some tests ran, 31 none failed by the time the suite job was aborted. This will cover, 32 but not limited to, the following cases: 33 * A devserver failure that manifests as a timeout 34 * No DUTs available midway through a suite 35 * Provision/Reset/Cleanup took longer time than expected for new image 36 * A regression in scheduler tick time. 375- BOARD_NOT_AVAILABLE: If there is no host for the requested board/pool. 386- INVALID_OPTIONS: If options are not valid. 39""" 40 41import argparse 42import ast 43from collections import namedtuple 44from datetime import datetime 45from datetime import timedelta 46import getpass 47import json 48import logging 49import os 50import re 51import sys 52import time 53 54import common 55from chromite.lib import buildbot_annotations as annotations 56 57from autotest_lib.client.common_lib import control_data 58from autotest_lib.client.common_lib import error 59from autotest_lib.client.common_lib import global_config, enum 60from autotest_lib.client.common_lib import priorities 61from autotest_lib.client.common_lib import time_utils 62from autotest_lib.client.common_lib.cros import retry 63from autotest_lib.frontend.afe.json_rpc import proxy 64from autotest_lib.server import utils 65from autotest_lib.server.cros import provision 66from autotest_lib.server.cros.dynamic_suite import constants 67from autotest_lib.server.cros.dynamic_suite import frontend_wrappers 68from autotest_lib.server.cros.dynamic_suite import reporting 69from autotest_lib.server.cros.dynamic_suite import reporting_utils 70from autotest_lib.server.cros.dynamic_suite import tools 71from autotest_lib.site_utils import diagnosis_utils 72from autotest_lib.site_utils import job_overhead 73 74CONFIG = global_config.global_config 75 76_DEFAULT_AUTOTEST_INSTANCE = CONFIG.get_config_value( 77 'SERVER', 'hostname', type=str) 78_URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str) 79 80# Return code that will be sent back to autotest_rpc_server.py 81RETURN_CODES = enum.Enum( 82 'OK', 'ERROR', 'WARNING', 'INFRA_FAILURE', 'SUITE_TIMEOUT', 83 'BOARD_NOT_AVAILABLE', 'INVALID_OPTIONS') 84# The severity of return code. If multiple codes 85# apply, the script should always return the severest one. 86# E.g. if we have a test failure and the suite also timed out, 87# we should return 'ERROR'. 88SEVERITY = {RETURN_CODES.OK: 0, 89 RETURN_CODES.WARNING: 1, 90 RETURN_CODES.SUITE_TIMEOUT: 2, 91 RETURN_CODES.INFRA_FAILURE: 3, 92 RETURN_CODES.ERROR: 4} 93 94 95def get_worse_code(code1, code2): 96 """Compare the severity of two codes and return the worse code. 97 98 @param code1: An enum value of RETURN_CODES 99 @param code2: An enum value of RETURN_CODES 100 101 @returns: the more severe one between code1 and code2. 102 103 """ 104 return code1 if SEVERITY[code1] >= SEVERITY[code2] else code2 105 106 107def bool_str(x): 108 """Boolean string type for option arguments. 109 110 @param x: string representation of boolean value. 111 112 """ 113 if x == 'True': 114 return True 115 elif x == 'False': 116 return False 117 else: 118 raise argparse.ArgumentTypeError( 119 '%s is not one of True or False' % (x,)) 120 121 122def _get_priority_value(x): 123 """Convert a priority representation to its int value. 124 125 Priorities can be described either by an int value (possibly as a string) 126 or a name string. This function coerces both forms to an int value. 127 128 This function is intended for casting command line arguments during 129 parsing. 130 131 @param x: priority value as an int, int string, or name string 132 133 @returns: int value of priority 134 """ 135 try: 136 return int(x) 137 except ValueError: 138 try: 139 return priorities.Priority.get_value(x) 140 except AttributeError: 141 raise argparse.ArgumentTypeError( 142 'Unknown priority level %s. Try one of %s.' 143 % (x, ', '.join(priorities.Priority.names))) 144 145 146def make_parser(): 147 """Make ArgumentParser instance for run_suite.py.""" 148 parser = argparse.ArgumentParser( 149 usage="%(prog)s [options]") 150 parser.add_argument("-b", "--board", dest="board") 151 parser.add_argument("-i", "--build", dest="build") 152 parser.add_argument( 153 "-w", "--web", dest="web", default=None, 154 help="Address of a webserver to receive suite requests.") 155 parser.add_argument( 156 '--firmware_rw_build', dest='firmware_rw_build', default=None, 157 help='Firmware build to be installed in dut RW firmware.') 158 parser.add_argument( 159 '--firmware_ro_build', dest='firmware_ro_build', default=None, 160 help='Firmware build to be installed in dut RO firmware.') 161 parser.add_argument( 162 '--test_source_build', dest='test_source_build', default=None, 163 help=('Build that contains the test code, ' 164 'e.g., it can be the value of `--build`, ' 165 '`--firmware_rw_build` or `--firmware_ro_build` ' 166 'arguments. Default is None, that is, use the test ' 167 'code from `--build` (CrOS image)')) 168 # This should just be a boolean flag, but the autotest "proxy" code 169 # can't handle flags that don't take arguments. 170 parser.add_argument( 171 "-n", "--no_wait", dest="no_wait", default=False, type=bool_str, 172 help='Must pass "True" or "False" if used.') 173 # If you really want no pool, --pool="" will do it. USE WITH CARE. 174 parser.add_argument("-p", "--pool", dest="pool", default="suites") 175 parser.add_argument("-s", "--suite_name", dest="name") 176 parser.add_argument("-a", "--afe_timeout_mins", type=int, 177 dest="afe_timeout_mins", default=30) 178 parser.add_argument("-t", "--timeout_mins", type=int, 179 dest="timeout_mins", default=1440) 180 parser.add_argument("-x", "--max_runtime_mins", type=int, 181 dest="max_runtime_mins", default=1440) 182 parser.add_argument("-d", "--delay_sec", type=int, 183 dest="delay_sec", default=10) 184 parser.add_argument("-m", "--mock_job_id", dest="mock_job_id", 185 help="Attach to existing job id for already running " 186 "suite, and creates report.") 187 # NOTE(akeshet): This looks similar to --no_wait, but behaves differently. 188 # --no_wait is passed in to the suite rpc itself and affects the suite, 189 # while this does not. 190 parser.add_argument("-c", "--create_and_return", dest="create_and_return", 191 action="store_true", 192 help="Create the suite and print the job id, then " 193 "finish immediately.") 194 parser.add_argument("-u", "--num", dest="num", type=int, default=None, 195 help="Run on at most NUM machines.") 196 # Same boolean flag issue applies here. 197 parser.add_argument( 198 "-f", "--file_bugs", dest="file_bugs", default=False, type=bool_str, 199 help=('File bugs on test failures. Must pass "True" or ' 200 '"False" if used.')) 201 parser.add_argument("-l", "--bypass_labstatus", dest="bypass_labstatus", 202 action="store_true", help='Bypass lab status check.') 203 # We allow either a number or a string for the priority. This way, if you 204 # know what you're doing, one can specify a custom priority level between 205 # other levels. 206 parser.add_argument("-r", "--priority", dest="priority", 207 type=_get_priority_value, 208 default=priorities.Priority.DEFAULT, 209 action="store", 210 help="Priority of suite. Either numerical value, or " 211 "one of (" + ", ".join(priorities.Priority.names) 212 + ").") 213 parser.add_argument( 214 '--retry', dest='retry', default=False, type=bool_str, action='store', 215 help='Enable test retry. Must pass "True" or "False" if used.') 216 parser.add_argument('--max_retries', dest='max_retries', default=None, 217 type=int, action='store', help='Maximum retries' 218 'allowed at suite level. No limit if not specified.') 219 parser.add_argument('--minimum_duts', dest='minimum_duts', type=int, 220 default=0, action='store', 221 help='Check that the pool has at least such many ' 222 'healthy machines, otherwise suite will not run. ' 223 'Default to 0.') 224 parser.add_argument('--suite_min_duts', dest='suite_min_duts', type=int, 225 default=0, action='store', 226 help='Preferred minimum number of machines. Scheduler ' 227 'will prioritize on getting such many machines for ' 228 'the suite when it is competing with another suite ' 229 'that has a higher priority but already got minimum ' 230 'machines it needs. Default to 0.') 231 parser.add_argument("--suite_args", dest="suite_args", 232 default=None, action="store", 233 help="Argument string for suite control file.") 234 parser.add_argument('--offload_failures_only', 235 dest='offload_failures_only', type=bool_str, 236 action='store', default=False, 237 help='Only enable gs_offloading for failed tests. ' 238 'Successful tests will be deleted. Must pass "True"' 239 ' or "False" if used.') 240 parser.add_argument('--use_suite_attr', dest='use_suite_attr', 241 action='store_true', default=False, 242 help='Advanced. Run the suite based on ATTRIBUTES of ' 243 'control files, rather than SUITE.') 244 parser.add_argument('--json_dump', dest='json_dump', action='store_true', 245 default=False, 246 help='Dump the output of run_suite to stdout.') 247 parser.add_argument( 248 '--run_prod_code', dest='run_prod_code', 249 action='store_true', default=False, 250 help='Run the test code that lives in prod aka the test ' 251 'code currently on the lab servers.') 252 parser.add_argument( 253 '--delay_minutes', type=int, default=0, 254 help=('Delay the creation of test jobs for a given ' 255 'number of minutes. This argument can be used to ' 256 'force provision jobs being delayed, which helps ' 257 'to distribute loads across devservers.')) 258 parser.add_argument( 259 '--skip_duts_check', dest='skip_duts_check', action='store_true', 260 default=False, help='If True, skip minimum available DUTs check') 261 parser.add_argument( 262 '--job_keyvals', dest='job_keyvals', type=ast.literal_eval, 263 action='store', default=None, 264 help='A dict of job keyvals to be inject to suite control file') 265 parser.add_argument( 266 '--test_args', dest='test_args', type=ast.literal_eval, 267 action='store', default=None, 268 help=('A dict of args passed all the way to each individual test that ' 269 'will be actually ran.')) 270 return parser 271 272 273def verify_options(options): 274 """Verify the validity of options. 275 276 @param options: The parsed options to verify. 277 278 @returns: True if verification passes, False otherwise. 279 280 """ 281 if options.mock_job_id and ( 282 not options.build or not options.name or not options.board): 283 print ('When using -m, need to specify build, board and suite ' 284 'name which you have used for creating the original job') 285 return False 286 else: 287 if not options.build: 288 print 'Need to specify which build to use' 289 return False 290 if not options.board: 291 print 'Need to specify board' 292 return False 293 if not options.name: 294 print 'Need to specify suite name' 295 return False 296 if options.num is not None and options.num < 1: 297 print 'Number of machines must be more than 0, if specified.' 298 return False 299 if not options.retry and options.max_retries is not None: 300 print 'max_retries can only be used with --retry=True' 301 return False 302 if options.use_suite_attr and options.suite_args is not None: 303 print ('The new suite control file cannot parse the suite_args: %s.' 304 'Please not specify any suite_args here.' % options.suite_args) 305 return False 306 if options.no_wait and options.retry: 307 print 'Test retry is not available when using --no_wait=True' 308 # Default to use the test code in CrOS build. 309 if not options.test_source_build and options.build: 310 options.test_source_build = options.build 311 return True 312 313 314def change_options_for_suite_attr(options): 315 """Change options to be prepared to run the suite_attr_wrapper. 316 317 If specify 'use_suite_attr' from the cmd line, it indicates to run the 318 new style suite control file, suite_attr_wrapper. Then, change the 319 options.name to 'suite_attr_wrapper', change the options.suite_args to 320 include the arguments needed by suite_attr_wrapper. 321 322 @param options: The verified options. 323 324 @returns: The changed options. 325 326 """ 327 # Convert the suite_name to attribute boolean expression. 328 if type(options.name) is str: 329 attr_filter_val = 'suite:%s' % options.name 330 else: 331 attr_filter_val = ' or '.join(['suite:%s' % x for x in options.name]) 332 333 # change the suite_args to be a dict of arguments for suite_attr_wrapper 334 # if suite_args is not None, store the values in 'other_args' of the dict 335 args_dict = {} 336 args_dict['attr_filter'] = attr_filter_val 337 options.suite_args = str(args_dict) 338 options.name = 'suite_attr_wrapper' 339 340 return options 341 342 343class TestResult(object): 344 345 """Represents the result of a TestView.""" 346 347 def __init__(self, test_view, retry_count=0): 348 """Initialize instance. 349 350 @param test_view: TestView instance. 351 @param retry_count: Retry count for test. Optional. 352 """ 353 self.name = test_view.get_testname() 354 self.status = test_view['status'] 355 self.reason = test_view['reason'] 356 self.retry_count = retry_count 357 358 _PRETTY_STATUS_MAP = { 359 'GOOD': '[ PASSED ]', 360 'TEST_NA': '[ INFO ]', 361 } 362 363 @property 364 def _pretty_status(self): 365 """Pretty status string.""" 366 return self._PRETTY_STATUS_MAP.get(self.status, '[ FAILED ]') 367 368 def log_using(self, log_function, name_column_width): 369 """Log the test result using the given log function. 370 371 @param log_function: Log function to use. Example: logging.info 372 @param name_column_width: Width of name column for formatting. 373 """ 374 padded_name = self.name.ljust(name_column_width) 375 log_function('%s%s', padded_name, self._pretty_status) 376 if self.status != 'GOOD': 377 log_function('%s %s: %s', padded_name, self.status, self.reason) 378 if self.retry_count > 0: 379 log_function('%s retry_count: %s', padded_name, self.retry_count) 380 381 382def get_original_suite_name(suite_name, suite_args): 383 """Get the original suite name when running suite_attr_wrapper. 384 385 @param suite_name: the name of the suite launched in afe. When it is 386 suite_attr_wrapper, the suite that actually running is 387 specified in the suite_args. 388 @param suite_args: the parsed option which contains the original suite name. 389 390 @returns: the original suite name. 391 392 """ 393 if suite_name == 'suite_attr_wrapper': 394 attrs = ast.literal_eval(suite_args).get('attr_filter', '') 395 suite_list = ([x[6:] for x in re.split('[() ]', attrs) 396 if x and x.startswith('suite:')]) 397 return suite_list[0] if suite_list else suite_name 398 return suite_name 399 400 401class LogLink(object): 402 """Information needed to record a link in the logs. 403 404 Depending on context and the information provided at 405 construction time, the link may point to either to log files for 406 a job, or to a bug filed for a failure in the job. 407 408 @var anchor The link text. 409 @var url The link url. 410 @var bug_id Id of a bug to link to, or None. 411 """ 412 413 # A list of tests that don't get retried so skip the dashboard. 414 _SKIP_RETRY_DASHBOARD = ['provision'] 415 416 _BUG_LINK_PREFIX = 'Auto-Bug' 417 _LOG_LINK_PREFIX = 'Test-Logs' 418 419 420 @classmethod 421 def get_bug_link(cls, bug_id): 422 """Generate a bug link for the given bug_id. 423 424 @param bug_id: The id of the bug. 425 @return: A link, eg: https://crbug.com/<bug_id>. 426 """ 427 return reporting_utils.link_crbug(bug_id) 428 429 430 def __init__(self, anchor, server, job_string, bug_info=None, reason=None, 431 retry_count=0, testname=None): 432 """Initialize the LogLink by generating the log URL. 433 434 @param anchor The link text. 435 @param server The hostname of the server this suite ran on. 436 @param job_string The job whose logs we'd like to link to. 437 @param bug_info Info about the bug, if one was filed. 438 @param reason A string representing the reason of failure if any. 439 @param retry_count How many times the test has been retried. 440 @param testname Optional Arg that supplies the testname. 441 """ 442 self.anchor = anchor 443 self.url = _URL_PATTERN % (server, job_string) 444 self.reason = reason 445 self.retry_count = retry_count 446 self.testname = testname 447 if bug_info: 448 self.bug_id, self.bug_count = bug_info 449 else: 450 self.bug_id = None 451 self.bug_count = None 452 453 454 @property 455 def bug_url(self): 456 """URL of associated bug.""" 457 if self.bug_id: 458 return reporting_utils.link_crbug(self.bug_id) 459 else: 460 return None 461 462 463 @property 464 def _bug_count_text(self): 465 """Return bug count as human friendly text.""" 466 if self.bug_count is None: 467 bug_info = 'unknown number of reports' 468 elif self.bug_count == 1: 469 bug_info = 'new report' 470 else: 471 bug_info = '%s reports' % self.bug_count 472 return bug_info 473 474 475 def GenerateBuildbotLinks(self): 476 """Generate a link formatted to meet buildbot expectations. 477 478 If there is a bug associated with this link, report a link to the bug 479 and a link to the job logs; 480 otherwise report a link to the job logs. 481 482 @return A list of links formatted for the buildbot log annotator. 483 """ 484 bug_info_strings = [] 485 info_strings = [] 486 487 if self.retry_count > 0: 488 info_strings.append('retry_count: %d' % self.retry_count) 489 bug_info_strings.append('retry_count: %d' % self.retry_count) 490 491 if self.reason: 492 bug_info_strings.append(self.reason) 493 info_strings.append(self.reason) 494 495 # Add the bug link to buildbot_links 496 if self.bug_url: 497 bug_info_strings.append(self._bug_count_text) 498 499 bug_anchor_text = self._format_anchor_text(self._BUG_LINK_PREFIX, 500 bug_info_strings) 501 502 yield annotations.StepLink(bug_anchor_text, self.bug_url) 503 504 anchor_text = self._format_anchor_text(self._LOG_LINK_PREFIX, 505 info_strings) 506 yield annotations.StepLink(anchor_text, self.url) 507 508 509 def _format_anchor_text(self, prefix, info_strings): 510 """Format anchor text given a prefix and info strings. 511 512 @param prefix The prefix of the anchor text. 513 @param info_strings The infos presented in the anchor text. 514 @return A anchor_text with the right prefix and info strings. 515 """ 516 anchor_text = '[{prefix}]: {anchor}'.format( 517 prefix=prefix, 518 anchor=self.anchor.strip()) 519 if info_strings: 520 info_text = ', '.join(info_strings) 521 anchor_text += ': ' + info_text 522 return anchor_text 523 524 @property 525 def text_link(self): 526 """Link to the job's logs, for consumption by a human. 527 528 @return A link formatted for human readability. 529 """ 530 return '%s %s' % (self.anchor, self.url) 531 532 533 def GenerateWmatrixRetryLink(self): 534 """Generate a link to the wmatrix retry dashboard. 535 536 @return A link formatted for the buildbot log annotator. 537 """ 538 if not self.testname or self.testname in self._SKIP_RETRY_DASHBOARD: 539 return None 540 return annotations.StepLink( 541 text='[Flake-Dashboard]: %s' % self.testname, 542 url=reporting_utils.link_retry_url(self.testname)) 543 544 545class Timings(object): 546 """Timings for important events during a suite. 547 548 All timestamps are datetime.datetime objects. 549 550 @var suite_job_id: the afe job id of the suite job for which 551 we are recording the timing for. 552 @var download_start_time: the time the devserver starts staging 553 the build artifacts. Recorded in create_suite_job. 554 @var payload_end_time: the time when the artifacts only necessary to start 555 installsing images onto DUT's are staged. 556 Recorded in create_suite_job. 557 @var artifact_end_time: the remaining artifacts are downloaded after we kick 558 off the reimaging job, at which point we record 559 artifact_end_time. Recorded in dynamic_suite.py. 560 @var suite_start_time: the time the suite started. 561 @var tests_start_time: the time the first test started running. 562 @var tests_end_time: the time the last test finished running. 563 """ 564 565 def __init__(self, suite_job_id): 566 self.suite_job_id = suite_job_id 567 # Timings related to staging artifacts on devserver. 568 self.download_start_time = None 569 self.payload_end_time = None 570 self.artifact_end_time = None 571 572 # The test_start_time, but taken off the view that corresponds to the 573 # suite instead of an individual test. 574 self.suite_start_time = None 575 576 # Earliest and Latest tests in the set of TestViews passed to us. 577 self.tests_start_time = None 578 self.tests_end_time = None 579 580 581 def RecordTiming(self, view): 582 """Given a test report view, extract and record pertinent time info. 583 584 get_detailed_test_views() returns a list of entries that provide 585 info about the various parts of a suite run. This method can take 586 any one of these entries and look up timestamp info we might want 587 and record it. 588 589 If timestamps are unavailable, datetime.datetime.min/max will be used. 590 591 @param view: A TestView object. 592 """ 593 start_candidate = datetime.min 594 end_candidate = datetime.max 595 if view['test_started_time']: 596 start_candidate = time_utils.time_string_to_datetime( 597 view['test_started_time']) 598 if view['test_finished_time']: 599 end_candidate = time_utils.time_string_to_datetime( 600 view['test_finished_time']) 601 602 if view.get_testname() == TestView.SUITE_JOB: 603 self.suite_start_time = start_candidate 604 else: 605 self._UpdateFirstTestStartTime(start_candidate) 606 self._UpdateLastTestEndTime(end_candidate) 607 if view['afe_job_id'] == self.suite_job_id and 'job_keyvals' in view: 608 keyvals = view['job_keyvals'] 609 self.download_start_time = time_utils.time_string_to_datetime( 610 keyvals.get(constants.DOWNLOAD_STARTED_TIME), 611 handle_type_error=True) 612 613 self.payload_end_time = time_utils.time_string_to_datetime( 614 keyvals.get(constants.PAYLOAD_FINISHED_TIME), 615 handle_type_error=True) 616 617 self.artifact_end_time = time_utils.time_string_to_datetime( 618 keyvals.get(constants.ARTIFACT_FINISHED_TIME), 619 handle_type_error=True) 620 621 622 def _UpdateFirstTestStartTime(self, candidate): 623 """Update self.tests_start_time, iff candidate is an earlier time. 624 625 @param candidate: a datetime.datetime object. 626 """ 627 if not self.tests_start_time or candidate < self.tests_start_time: 628 self.tests_start_time = candidate 629 630 631 def _UpdateLastTestEndTime(self, candidate): 632 """Update self.tests_end_time, iff candidate is a later time. 633 634 @param candidate: a datetime.datetime object. 635 """ 636 if not self.tests_end_time or candidate > self.tests_end_time: 637 self.tests_end_time = candidate 638 639 640 def __str__(self): 641 return ('\n' 642 'Suite timings:\n' 643 'Downloads started at %s\n' 644 'Payload downloads ended at %s\n' 645 'Suite started at %s\n' 646 'Artifact downloads ended (at latest) at %s\n' 647 'Testing started at %s\n' 648 'Testing ended at %s\n' % (self.download_start_time, 649 self.payload_end_time, 650 self.suite_start_time, 651 self.artifact_end_time, 652 self.tests_start_time, 653 self.tests_end_time)) 654 655 656def instance_for_pool(pool_name): 657 """ 658 Return the hostname of the server that should be used to service a suite 659 for the specified pool. 660 661 @param pool_name: The pool (without 'pool:' to schedule the suite against. 662 @return: The correct host that should be used to service this suite run. 663 """ 664 return CONFIG.get_config_value( 665 'POOL_INSTANCE_SHARDING', pool_name, 666 default=_DEFAULT_AUTOTEST_INSTANCE) 667 668 669class TestView(object): 670 """Represents a test view and provides a set of helper functions.""" 671 672 673 SUITE_JOB = 'Suite job' 674 INFRA_TESTS = ['provision'] 675 676 677 def __init__(self, view, afe_job, suite_name, build, user, 678 solo_test_run=False): 679 """Init a TestView object representing a tko test view. 680 681 @param view: A dictionary representing a tko test view. 682 @param afe_job: An instance of frontend.afe.models.Job 683 representing the job that kicked off the test. 684 @param suite_name: The name of the suite 685 that the test belongs to. 686 @param build: The build for which the test is run. 687 @param user: The user for which the test is run. 688 @param solo_test_run: This is a solo test run not part of a suite. 689 """ 690 self.view = view 691 self.afe_job = afe_job 692 self.suite_name = suite_name 693 self.build = build 694 self.is_suite_view = afe_job.parent_job is None and not solo_test_run 695 # This is the test name that will be shown in the output. 696 self.testname = None 697 self.user = user 698 699 # The case that a job was aborted before it got a chance to run 700 # usually indicates suite has timed out (unless aborted by user). 701 # In this case, the abort reason will be None. 702 # Update the reason with proper information. 703 if (self.is_relevant_suite_view() and 704 not self.get_testname() == self.SUITE_JOB and 705 self.view['status'] == 'ABORT' and 706 not self.view['reason']): 707 self.view['reason'] = 'Timed out, did not run.' 708 709 710 def __getitem__(self, key): 711 """Overload __getitem__ so that we can still use [] 712 713 @param key: A key of the tko test view. 714 715 @returns: The value of an attribute in the view. 716 717 """ 718 return self.view[key] 719 720 721 def __iter__(self): 722 """Overload __iter__ so that it supports 'in' operator.""" 723 return iter(self.view) 724 725 726 def get_testname(self): 727 """Get test name that should be shown in the output. 728 729 Formalize the test_name we got from the test view. 730 731 Remove 'build/suite' prefix if any. And append 'experimental' prefix 732 for experimental tests if their names do not start with 'experimental'. 733 734 If one runs a test in control file via the following code, 735 job.runtest('my_Test', tag='tag') 736 for most of the cases, view['test_name'] would look like 'my_Test.tag'. 737 If this is the case, this method will just return the original 738 test name, i.e. 'my_Test.tag'. 739 740 There are four special cases. 741 1) A test view is for the suite job's SERVER_JOB. 742 In this case, this method will return 'Suite job'. 743 744 2) A test view is of a child job or a solo test run not part of a 745 suite, and for a SERVER_JOB or CLIENT_JOB. 746 In this case, we will take the job name, remove the build/suite 747 prefix from the job name, and append the rest to 'SERVER_JOB' 748 or 'CLIENT_JOB' as a prefix. So the names returned by this 749 method will look like: 750 'experimental_Telemetry Smoothness Measurement_SERVER_JOB' 751 'experimental_dummy_Pass_SERVER_JOB' 752 'dummy_Fail_SERVER_JOB' 753 754 3) A test view is of a suite job and its status is ABORT. 755 In this case, the view['test_name'] is the child job's name. 756 If it is an experimental test, 'experimental' will be part 757 of the name. For instance, 758 'lumpy-release/R35-5712.0.0/perf_v2/ 759 experimental_Telemetry Smoothness Measurement' 760 'lumpy-release/R35-5712.0.0/dummy/experimental_dummy_Pass' 761 'lumpy-release/R35-5712.0.0/dummy/dummy_Fail' 762 The above names will be converted to the following: 763 'experimental_Telemetry Smoothness Measurement' 764 'experimental_dummy_Pass' 765 'dummy_Fail' 766 767 4) A test view's status is of a suite job and its status is TEST_NA. 768 In this case, the view['test_name'] is the NAME field of the control 769 file. If it is an experimental test, 'experimental' will part of 770 the name. For instance, 771 'experimental_Telemetry Smoothness Measurement' 772 'experimental_dummy_Pass' 773 'dummy_Fail' 774 This method will not modify these names. 775 776 @returns: Test name after normalization. 777 778 """ 779 if self.testname is not None: 780 return self.testname 781 782 if (self.is_suite_view and 783 self.view['test_name'].startswith('SERVER_JOB')): 784 # Rename suite job's SERVER_JOB to 'Suite job'. 785 self.testname = self.SUITE_JOB 786 return self.testname 787 788 if (self.view['test_name'].startswith('SERVER_JOB') or 789 self.view['test_name'].startswith('CLIENT_JOB')): 790 # Append job name as a prefix for SERVER_JOB and CLIENT_JOB 791 testname= '%s_%s' % (self.view['job_name'], self.view['test_name']) 792 else: 793 testname = self.view['test_name'] 794 experimental = self.is_experimental() 795 # Remove the build and suite name from testname if any. 796 testname = tools.get_test_name( 797 self.build, self.suite_name, testname) 798 # If an experimental test was aborted, testname 799 # would include the 'experimental' prefix already. 800 prefix = constants.EXPERIMENTAL_PREFIX if ( 801 experimental and not 802 testname.startswith(constants.EXPERIMENTAL_PREFIX)) else '' 803 self.testname = prefix + testname 804 return self.testname 805 806 807 def is_relevant_suite_view(self): 808 """Checks whether this is a suite view we should care about. 809 810 @returns: True if it is relevant. False otherwise. 811 """ 812 return (self.get_testname() == self.SUITE_JOB or 813 (self.is_suite_view and 814 not self.view['test_name'].startswith('CLIENT_JOB') and 815 not self.view['subdir'])) 816 817 818 def is_test(self): 819 """Return whether the view is for an actual test. 820 821 @returns True if the view is for an actual test. 822 False if the view is for SERVER_JOB or CLIENT_JOB. 823 824 """ 825 return not (self.view['test_name'].startswith('SERVER_JOB') or 826 self.view['test_name'].startswith('CLIENT_JOB')) 827 828 829 def is_retry(self): 830 """Check whether the view is for a retry. 831 832 @returns: True, if the view is for a retry; False otherwise. 833 834 """ 835 return self.view['job_keyvals'].get('retry_original_job_id') is not None 836 837 838 def is_experimental(self): 839 """Check whether a test view is for an experimental test. 840 841 @returns: True if it is for an experimental test, False otherwise. 842 843 """ 844 return (self.view['job_keyvals'].get('experimental') == 'True' or 845 tools.get_test_name(self.build, self.suite_name, 846 self.view['test_name']).startswith('experimental')) 847 848 849 def hit_timeout(self): 850 """Check whether the corresponding job has hit its own timeout. 851 852 Note this method should not be called for those test views 853 that belongs to a suite job and are determined as irrelevant 854 by is_relevant_suite_view. This is because they are associated 855 to the suite job, whose job start/finished time make no sense 856 to an irrelevant test view. 857 858 @returns: True if the corresponding afe job has hit timeout. 859 False otherwise. 860 """ 861 if (self.is_relevant_suite_view() and 862 self.get_testname() != self.SUITE_JOB): 863 # Any relevant suite test view except SUITE_JOB 864 # did not hit its own timeout because it was not ever run. 865 return False 866 start = (datetime.strptime( 867 self.view['job_started_time'], time_utils.TIME_FMT) 868 if self.view['job_started_time'] else None) 869 end = (datetime.strptime( 870 self.view['job_finished_time'], time_utils.TIME_FMT) 871 if self.view['job_finished_time'] else None) 872 if not start or not end: 873 return False 874 else: 875 return ((end - start).total_seconds()/60.0 876 > self.afe_job.max_runtime_mins) 877 878 879 def is_aborted(self): 880 """Check if the view was aborted. 881 882 For suite job and child job test views, we check job keyval 883 'aborted_by' and test status. 884 885 For relevant suite job test views, we only check test status 886 because the suite job keyval won't make sense to individual 887 test views. 888 889 @returns: True if the test was as aborted, False otherwise. 890 891 """ 892 893 if (self.is_relevant_suite_view() and 894 self.get_testname() != self.SUITE_JOB): 895 return self.view['status'] == 'ABORT' 896 else: 897 return (bool(self.view['job_keyvals'].get('aborted_by')) and 898 self.view['status'] in ['ABORT', 'RUNNING']) 899 900 901 def is_in_fail_status(self): 902 """Check if the given test's status corresponds to a failure. 903 904 @returns: True if the test's status is FAIL or ERROR. False otherwise. 905 906 """ 907 # All the statuses tests can have when they fail. 908 return self.view['status'] in ['FAIL', 'ERROR', 'ABORT'] 909 910 911 def is_infra_test(self): 912 """Check whether this is a test that only lab infra is concerned. 913 914 @returns: True if only lab infra is concerned, False otherwise. 915 916 """ 917 return self.get_testname() in self.INFRA_TESTS 918 919 920 def get_buildbot_link_reason(self): 921 """Generate the buildbot link reason for the test. 922 923 @returns: A string representing the reason. 924 925 """ 926 return ('%s: %s' % (self.view['status'], self.view['reason']) 927 if self.view['reason'] else self.view['status']) 928 929 930 def get_job_id_owner_str(self): 931 """Generate the job_id_owner string for a test. 932 933 @returns: A string which looks like 135036-username 934 935 """ 936 return '%s-%s' % (self.view['afe_job_id'], self.user) 937 938 939 def get_bug_info(self, suite_job_keyvals): 940 """Get the bug info from suite_job_keyvals. 941 942 If a bug has been filed for the test, its bug info (bug id and counts) 943 will be stored in the suite job's keyvals. This method attempts to 944 retrieve bug info of the test from |suite_job_keyvals|. It will return 945 None if no bug info is found. No need to check bug info if the view is 946 SUITE_JOB. 947 948 @param suite_job_keyvals: The job keyval dictionary of the suite job. 949 All the bug info about child jobs are stored in 950 suite job's keyvals. 951 952 @returns: None if there is no bug info, or a pair with the 953 id of the bug, and the count of the number of 954 times the bug has been seen. 955 956 """ 957 if self.get_testname() == self.SUITE_JOB: 958 return None 959 if (self.view['test_name'].startswith('SERVER_JOB') or 960 self.view['test_name'].startswith('CLIENT_JOB')): 961 # Append job name as a prefix for SERVER_JOB and CLIENT_JOB 962 testname= '%s_%s' % (self.view['job_name'], self.view['test_name']) 963 else: 964 testname = self.view['test_name'] 965 966 return tools.get_test_failure_bug_info( 967 suite_job_keyvals, self.view['afe_job_id'], 968 testname) 969 970 971 def should_display_buildbot_link(self): 972 """Check whether a buildbot link should show for this view. 973 974 For suite job view, show buildbot link if it fails. 975 For normal test view, 976 show buildbot link if it is a retry 977 show buildbot link if it hits its own timeout. 978 show buildbot link if it fails. This doesn't 979 include the case where it was aborted but has 980 not hit its own timeout (most likely it was aborted because 981 suite has timed out). 982 983 @returns: True if we should show the buildbot link. 984 False otherwise. 985 """ 986 is_bad_status = (self.view['status'] != 'GOOD' and 987 self.view['status'] != 'TEST_NA') 988 if self.get_testname() == self.SUITE_JOB: 989 return is_bad_status 990 else: 991 if self.is_retry(): 992 return True 993 if is_bad_status: 994 return not self.is_aborted() or self.hit_timeout() 995 996 997 def get_control_file_attributes(self): 998 """Get the attributes from the control file of the test. 999 1000 @returns: A list of test attribute or None. 1001 """ 1002 control_file = self.afe_job.control_file 1003 attributes = None 1004 if control_file: 1005 cd = control_data.parse_control_string(control_file) 1006 attributes = list(cd.attributes) 1007 return attributes 1008 1009 1010 def override_afe_job_id(self, afe_job_id): 1011 """Overrides the AFE job id for the test. 1012 1013 @param afe_job_id: The new AFE job id to use. 1014 """ 1015 self.view['afe_job_id'] = afe_job_id 1016 1017 1018def log_buildbot_links(log_func, links): 1019 """Output buildbot links to log. 1020 1021 @param log_func: Logging function to use. 1022 @param links: Iterable of LogLink instances. 1023 """ 1024 for link in links: 1025 for generated_link in link.GenerateBuildbotLinks(): 1026 log_func(generated_link) 1027 wmatrix_link = link.GenerateWmatrixRetryLink() 1028 if wmatrix_link: 1029 log_func(wmatrix_link) 1030 1031 1032class ResultCollector(object): 1033 """Collect test results of a suite or a single test run. 1034 1035 Once a suite job has finished, use this class to collect test results. 1036 `run` is the core method that is to be called first. Then the caller 1037 could retrieve information like return code, return message, is_aborted, 1038 and timings by accessing the collector's public attributes. And output 1039 the test results and links by calling the 'output_*' methods. 1040 1041 Here is a overview of what `run` method does. 1042 1043 1) Collect the suite job's results from tko_test_view_2. 1044 For the suite job, we only pull test views without a 'subdir'. 1045 A NULL subdir indicates that the test was _not_ executed. This could be 1046 that no child job was scheduled for this test or the child job got 1047 aborted before starts running. 1048 (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially) 1049 1050 2) Collect the child jobs' results from tko_test_view_2. 1051 For child jobs, we pull all the test views associated with them. 1052 (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially) 1053 1054 3) Generate web and buildbot links. 1055 4) Compute timings of the suite run. 1056 5) Compute the return code based on test results. 1057 1058 @var _instance_server: The hostname of the server that is used 1059 to service the suite. 1060 @var _afe: The afe rpc client. 1061 @var _tko: The tko rpc client. 1062 @var _build: The build for which the suite is run, 1063 e.g. 'lumpy-release/R35-5712.0.0' 1064 @var _board: The target board for which the suite is run, 1065 e.g., 'lumpy', 'link'. 1066 @var _suite_name: The suite name, e.g. 'bvt', 'dummy'. 1067 @var _suite_job_id: The job id of the suite for which we are going to 1068 collect results. 1069 @var _original_suite_name: The suite name we record timing would be 1070 different from _suite_name when running 1071 suite_attr_wrapper. 1072 @var _suite_views: A list of TestView objects, representing relevant 1073 test views of the suite job. 1074 @var _child_views: A list of TestView objects, representing test views 1075 of the child jobs. 1076 @var _test_views: A list of TestView objects, representing all test views 1077 from _suite_views and _child_views. 1078 @var _web_links: A list of web links pointing to the results of jobs. 1079 @var _buildbot_links: A list of buildbot links for non-passing tests. 1080 @var _solo_test_run: True if this is a single test run. 1081 @var return_code: The exit code that should be returned by run_suite. 1082 @var return_message: Any message that should be displayed to explain 1083 the return code. 1084 @var is_aborted: Whether the suite was aborted or not. 1085 True, False or None (aborting status is unknown yet) 1086 @var timings: A Timing object that records the suite's timings. 1087 1088 """ 1089 1090 1091 def __init__(self, instance_server, afe, tko, build, board, 1092 suite_name, suite_job_id, original_suite_name=None, 1093 user=None, solo_test_run=False): 1094 self._instance_server = instance_server 1095 self._afe = afe 1096 self._tko = tko 1097 self._build = build 1098 self._board = board 1099 self._suite_name = suite_name 1100 self._suite_job_id = suite_job_id 1101 self._original_suite_name = original_suite_name or suite_name 1102 self._suite_views = [] 1103 self._child_views = [] 1104 self._test_views = [] 1105 self._retry_counts = {} 1106 self._missing_results = {} 1107 self._web_links = [] 1108 self._buildbot_links = [] 1109 self._num_child_jobs = 0 1110 self.return_code = None 1111 self.return_message = '' 1112 self.is_aborted = None 1113 self.timings = None 1114 self._user = user or getpass.getuser() 1115 self._solo_test_run = solo_test_run 1116 1117 1118 @property 1119 def buildbot_links(self): 1120 """Provide public access to buildbot links.""" 1121 return self._buildbot_links 1122 1123 1124 def _fetch_relevant_test_views_of_suite(self): 1125 """Fetch relevant test views of the suite job. 1126 1127 For the suite job, there will be a test view for SERVER_JOB, and views 1128 for results of its child jobs. For example, assume we've created 1129 a suite job (afe_job_id: 40) that runs dummy_Pass, dummy_Fail, 1130 dummy_Pass.bluetooth. Assume dummy_Pass was aborted before running while 1131 dummy_Path.bluetooth got TEST_NA as no duts have bluetooth. 1132 So the suite job's test views would look like 1133 _____________________________________________________________________ 1134 test_idx| job_idx|test_name |subdir |afe_job_id|status 1135 10 | 1000 |SERVER_JOB |---- |40 |GOOD 1136 11 | 1000 |dummy_Pass |NULL |40 |ABORT 1137 12 | 1000 |dummy_Fail.Fail |41-onwer/...|40 |FAIL 1138 13 | 1000 |dummy_Fail.Error |42-owner/...|40 |ERROR 1139 14 | 1000 |dummy_Pass.bluetooth|NULL |40 |TEST_NA 1140 1141 For a suite job, we only care about 1142 a) The test view for the suite job's SERVER_JOB 1143 b) The test views for real tests without a subdir. A NULL subdir 1144 indicates that a test didn't get executed. 1145 So, for the above example, we only keep test views whose test_idxs 1146 are 10, 11, 14. 1147 1148 @returns: A list of TestView objects, representing relevant 1149 test views of the suite job. 1150 1151 """ 1152 suite_job = self._afe.get_jobs(id=self._suite_job_id)[0] 1153 views = self._tko.run(call='get_detailed_test_views', 1154 afe_job_id=self._suite_job_id) 1155 relevant_views = [] 1156 for v in views: 1157 v = TestView(v, suite_job, self._suite_name, self._build, self._user, 1158 solo_test_run=self._solo_test_run) 1159 if v.is_relevant_suite_view(): 1160 # If the test doesn't have results in TKO and is being 1161 # displayed in the suite view instead of the child view, 1162 # then afe_job_id is incorrect and from the suite. 1163 # Override it based on the AFE job id which was missing 1164 # results. 1165 # TODO: This is likely inaccurate if a test has multiple 1166 # tries which all fail TKO parse stage. 1167 if v['test_name'] in self._missing_results: 1168 v.override_afe_job_id( 1169 self._missing_results[v['test_name']][0]) 1170 relevant_views.append(v) 1171 return relevant_views 1172 1173 1174 def _compute_retry_count(self, view): 1175 """Return how many times the test has been retried. 1176 1177 @param view: A TestView instance. 1178 @returns: An int value indicating the retry count. 1179 1180 """ 1181 old_job = view['job_keyvals'].get('retry_original_job_id') 1182 count = 0 1183 while old_job: 1184 count += 1 1185 views = self._tko.run( 1186 call='get_detailed_test_views', afe_job_id=old_job) 1187 old_job = (views[0]['job_keyvals'].get('retry_original_job_id') 1188 if views else None) 1189 return count 1190 1191 1192 def _fetch_test_views_of_child_jobs(self, jobs=None): 1193 """Fetch test views of child jobs. 1194 1195 @returns: A tuple (child_views, retry_counts, missing_results) 1196 child_views is list of TestView objects, representing 1197 all valid views. 1198 retry_counts is a dictionary that maps test_idx to retry 1199 counts. It only stores retry counts that are greater than 0. 1200 missing_results is a dictionary that maps test names to 1201 lists of job ids. 1202 1203 """ 1204 child_views = [] 1205 retry_counts = {} 1206 missing_results = {} 1207 child_jobs = jobs or self._afe.get_jobs(parent_job_id=self._suite_job_id) 1208 if child_jobs: 1209 self._num_child_jobs = len(child_jobs) 1210 for job in child_jobs: 1211 views = [TestView(v, job, self._suite_name, self._build, self._user) 1212 for v in self._tko.run( 1213 call='get_detailed_test_views', afe_job_id=job.id, 1214 invalid=0)] 1215 if len(views) == 0: 1216 missing_results.setdefault(job.name, []).append(job.id) 1217 contains_test_failure = any( 1218 v.is_test() and v['status'] != 'GOOD' for v in views) 1219 for v in views: 1220 if (v.is_test() or 1221 v['status'] != 'GOOD' and not contains_test_failure): 1222 # For normal test view, just keep it. 1223 # For SERVER_JOB or CLIENT_JOB, only keep it 1224 # if it fails and no other test failure. 1225 child_views.append(v) 1226 retry_count = self._compute_retry_count(v) 1227 if retry_count > 0: 1228 retry_counts[v['test_idx']] = retry_count 1229 return child_views, retry_counts, missing_results 1230 1231 1232 def _generate_web_and_buildbot_links(self): 1233 """Generate web links and buildbot links.""" 1234 # TODO(fdeng): If a job was aborted before it reaches Running 1235 # state, we read the test view from the suite job 1236 # and thus this method generates a link pointing to the 1237 # suite job's page for the aborted job. Need a fix. 1238 self._web_links = [] 1239 self._buildbot_links = [] 1240 # Bug info are stored in the suite job's keyvals. 1241 if self._solo_test_run: 1242 suite_job_keyvals = {} 1243 else: 1244 suite_job_keyvals = self._suite_views[0]['job_keyvals'] 1245 for v in self._test_views: 1246 retry_count = self._retry_counts.get(v['test_idx'], 0) 1247 bug_info = v.get_bug_info(suite_job_keyvals) 1248 job_id_owner = v.get_job_id_owner_str() 1249 link = LogLink( 1250 anchor=v.get_testname(), 1251 server=self._instance_server, 1252 job_string=job_id_owner, 1253 bug_info=bug_info, retry_count=retry_count, 1254 testname=v.get_testname()) 1255 self._web_links.append(link) 1256 1257 if v.should_display_buildbot_link(): 1258 link.reason = v.get_buildbot_link_reason() 1259 self._buildbot_links.append(link) 1260 1261 1262 def _record_timings(self): 1263 """Record suite timings.""" 1264 self.timings = Timings(self._suite_job_id) 1265 for v in self._test_views: 1266 self.timings.RecordTiming(v) 1267 1268 1269 def _get_return_msg(self, code, tests_passed_after_retry): 1270 """Return the proper message for a given return code. 1271 1272 @param code: An enum value of RETURN_CODES 1273 @param test_passed_after_retry: True/False, indicating 1274 whether there are test(s) that have passed after retry. 1275 1276 @returns: A string, representing the message. 1277 1278 """ 1279 if code == RETURN_CODES.INFRA_FAILURE: 1280 return 'Suite job failed or provisioning failed.' 1281 elif code == RETURN_CODES.SUITE_TIMEOUT: 1282 return ('Some test(s) was aborted before running,' 1283 ' suite must have timed out.') 1284 elif code == RETURN_CODES.WARNING: 1285 if tests_passed_after_retry: 1286 return 'Some test(s) passed after retry.' 1287 else: 1288 return 'Some test(s) raised a warning.' 1289 elif code == RETURN_CODES.ERROR: 1290 return 'Some test(s) failed.' 1291 else: 1292 return '' 1293 1294 1295 def _compute_return_code(self): 1296 """Compute the exit code based on test results.""" 1297 code = RETURN_CODES.OK 1298 tests_passed_after_retry = False 1299 1300 for v in self._test_views: 1301 # The order of checking each case is important. 1302 if v.is_experimental(): 1303 continue 1304 if v.get_testname() == TestView.SUITE_JOB: 1305 if v.is_aborted() and v.hit_timeout(): 1306 current_code = RETURN_CODES.SUITE_TIMEOUT 1307 elif v.is_in_fail_status(): 1308 current_code = RETURN_CODES.INFRA_FAILURE 1309 elif v['status'] == 'WARN': 1310 current_code = RETURN_CODES.WARNING 1311 else: 1312 current_code = RETURN_CODES.OK 1313 else: 1314 if v.is_aborted() and v.is_relevant_suite_view(): 1315 # The test was aborted before started 1316 # This gurantees that the suite has timed out. 1317 current_code = RETURN_CODES.SUITE_TIMEOUT 1318 elif v.is_aborted() and not v.hit_timeout(): 1319 # The test was aborted, but 1320 # not due to a timeout. This is most likely 1321 # because the suite has timed out, but may 1322 # also because it was aborted by the user. 1323 # Since suite timing out is determined by checking 1324 # the suite job view, we simply ignore this view here. 1325 current_code = RETURN_CODES.OK 1326 elif v.is_in_fail_status(): 1327 # The test job failed. 1328 if v.is_infra_test(): 1329 current_code = RETURN_CODES.INFRA_FAILURE 1330 else: 1331 current_code = RETURN_CODES.ERROR 1332 elif v['status'] == 'WARN': 1333 # The test/suite job raised a wanrning. 1334 current_code = RETURN_CODES.WARNING 1335 elif v.is_retry(): 1336 # The test is a passing retry. 1337 current_code = RETURN_CODES.WARNING 1338 tests_passed_after_retry = True 1339 else: 1340 current_code = RETURN_CODES.OK 1341 code = get_worse_code(code, current_code) 1342 1343 self.return_code = code 1344 self.return_message = self._get_return_msg( 1345 code, tests_passed_after_retry) 1346 1347 1348 def _make_test_results(self): 1349 """Make TestResults for collected tests. 1350 1351 @returns: List of TestResult instances. 1352 """ 1353 test_results = [] 1354 for test_view in self._test_views: 1355 test_result = TestResult( 1356 test_view=test_view, 1357 retry_count=self._retry_counts.get(test_view['test_idx'], 0)) 1358 test_results.append(test_result) 1359 return test_results 1360 1361 1362 def output_results(self): 1363 """Output test results, timings and web links.""" 1364 # Output test results 1365 test_results = self._make_test_results() 1366 max_name_length = max(len(test_result.name) 1367 for test_result in test_results) 1368 for test_result in test_results: 1369 test_result.log_using(logging.info, max_name_length + 3) 1370 # Output suite timings 1371 logging.info(self.timings) 1372 # Output links to test logs 1373 logging.info('\nLinks to test logs:') 1374 for link in self._web_links: 1375 logging.info(link.text_link) 1376 logging.info('\n') 1377 1378 1379 def get_results_dict(self): 1380 """Write test results, timings and web links into a dict. 1381 1382 @returns: A dict of results in the format like: 1383 { 1384 'tests': { 1385 'test_1': {'status': 'PASSED', 'attributes': [1,2], ...} 1386 'test_2': {'status': 'FAILED', 'attributes': [1],...} 1387 } 1388 'suite_timings': { 1389 'download_start': '1998-07-17 00:00:00', 1390 'payload_download_end': '1998-07-17 00:00:05', 1391 ... 1392 } 1393 } 1394 """ 1395 output_dict = {} 1396 tests_dict = output_dict.setdefault('tests', {}) 1397 for v in self._test_views: 1398 test_name = v.get_testname() 1399 test_info = tests_dict.setdefault(test_name, {}) 1400 test_info.update({ 1401 'status': v['status'], 1402 'attributes': v.get_control_file_attributes() or list(), 1403 'reason': v['reason'], 1404 'retry_count': self._retry_counts.get(v['test_idx'], 0), 1405 }) 1406 # For aborted test, the control file will not be parsed and thus 1407 # fail to get the attributes info. Therefore, the subsystems the 1408 # abort test testing will be missing. For this case, we will assume 1409 # the aborted test will test all subsystems, set subsystem:default. 1410 if (test_info['status'] == 'ABORT' and 1411 not any('subsystem:' in a for a in test_info['attributes'])): 1412 test_info['attributes'].append('subsystem:default') 1413 1414 # Write the links to test logs into the |tests_dict| of |output_dict|. 1415 # For test whose status is not 'GOOD', the link is also buildbot_link. 1416 for link in self._web_links: 1417 test_name = link.anchor.strip() 1418 test_info = tests_dict.get(test_name) 1419 if test_info: 1420 test_info['link_to_logs'] = link.url 1421 # Write the wmatrix link into the dict. 1422 if link in self._buildbot_links and link.testname: 1423 test_info['wmatrix_link'] \ 1424 = reporting_utils.link_retry_url(link.testname) 1425 # Write the bug url into the dict. 1426 if link.bug_id: 1427 test_info['bug_url'] = link.bug_url 1428 1429 # Write the suite timings into |output_dict| 1430 timings = self.timings 1431 if timings is not None: 1432 time_dict = output_dict.setdefault('suite_timings', {}) 1433 time_dict.update({ 1434 'download_start' : str(timings.download_start_time), 1435 'payload_download_end' : str(timings.payload_end_time), 1436 'suite_start' : str(timings.suite_start_time), 1437 'artifact_download_end' : str(timings.artifact_end_time), 1438 'tests_start' : str(timings.tests_start_time), 1439 'tests_end' : str(timings.tests_end_time), 1440 }) 1441 1442 output_dict['suite_job_id'] = self._suite_job_id 1443 1444 return output_dict 1445 1446 1447 def run(self): 1448 """Collect test results. 1449 1450 This method goes through the following steps: 1451 Fetch relevent test views of the suite job. 1452 Fetch test views of child jobs 1453 Check whether the suite was aborted. 1454 Generate links. 1455 Calculate suite timings. 1456 Compute return code based on the test result. 1457 1458 """ 1459 if self._solo_test_run: 1460 self._test_views, self.retry_count, self._missing_results = ( 1461 self._fetch_test_views_of_child_jobs( 1462 jobs=self._afe.get_jobs(id=self._suite_job_id))) 1463 else: 1464 self._child_views, self._retry_counts, self._missing_results = ( 1465 self._fetch_test_views_of_child_jobs()) 1466 self._suite_views = self._fetch_relevant_test_views_of_suite() 1467 self._test_views = self._suite_views + self._child_views 1468 # For hostless job in Starting status, there is no test view associated. 1469 # This can happen when a suite job in Starting status is aborted. When 1470 # the scheduler hits some limit, e.g., max_hostless_jobs_per_drone, 1471 # max_jobs_started_per_cycle, a suite job can stays in Starting status. 1472 if not self._test_views: 1473 self.return_code = RETURN_CODES.INFRA_FAILURE 1474 self.return_message = 'No test view was found.' 1475 return 1476 self.is_aborted = any([view['job_keyvals'].get('aborted_by') 1477 for view in self._suite_views]) 1478 self._generate_web_and_buildbot_links() 1479 self._record_timings() 1480 self._compute_return_code() 1481 1482 1483 def gather_timing_stats(self): 1484 """Collect timing related statistics.""" 1485 # Record suite runtime in metadata db. 1486 # Some failure modes can leave times unassigned, report sentinel value 1487 # in that case. 1488 runtime_in_secs = -1 1489 if (self.timings.tests_end_time is not None and 1490 self.timings.suite_start_time is not None): 1491 runtime_in_secs = (self.timings.tests_end_time - 1492 self.timings.suite_start_time).total_seconds() 1493 1494 job_overhead.record_suite_runtime(self._suite_job_id, self._suite_name, 1495 self._board, self._build, self._num_child_jobs, runtime_in_secs) 1496 1497 1498def _make_builds_from_options(options): 1499 """Create a dict of builds for creating a suite job. 1500 1501 The returned dict maps version label prefixes to build names. Together, 1502 each key-value pair describes a complete label. 1503 1504 @param options: SimpleNamespace from argument parsing. 1505 1506 @return: dict mapping version label prefixes to build names 1507 """ 1508 builds = {} 1509 if options.build: 1510 prefix = provision.get_version_label_prefix(options.build) 1511 builds[prefix] = options.build 1512 if options.firmware_rw_build: 1513 builds[provision.FW_RW_VERSION_PREFIX] = options.firmware_rw_build 1514 if options.firmware_ro_build: 1515 builds[provision.FW_RO_VERSION_PREFIX] = options.firmware_ro_build 1516 return builds 1517 1518 1519@retry.retry(error.StageControlFileFailure, timeout_min=10) 1520def create_suite(afe, options): 1521 """Create a suite with retries. 1522 1523 @param afe: The afe object to insert the new suite job into. 1524 @param options: The options to use in creating the suite. 1525 1526 @return: The afe_job_id of the new suite job. 1527 """ 1528 logging.info('%s Submitted create_suite_job rpc', 1529 diagnosis_utils.JobTimer.format_time(datetime.now())) 1530 return afe.run( 1531 'create_suite_job', 1532 name=options.name, 1533 board=options.board, 1534 builds=_make_builds_from_options(options), 1535 test_source_build=options.test_source_build, 1536 check_hosts=not options.no_wait, 1537 pool=options.pool, 1538 num=options.num, 1539 file_bugs=options.file_bugs, 1540 priority=options.priority, 1541 suite_args=options.suite_args, 1542 wait_for_results=not options.no_wait, 1543 timeout_mins=options.timeout_mins + options.delay_minutes, 1544 max_runtime_mins=options.max_runtime_mins + options.delay_minutes, 1545 job_retry=options.retry, 1546 max_retries=options.max_retries, 1547 suite_min_duts=options.suite_min_duts, 1548 offload_failures_only=options.offload_failures_only, 1549 run_prod_code=options.run_prod_code, 1550 delay_minutes=options.delay_minutes, 1551 job_keyvals=options.job_keyvals, 1552 test_args=options.test_args, 1553 ) 1554 1555 1556SuiteResult = namedtuple('SuiteResult', ['return_code', 'output_dict']) 1557 1558 1559def main_without_exception_handling(options): 1560 """ 1561 run_suite script without exception handling. 1562 1563 @param options: The parsed options. 1564 1565 @returns: A tuple contains the return_code of run_suite and the dictionary 1566 of the output. 1567 1568 """ 1569 # If indicate to use the new style suite control file, convert the args 1570 if options.use_suite_attr: 1571 options = change_options_for_suite_attr(options) 1572 1573 log_name = 'run_suite-default.log' 1574 if options.build: 1575 # convert build name from containing / to containing only _ 1576 log_name = 'run_suite-%s.log' % options.build.replace('/', '_') 1577 log_dir = os.path.join(common.autotest_dir, 'logs') 1578 if os.path.exists(log_dir): 1579 log_name = os.path.join(log_dir, log_name) 1580 1581 utils.setup_logging(logfile=log_name) 1582 1583 if not options.bypass_labstatus and not options.web: 1584 utils.check_lab_status(options.build) 1585 instance_server = (options.web if options.web else 1586 instance_for_pool(options.pool)) 1587 afe = frontend_wrappers.RetryingAFE(server=instance_server, 1588 timeout_min=options.afe_timeout_mins, 1589 delay_sec=options.delay_sec) 1590 logging.info('Autotest instance: %s', instance_server) 1591 1592 rpc_helper = diagnosis_utils.RPCHelper(afe) 1593 is_real_time = True 1594 if options.mock_job_id: 1595 job_id = int(options.mock_job_id) 1596 existing_job = afe.get_jobs(id=job_id, finished=True) 1597 if existing_job: 1598 is_real_time = False 1599 else: 1600 existing_job = afe.get_jobs(id=job_id) 1601 if existing_job: 1602 job_created_on = time_utils.date_string_to_epoch_time( 1603 existing_job[0].created_on) 1604 else: 1605 raise utils.TestLabException('Failed to retrieve job: %d' % job_id) 1606 else: 1607 try: 1608 rpc_helper.check_dut_availability(options.board, options.pool, 1609 options.minimum_duts, 1610 options.skip_duts_check) 1611 job_id = create_suite(afe, options) 1612 job_created_on = time.time() 1613 except diagnosis_utils.NotEnoughDutsError as e: 1614 e.add_suite_name(options.name) 1615 e.add_build(options.test_source_build) 1616 pool_health_bug = reporting.PoolHealthBug(e) 1617 bug_id = reporting.Reporter().report(pool_health_bug).bug_id 1618 if bug_id is not None: 1619 logging.info(annotations.StepLink( 1620 text='Pool Health Bug', 1621 url=reporting_utils.link_crbug(bug_id))) 1622 e.add_bug_id(bug_id) 1623 raise e 1624 except (error.CrosDynamicSuiteException, 1625 error.RPCException, proxy.JSONRPCException) as e: 1626 logging.exception('Error Message: %s', e) 1627 return (RETURN_CODES.INFRA_FAILURE, {'return_message': str(e)}) 1628 except AttributeError: 1629 return (RETURN_CODES.INVALID_OPTIONS, {}) 1630 1631 job_timer = diagnosis_utils.JobTimer( 1632 job_created_on, float(options.timeout_mins)) 1633 job_url = reporting_utils.link_job(job_id, 1634 instance_server=instance_server) 1635 logging.info('%s Created suite job: %s', 1636 job_timer.format_time(job_timer.job_created_time), 1637 job_url) 1638 logging.info(annotations.StepLink( 1639 text='Link to suite', 1640 url=job_url)) 1641 1642 if options.create_and_return: 1643 msg = '--create_and_return was specified, terminating now.' 1644 logging.info(msg) 1645 return (RETURN_CODES.OK, {'return_message':msg}) 1646 1647 if options.no_wait: 1648 return _handle_job_nowait(job_id, options, instance_server) 1649 else: 1650 return _handle_job_wait(afe, job_id, options, job_timer, is_real_time) 1651 1652 1653def _handle_job_wait(afe, job_id, options, job_timer, is_real_time): 1654 """Handle suite job synchronously. 1655 1656 @param afe AFE instance. 1657 @param job_id Suite job id. 1658 @param options Parsed options. 1659 @param job_timer JobTimer for suite job. 1660 @param is_real_time Whether or not to handle job timeout. 1661 1662 @return SuiteResult of suite job. 1663 """ 1664 code = RETURN_CODES.OK 1665 output_dict = {} 1666 rpc_helper = diagnosis_utils.RPCHelper(afe) 1667 instance_server = afe.server 1668 while not afe.get_jobs(id=job_id, finished=True): 1669 # Note that this call logs output, preventing buildbot's 1670 # 9000 second silent timeout from kicking in. Let there be no 1671 # doubt, this is a hack. The timeout is from upstream buildbot and 1672 # this is the easiest work around. 1673 if job_timer.first_past_halftime(): 1674 rpc_helper.diagnose_job(job_id, instance_server) 1675 if job_timer.debug_output_timer.poll(): 1676 logging.info('The suite job has another %s till timeout.', 1677 job_timer.timeout_hours - job_timer.elapsed_time()) 1678 time.sleep(10) 1679 logging.info('%s Suite job is finished.', 1680 diagnosis_utils.JobTimer.format_time(datetime.now())) 1681 # For most cases, ResultCollector should be able to determine whether 1682 # a suite has timed out by checking information in the test view. 1683 # However, occationally tko parser may fail on parsing the 1684 # job_finished time from the job's keyval file. So we add another 1685 # layer of timeout check in run_suite. We do the check right after 1686 # the suite finishes to make it as accurate as possible. 1687 # There is a minor race condition here where we might have aborted 1688 # for some reason other than a timeout, and the job_timer thinks 1689 # it's a timeout because of the jitter in waiting for results. 1690 # The consequence would be that run_suite exits with code 1691 # SUITE_TIMEOUT while it should have returned INFRA_FAILURE 1692 # instead, which should happen very rarely. 1693 # Note the timeout will have no sense when using -m option. 1694 is_suite_timeout = job_timer.is_suite_timeout() 1695 1696 # Extract the original suite name to record timing. 1697 original_suite_name = get_original_suite_name(options.name, 1698 options.suite_args) 1699 # Start collecting test results. 1700 logging.info('%s Start collectint test results and dump them to json.', 1701 diagnosis_utils.JobTimer.format_time(datetime.now())) 1702 TKO = frontend_wrappers.RetryingTKO(server=instance_server, 1703 timeout_min=options.afe_timeout_mins, 1704 delay_sec=options.delay_sec) 1705 collector = ResultCollector(instance_server=instance_server, 1706 afe=afe, tko=TKO, build=options.build, 1707 board=options.board, 1708 suite_name=options.name, 1709 suite_job_id=job_id, 1710 original_suite_name=original_suite_name) 1711 collector.run() 1712 # Dump test outputs into json. 1713 output_dict = collector.get_results_dict() 1714 output_dict['autotest_instance'] = instance_server 1715 if not options.json_dump: 1716 collector.output_results() 1717 code = collector.return_code 1718 return_message = collector.return_message 1719 if is_real_time: 1720 # Do not record stats if the suite was aborted (either by a user 1721 # or through the golo rpc). 1722 # Also do not record stats if is_aborted is None, indicating 1723 # aborting status is unknown yet. 1724 if collector.is_aborted == False: 1725 logging.info('%s Gathering timing stats for the suite job.', 1726 diagnosis_utils.JobTimer.format_time(datetime.now())) 1727 collector.gather_timing_stats() 1728 1729 if collector.is_aborted == True and is_suite_timeout: 1730 # There are two possible cases when a suite times out. 1731 # 1. the suite job was aborted due to timing out 1732 # 2. the suite job succeeded, but some child jobs 1733 # were already aborted before the suite job exited. 1734 # The case 2 was handled by ResultCollector, 1735 # here we handle case 1. 1736 old_code = code 1737 code = get_worse_code( 1738 code, RETURN_CODES.SUITE_TIMEOUT) 1739 if old_code != code: 1740 return_message = 'Suite job timed out.' 1741 logging.info('Upgrade return code from %s to %s ' 1742 'because suite job has timed out.', 1743 RETURN_CODES.get_string(old_code), 1744 RETURN_CODES.get_string(code)) 1745 1746 logging.info('\n %s Attempting to display pool info: %s', 1747 diagnosis_utils.JobTimer.format_time(datetime.now()), 1748 options.pool) 1749 try: 1750 # Add some jitter to make up for any latency in 1751 # aborting the suite or checking for results. 1752 cutoff = (job_timer.timeout_hours + 1753 timedelta(hours=0.3)) 1754 rpc_helper.diagnose_pool( 1755 options.board, options.pool, cutoff) 1756 except proxy.JSONRPCException: 1757 logging.warning('Unable to display pool info.') 1758 1759 # And output return message. 1760 if return_message: 1761 logging.info('Reason: %s', return_message) 1762 output_dict['return_message'] = return_message 1763 1764 logging.info('\n %s Output below this line is for buildbot consumption:', 1765 diagnosis_utils.JobTimer.format_time(datetime.now())) 1766 log_buildbot_links(logging.info, collector._buildbot_links) 1767 return SuiteResult(code, output_dict) 1768 1769 1770def _handle_job_nowait(job_id, options, instance_server): 1771 """Handle suite job asynchronously. 1772 1773 @param job_id Suite job id. 1774 @param options Parsed options. 1775 @param instance_server Autotest instance hostname. 1776 1777 @return SuiteResult of suite job. 1778 """ 1779 logging.info('Created suite job: %r', job_id) 1780 link = LogLink(options.name, instance_server, 1781 '%s-%s' % (job_id, getpass.getuser())) 1782 for generate_link in link.GenerateBuildbotLinks(): 1783 logging.info(generate_link) 1784 logging.info('--no_wait specified; Exiting.') 1785 return SuiteResult(RETURN_CODES.OK, 1786 {'return_message': '--no_wait specified; Exiting.'}) 1787 1788 1789def main(): 1790 """Entry point.""" 1791 utils.verify_not_root_user() 1792 1793 parser = make_parser() 1794 options = parser.parse_args() 1795 try: 1796 # Silence the log when dumping outputs into json 1797 if options.json_dump: 1798 logging.disable(logging.CRITICAL) 1799 1800 if not verify_options(options): 1801 parser.print_help() 1802 code = RETURN_CODES.INVALID_OPTIONS 1803 output_dict = {'return_code': RETURN_CODES.INVALID_OPTIONS} 1804 else: 1805 code, output_dict = main_without_exception_handling(options) 1806 except diagnosis_utils.BoardNotAvailableError as e: 1807 output_dict = {'return_message': 'Skipping testing: %s' % e.message} 1808 code = RETURN_CODES.BOARD_NOT_AVAILABLE 1809 logging.info(output_dict['return_message']) 1810 except utils.TestLabException as e: 1811 output_dict = {'return_message': 'TestLabException: %s' % e} 1812 code = RETURN_CODES.INFRA_FAILURE 1813 logging.exception(output_dict['return_message']) 1814 except Exception as e: 1815 output_dict = { 1816 'return_message': 'Unhandled run_suite exception: %s' % e 1817 } 1818 code = RETURN_CODES.INFRA_FAILURE 1819 logging.exception(output_dict['return_message']) 1820 1821 # Dump test outputs into json. 1822 output_dict['return_code'] = code 1823 if options.json_dump: 1824 output_json = json.dumps(output_dict, sort_keys=True) 1825 output_json_marked = '#JSON_START#%s#JSON_END#' % output_json.strip() 1826 sys.stdout.write(output_json_marked) 1827 1828 logging.info('Will return from run_suite with status: %s', 1829 RETURN_CODES.get_string(code)) 1830 return code 1831 1832 1833if __name__ == "__main__": 1834 sys.exit(main()) 1835