1#!/usr/bin/python
2#
3# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7
8"""Tool for running suites of tests and waiting for completion.
9
10The desired test suite will be scheduled with autotest. By default,
11this tool will block until the job is complete, printing a summary
12at the end.  Error conditions result in exceptions.
13
This is intended for use only with Chrome OS test suites that leverage the
dynamic suite infrastructure in server/cros/dynamic_suite.py.
16
17This script exits with one of the following codes:
180 - OK: Suite finished successfully
191 - ERROR: Test(s) failed, or hits its own timeout
202 - WARNING: Test(s) raised a warning or passed on retry, none failed/timed out.
213 - INFRA_FAILURE: Infrastructure related issues, e.g.
22    * Lab is down
23    * Too many duts (defined as a constant) in repair failed status
24    * Suite job issues, like bug in dynamic suite,
25      user aborted the suite, lose a drone/all devservers/rpc server,
26      0 tests ran, etc.
27    * provision failed
28      TODO(fdeng): crbug.com/413918, reexamine treating all provision
29                   failures as INFRA failures.
304 - SUITE_TIMEOUT: Suite timed out, some tests ran,
31    none failed by the time the suite job was aborted. This will cover,
32    but not limited to, the following cases:
33    * A devserver failure that manifests as a timeout
34    * No DUTs available midway through a suite
35    * Provision/Reset/Cleanup took longer time than expected for new image
36    * A regression in scheduler tick time.
5 - BOARD_NOT_AVAILABLE: If there is no host for the requested board/pool.
6 - INVALID_OPTIONS: If options are not valid.
39"""
40
41import argparse
42import ast
43from collections import namedtuple
44from datetime import datetime
45from datetime import timedelta
46import getpass
47import json
48import logging
49import os
50import re
51import sys
52import time
53
54import common
55from chromite.lib import buildbot_annotations as annotations
56
57from autotest_lib.client.common_lib import control_data
58from autotest_lib.client.common_lib import error
59from autotest_lib.client.common_lib import global_config, enum
60from autotest_lib.client.common_lib import priorities
61from autotest_lib.client.common_lib import time_utils
62from autotest_lib.client.common_lib.cros import retry
63from autotest_lib.frontend.afe.json_rpc import proxy
64from autotest_lib.server import utils
65from autotest_lib.server.cros import provision
66from autotest_lib.server.cros.dynamic_suite import constants
67from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
68from autotest_lib.server.cros.dynamic_suite import reporting
69from autotest_lib.server.cros.dynamic_suite import reporting_utils
70from autotest_lib.server.cros.dynamic_suite import tools
71from autotest_lib.site_utils import diagnosis_utils
72from autotest_lib.site_utils import job_overhead
73
# Shared handle on the global configuration object.
CONFIG = global_config.global_config

# Hostname from the SERVER config section; instance_for_pool() passes it as
# the default when looking up a pool's instance.
_DEFAULT_AUTOTEST_INSTANCE = CONFIG.get_config_value(
        'SERVER', 'hostname', type=str)
# %-format pattern for log URLs; LogLink fills it with (server, job_string).
_URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)

# Return code that will be sent back to autotest_rpc_server.py
# The names are listed in the same order as the exit codes documented in the
# module docstring.
RETURN_CODES = enum.Enum(
        'OK', 'ERROR', 'WARNING', 'INFRA_FAILURE', 'SUITE_TIMEOUT',
        'BOARD_NOT_AVAILABLE', 'INVALID_OPTIONS')
# The severity of return code. If multiple codes
# apply, the script should always return the severest one.
# E.g. if we have a test failure and the suite also timed out,
# we should return 'ERROR'.
# NOTE(review): BOARD_NOT_AVAILABLE and INVALID_OPTIONS have no entry here,
# so get_worse_code() raises KeyError if either is passed to it -- confirm
# that callers never compare those codes.
SEVERITY = {RETURN_CODES.OK: 0,
            RETURN_CODES.WARNING: 1,
            RETURN_CODES.SUITE_TIMEOUT: 2,
            RETURN_CODES.INFRA_FAILURE: 3,
            RETURN_CODES.ERROR: 4}
93
94
def get_worse_code(code1, code2):
    """Pick whichever of two return codes is the more severe.

    Severity is looked up in SEVERITY.  When both codes are equally
    severe, code1 is returned.

    @param code1: An enum value of RETURN_CODES
    @param code2: An enum value of RETURN_CODES

    @returns: code1 or code2, whichever ranks higher in SEVERITY.

    """
    first_rank = SEVERITY[code1]
    second_rank = SEVERITY[code2]
    if second_rank > first_rank:
        return code2
    return code1
105
106
def bool_str(x):
    """Argparse `type` callable for options that take 'True' or 'False'.

    Only the exact strings 'True' and 'False' are accepted.

    @param x: string representation of boolean value.

    @returns: True for 'True', False for 'False'.

    @raises argparse.ArgumentTypeError: if x is any other value.

    """
    if x == 'True':
        return True
    if x == 'False':
        return False
    raise argparse.ArgumentTypeError(
        '%s is not one of True or False' % (x,))
120
121
def _get_priority_value(x):
    """Coerce a command line priority argument to its int value.

    A priority may be given as an int, as a string holding an int, or as
    the name of a level known to priorities.Priority.  The int
    interpretation is tried first; the name lookup is the fallback.

    Intended to be used as an argparse `type` callable.

    @param x: priority value as an int, int string, or name string

    @returns: int value of priority

    @raises argparse.ArgumentTypeError: if x is neither an int nor a name
            for which priorities.Priority.get_value succeeds.
    """
    try:
        return int(x)
    except ValueError:
        # Not numeric; fall through and treat it as a priority name.
        pass

    try:
        return priorities.Priority.get_value(x)
    except AttributeError:
        known_names = ', '.join(priorities.Priority.names)
        raise argparse.ArgumentTypeError(
            'Unknown priority level %s.  Try one of %s.' % (x, known_names))
144
145
def make_parser():
    """Make ArgumentParser instance for run_suite.py.

    Several boolean-like options (--no_wait, --file_bugs, --retry,
    --offload_failures_only) take an explicit "True"/"False" value parsed
    by bool_str rather than being store_true flags; the comment on
    --no_wait below gives the reason.

    @returns: The configured argparse.ArgumentParser.
    """
    parser = argparse.ArgumentParser(
        usage="%(prog)s [options]")
    parser.add_argument("-b", "--board", dest="board")
    parser.add_argument("-i", "--build", dest="build")
    parser.add_argument(
        "-w", "--web", dest="web", default=None,
        help="Address of a webserver to receive suite requests.")
    parser.add_argument(
        '--firmware_rw_build', dest='firmware_rw_build', default=None,
        help='Firmware build to be installed in dut RW firmware.')
    parser.add_argument(
        '--firmware_ro_build', dest='firmware_ro_build', default=None,
        help='Firmware build to be installed in dut RO firmware.')
    parser.add_argument(
        '--test_source_build', dest='test_source_build', default=None,
        help=('Build that contains the test code, '
              'e.g., it can be the value of `--build`, '
              '`--firmware_rw_build` or `--firmware_ro_build` '
              'arguments. Default is None, that is, use the test '
              'code from `--build` (CrOS image)'))
    #  This should just be a boolean flag, but the autotest "proxy" code
    #  can't handle flags that don't take arguments.
    parser.add_argument(
        "-n", "--no_wait", dest="no_wait", default=False, type=bool_str,
        help='Must pass "True" or "False" if used.')
    # If you really want no pool, --pool="" will do it. USE WITH CARE.
    parser.add_argument("-p", "--pool", dest="pool", default="suites")
    parser.add_argument("-s", "--suite_name", dest="name")
    parser.add_argument("-a", "--afe_timeout_mins", type=int,
                        dest="afe_timeout_mins", default=30)
    parser.add_argument("-t", "--timeout_mins", type=int,
                        dest="timeout_mins", default=1440)
    parser.add_argument("-x", "--max_runtime_mins", type=int,
                        dest="max_runtime_mins", default=1440)
    parser.add_argument("-d", "--delay_sec", type=int,
                        dest="delay_sec", default=10)
    parser.add_argument("-m", "--mock_job_id", dest="mock_job_id",
                        help="Attach to existing job id for already running "
                        "suite, and creates report.")
    # NOTE(akeshet): This looks similar to --no_wait, but behaves differently.
    # --no_wait is passed in to the suite rpc itself and affects the suite,
    # while this does not.
    parser.add_argument("-c", "--create_and_return", dest="create_and_return",
                        action="store_true",
                        help="Create the suite and print the job id, then "
                        "finish immediately.")
    parser.add_argument("-u", "--num", dest="num", type=int, default=None,
                        help="Run on at most NUM machines.")
    #  Same boolean flag issue applies here.
    parser.add_argument(
        "-f", "--file_bugs", dest="file_bugs", default=False, type=bool_str,
        help=('File bugs on test failures. Must pass "True" or '
              '"False" if used.'))
    parser.add_argument("-l", "--bypass_labstatus", dest="bypass_labstatus",
                        action="store_true", help='Bypass lab status check.')
    # We allow either a number or a string for the priority.  This way, if you
    # know what you're doing, one can specify a custom priority level between
    # other levels.
    parser.add_argument("-r", "--priority", dest="priority",
                        type=_get_priority_value,
                        default=priorities.Priority.DEFAULT,
                        action="store",
                        help="Priority of suite. Either numerical value, or "
                        "one of (" + ", ".join(priorities.Priority.names)
                        + ").")
    parser.add_argument(
        '--retry', dest='retry', default=False, type=bool_str, action='store',
        help='Enable test retry.  Must pass "True" or "False" if used.')
    # The trailing space after 'retries' matters: the two literals are
    # concatenated into a single help sentence.
    parser.add_argument('--max_retries', dest='max_retries', default=None,
                        type=int, action='store', help='Maximum retries '
                        'allowed at suite level. No limit if not specified.')
    parser.add_argument('--minimum_duts', dest='minimum_duts', type=int,
                        default=0, action='store',
                        help='Check that the pool has at least such many '
                        'healthy machines, otherwise suite will not run. '
                        'Default to 0.')
    parser.add_argument('--suite_min_duts', dest='suite_min_duts', type=int,
                        default=0, action='store',
                        help='Preferred minimum number of machines. Scheduler '
                        'will prioritize on getting such many machines for '
                        'the suite when it is competing with another suite '
                        'that has a higher priority but already got minimum '
                        'machines it needs. Default to 0.')
    parser.add_argument("--suite_args", dest="suite_args",
                        default=None, action="store",
                        help="Argument string for suite control file.")
    parser.add_argument('--offload_failures_only',
                        dest='offload_failures_only', type=bool_str,
                        action='store', default=False,
                        help='Only enable gs_offloading for failed tests. '
                        'Successful tests will be deleted. Must pass "True"'
                        ' or "False" if used.')
    parser.add_argument('--use_suite_attr', dest='use_suite_attr',
                        action='store_true', default=False,
                        help='Advanced. Run the suite based on ATTRIBUTES of '
                        'control files, rather than SUITE.')
    parser.add_argument('--json_dump', dest='json_dump', action='store_true',
                        default=False,
                        help='Dump the output of run_suite to stdout.')
    parser.add_argument(
        '--run_prod_code', dest='run_prod_code',
        action='store_true', default=False,
        help='Run the test code that lives in prod aka the test '
        'code currently on the lab servers.')
    parser.add_argument(
        '--delay_minutes', type=int, default=0,
        help=('Delay the creation of test jobs for a given '
              'number of minutes. This argument can be used to '
              'force provision jobs being delayed, which helps '
              'to distribute loads across devservers.'))
    parser.add_argument(
        '--skip_duts_check', dest='skip_duts_check', action='store_true',
        default=False, help='If True, skip minimum available DUTs check')
    # The next two options are parsed with ast.literal_eval, so the value
    # given on the command line must be a Python literal (a dict, per the
    # help text).
    parser.add_argument(
        '--job_keyvals', dest='job_keyvals', type=ast.literal_eval,
        action='store', default=None,
        help='A dict of job keyvals to be inject to suite control file')
    parser.add_argument(
        '--test_args', dest='test_args', type=ast.literal_eval,
        action='store', default=None,
        help=('A dict of args passed all the way to each individual test that '
              'will be actually ran.'))
    return parser
271
272
def verify_options(options):
    """Verify the validity of options.

    A message describing the first problem found is printed to stdout.
    When verification passes and options.test_source_build was not given,
    it is set to options.build as a side effect.

    @param options: The parsed options to verify.

    @returns: True if verification passes, False otherwise.

    """
    # build, board and suite name are always required; attaching to an
    # existing job with -m only changes the message shown when one is missing.
    if options.mock_job_id and (
            not options.build or not options.name or not options.board):
        print('When using -m, need to specify build, board and suite '
              'name which you have used for creating the original job')
        return False
    if not options.build:
        print('Need to specify which build to use')
        return False
    if not options.board:
        print('Need to specify board')
        return False
    if not options.name:
        print('Need to specify suite name')
        return False

    if options.num is not None and options.num < 1:
        print('Number of machines must be more than 0, if specified.')
        return False
    if not options.retry and options.max_retries is not None:
        print('max_retries can only be used with --retry=True')
        return False
    if options.use_suite_attr and options.suite_args is not None:
        print('The new suite control file cannot parse the suite_args: %s. '
              'Please not specify any suite_args here.' % options.suite_args)
        return False
    if options.no_wait and options.retry:
        # Only a warning: this combination does not fail verification.
        print('Test retry is not available when using --no_wait=True')
    # Default to use the test code in CrOS build.
    if not options.test_source_build and options.build:
        options.test_source_build = options.build
    return True
312
313
def change_options_for_suite_attr(options):
    """Change options to be prepared to run the suite_attr_wrapper.

    If specify 'use_suite_attr' from the cmd line, it indicates to run the
    new style suite control file, suite_attr_wrapper. Then, change the
    options.name to 'suite_attr_wrapper', change the options.suite_args to
    include the arguments needed by suite_attr_wrapper.

    Any previous value of options.suite_args is overwritten.

    @param options: The verified options.  options.name is either a single
                    suite name string or an iterable of suite names.

    @returns: The changed options (the same object that was passed in).

    """
    # Convert the suite_name to attribute boolean expression.
    # isinstance (rather than an exact type check) keeps str subclasses from
    # being iterated character by character.
    if isinstance(options.name, str):
        attr_filter_val = 'suite:%s' % options.name
    else:
        attr_filter_val = ' or '.join('suite:%s' % x for x in options.name)

    # suite_attr_wrapper receives its arguments as the string form of a dict.
    options.suite_args = str({'attr_filter': attr_filter_val})
    options.name = 'suite_attr_wrapper'

    return options
341
342
class TestResult(object):

    """Holds the outcome of a single TestView for reporting."""

    # Statuses that get their own label; every other status is shown as
    # failed.
    _PRETTY_STATUS_MAP = {
        'GOOD':    '[ PASSED ]',
        'TEST_NA': '[  INFO  ]',
    }

    def __init__(self, test_view, retry_count=0):
        """Capture name, status and reason from a test view.

        @param test_view: TestView instance.
        @param retry_count: Retry count for test.  Optional.
        """
        self.name = test_view.get_testname()
        self.status = test_view['status']
        self.reason = test_view['reason']
        self.retry_count = retry_count

    @property
    def _pretty_status(self):
        """Bracketed label for the status, '[ FAILED ]' if unmapped."""
        label = self._PRETTY_STATUS_MAP.get(self.status, '[ FAILED ]')
        return label

    def log_using(self, log_function, name_column_width):
        """Emit this result through the supplied log function.

        The first line is always the padded name plus status label.  A
        status/reason line follows for anything other than 'GOOD', and a
        retry_count line follows when the test was retried.

        @param log_function: Log function to use.  Example: logging.info
        @param name_column_width: Width of name column for formatting.
        """
        name_cell = self.name.ljust(name_column_width)
        log_function('%s%s', name_cell, self._pretty_status)

        passed = self.status == 'GOOD'
        if not passed:
            log_function('%s  %s: %s', name_cell, self.status, self.reason)

        was_retried = self.retry_count > 0
        if was_retried:
            log_function('%s  retry_count: %s', name_cell, self.retry_count)
380
381
def get_original_suite_name(suite_name, suite_args):
    """Recover the real suite name behind a suite_attr_wrapper run.

    For any suite other than 'suite_attr_wrapper' the name is returned as
    is.  For 'suite_attr_wrapper', suite_args is evaluated as a Python
    literal and its 'attr_filter' entry is split on parentheses and spaces;
    the first 'suite:<name>' token found supplies the result.

    @param suite_name: the name of the suite launched in afe. When it is
                       suite_attr_wrapper, the suite that actually running is
                       specified in the suite_args.
    @param suite_args: the parsed option which contains the original suite name.

    @returns: the original suite name, or suite_name itself when no
              'suite:' token is present.

    """
    if suite_name != 'suite_attr_wrapper':
        return suite_name

    marker = 'suite:'
    attr_filter = ast.literal_eval(suite_args).get('attr_filter', '')
    for token in re.split('[() ]', attr_filter):
        if token and token.startswith(marker):
            return token[len(marker):]
    return suite_name
399
400
class LogLink(object):
    """A link to be recorded in the logs for one job.

    Every instance carries a link to the job's log files.  When bug
    information is supplied at construction time, a link to that bug can
    be produced as well.

    @var anchor       The link text.
    @var url          The url of the job's logs.
    @var reason       Failure reason string, if any.
    @var retry_count  How many times the test has been retried.
    @var testname     Name of the test, if supplied.
    @var bug_id       Id of a bug to link to, or None.
    @var bug_count    Report count that came with the bug info, or None.
    """

    # A list of tests that don't get retried so skip the dashboard.
    _SKIP_RETRY_DASHBOARD = ['provision']

    _BUG_LINK_PREFIX = 'Auto-Bug'
    _LOG_LINK_PREFIX = 'Test-Logs'


    @classmethod
    def get_bug_link(cls, bug_id):
        """Build the crbug link for a bug id.

        @param bug_id: The id of the bug.
        @return: A link, eg: https://crbug.com/<bug_id>.
        """
        return reporting_utils.link_crbug(bug_id)


    def __init__(self, anchor, server, job_string, bug_info=None, reason=None,
                 retry_count=0, testname=None):
        """Store the link details and build the log URL.

        @param anchor      The link text.
        @param server      The hostname of the server this suite ran on.
        @param job_string  The job whose logs we'd like to link to.
        @param bug_info    Info about the bug, if one was filed.  Unpacked
                           as a (bug_id, bug_count) pair.
        @param reason      A string representing the reason of failure if any.
        @param retry_count How many times the test has been retried.
        @param testname    Optional Arg that supplies the testname.
        """
        self.anchor = anchor
        self.url = _URL_PATTERN % (server, job_string)
        self.reason = reason
        self.retry_count = retry_count
        self.testname = testname
        self.bug_id, self.bug_count = bug_info if bug_info else (None, None)


    @property
    def bug_url(self):
        """URL of associated bug, or None when there is no bug id."""
        if not self.bug_id:
            return None
        return reporting_utils.link_crbug(self.bug_id)


    @property
    def _bug_count_text(self):
        """Return bug count as human friendly text."""
        count = self.bug_count
        if count is None:
            return 'unknown number of reports'
        if count == 1:
            return 'new report'
        return '%s reports' % count


    def GenerateBuildbotLinks(self):
        """Generate links formatted to meet buildbot expectations.

        If there is a bug associated with this link, a link to the bug is
        produced first, followed by a link to the job logs; otherwise only
        the link to the job logs is produced.

        This is a generator: the links are yielded one at a time.

        @return Links formatted for the buildbot log annotator.
        """
        # Details shared by both anchors: retry count first, then reason.
        details = []
        if self.retry_count > 0:
            details.append('retry_count: %d' % self.retry_count)
        if self.reason:
            details.append(self.reason)

        # The bug anchor additionally carries the bug count text.
        if self.bug_url:
            bug_details = details + [self._bug_count_text]
            bug_anchor = self._format_anchor_text(self._BUG_LINK_PREFIX,
                                                  bug_details)
            yield annotations.StepLink(bug_anchor, self.bug_url)

        log_anchor = self._format_anchor_text(self._LOG_LINK_PREFIX, details)
        yield annotations.StepLink(log_anchor, self.url)


    def _format_anchor_text(self, prefix, info_strings):
        """Compose anchor text from a prefix, the anchor and extra info.

        @param prefix        The prefix of the anchor text.
        @param info_strings  The infos presented in the anchor text.
        @return A anchor_text with the right prefix and info strings.
        """
        pieces = ['[{prefix}]: {anchor}'.format(
            prefix=prefix,
            anchor=self.anchor.strip())]
        if info_strings:
            pieces.append(', '.join(info_strings))
        return ': '.join(pieces)

    @property
    def text_link(self):
        """Link to the job's logs, for consumption by a human.

        @return A link formatted for human readability.
        """
        return '%s %s' % (self.anchor, self.url)


    def GenerateWmatrixRetryLink(self):
        """Generate a link to the wmatrix retry dashboard.

        @return A link formatted for the buildbot log annotator, or None
                when there is no testname or the test is listed in
                _SKIP_RETRY_DASHBOARD.
        """
        name = self.testname
        if not name or name in self._SKIP_RETRY_DASHBOARD:
            return None
        return annotations.StepLink(
            text='[Flake-Dashboard]: %s' % name,
            url=reporting_utils.link_retry_url(name))
543
544
class Timings(object):
    """Collects timestamps of important events during a suite run.

    All timestamps are datetime.datetime objects.

    @var suite_job_id: the afe job id of the suite job for which
                       we are recording the timing for.
    @var download_start_time: the time the devserver starts staging
                              the build artifacts. Recorded in create_suite_job.
    @var payload_end_time: the time when the artifacts only necessary to start
                           installing images onto DUT's are staged.
                           Recorded in create_suite_job.
    @var artifact_end_time: the remaining artifacts are downloaded after we kick
                            off the reimaging job, at which point we record
                            artifact_end_time. Recorded in dynamic_suite.py.
    @var suite_start_time: the time the suite started.
    @var tests_start_time: the time the first test started running.
    @var tests_end_time: the time the last test finished running.
    """

    def __init__(self, suite_job_id):
        """Start with every timestamp unset.

        @param suite_job_id: the afe job id of the suite job.
        """
        self.suite_job_id = suite_job_id

        # Timings related to staging artifacts on devserver.
        self.download_start_time = None
        self.payload_end_time = None
        self.artifact_end_time = None

        # The test_start_time, but taken off the view that corresponds to the
        # suite instead of an individual test.
        self.suite_start_time = None

        # Earliest and Latest tests in the set of TestViews passed to us.
        self.tests_start_time = None
        self.tests_end_time = None


    def RecordTiming(self, view):
        """Pull the timestamps of interest out of one test report view.

        Any single entry returned by get_detailed_test_views() may be
        passed in.  The suite job's own view sets suite_start_time; every
        other view may widen the [tests_start_time, tests_end_time] range.
        A view belonging to the suite job that carries job keyvals also
        supplies the artifact staging times.

        If timestamps are unavailable, datetime.datetime.min/max will be used.

        @param view: A TestView object.
        """
        started = view['test_started_time']
        finished = view['test_finished_time']
        start_candidate = (time_utils.time_string_to_datetime(started)
                           if started else datetime.min)
        end_candidate = (time_utils.time_string_to_datetime(finished)
                         if finished else datetime.max)

        if view.get_testname() == TestView.SUITE_JOB:
            self.suite_start_time = start_candidate
        else:
            self._UpdateFirstTestStartTime(start_candidate)
            self._UpdateLastTestEndTime(end_candidate)

        if view['afe_job_id'] != self.suite_job_id:
            return
        if 'job_keyvals' not in view:
            return

        keyvals = view['job_keyvals']

        def keyval_time(key):
            """Convert the keyval stored under key to a datetime."""
            return time_utils.time_string_to_datetime(
                    keyvals.get(key), handle_type_error=True)

        self.download_start_time = keyval_time(constants.DOWNLOAD_STARTED_TIME)
        self.payload_end_time = keyval_time(constants.PAYLOAD_FINISHED_TIME)
        self.artifact_end_time = keyval_time(constants.ARTIFACT_FINISHED_TIME)


    def _UpdateFirstTestStartTime(self, candidate):
        """Update self.tests_start_time, iff candidate is an earlier time.

        An unset tests_start_time is always replaced.

        @param candidate: a datetime.datetime object.
        """
        current = self.tests_start_time
        if current and not candidate < current:
            return
        self.tests_start_time = candidate


    def _UpdateLastTestEndTime(self, candidate):
        """Update self.tests_end_time, iff candidate is a later time.

        An unset tests_end_time is always replaced.

        @param candidate: a datetime.datetime object.
        """
        current = self.tests_end_time
        if current and not candidate > current:
            return
        self.tests_end_time = candidate


    def __str__(self):
        """Multi-line, human readable summary of all recorded timings."""
        rows = (
            ('Downloads started at', self.download_start_time),
            ('Payload downloads ended at', self.payload_end_time),
            ('Suite started at', self.suite_start_time),
            ('Artifact downloads ended (at latest) at',
             self.artifact_end_time),
            ('Testing started at', self.tests_start_time),
            ('Testing ended at', self.tests_end_time),
        )
        body = ''.join('%s %s\n' % row for row in rows)
        return '\nSuite timings:\n' + body
654
655
def instance_for_pool(pool_name):
    """Look up the server that should service a suite for a pool.

    The pool name is used as the key into the POOL_INSTANCE_SHARDING
    config section, with _DEFAULT_AUTOTEST_INSTANCE passed as the default.

    @param pool_name: The pool (without 'pool:') to schedule the suite
                      against.
    @return: The correct host that should be used to service this suite run.
    """
    sharding_section = 'POOL_INSTANCE_SHARDING'
    instance = CONFIG.get_config_value(
            sharding_section, pool_name,
            default=_DEFAULT_AUTOTEST_INSTANCE)
    return instance
667
668
669class TestView(object):
670    """Represents a test view and provides a set of helper functions."""
671
672
673    SUITE_JOB = 'Suite job'
674    INFRA_TESTS = ['provision']
675
676
677    def __init__(self, view, afe_job, suite_name, build, user,
678                 solo_test_run=False):
679        """Init a TestView object representing a tko test view.
680
681        @param view: A dictionary representing a tko test view.
682        @param afe_job: An instance of frontend.afe.models.Job
683                        representing the job that kicked off the test.
684        @param suite_name: The name of the suite
685                           that the test belongs to.
686        @param build: The build for which the test is run.
687        @param user: The user for which the test is run.
688        @param solo_test_run: This is a solo test run not part of a suite.
689        """
690        self.view = view
691        self.afe_job = afe_job
692        self.suite_name = suite_name
693        self.build = build
694        self.is_suite_view = afe_job.parent_job is None and not solo_test_run
695        # This is the test name that will be shown in the output.
696        self.testname = None
697        self.user = user
698
699        # The case that a job was aborted before it got a chance to run
700        # usually indicates suite has timed out (unless aborted by user).
701        # In this case, the abort reason will be None.
702        # Update the reason with proper information.
703        if (self.is_relevant_suite_view() and
704                not self.get_testname() == self.SUITE_JOB and
705                self.view['status'] == 'ABORT' and
706                not self.view['reason']):
707            self.view['reason'] = 'Timed out, did not run.'
708
709
710    def __getitem__(self, key):
711        """Overload __getitem__ so that we can still use []
712
713        @param key: A key of the tko test view.
714
715        @returns: The value of an attribute in the view.
716
717        """
718        return self.view[key]
719
720
721    def __iter__(self):
722        """Overload __iter__ so that it supports 'in' operator."""
723        return iter(self.view)
724
725
726    def get_testname(self):
727        """Get test name that should be shown in the output.
728
729        Formalize the test_name we got from the test view.
730
731        Remove 'build/suite' prefix if any. And append 'experimental' prefix
732        for experimental tests if their names do not start with 'experimental'.
733
734        If one runs a test in control file via the following code,
735           job.runtest('my_Test', tag='tag')
736        for most of the cases, view['test_name'] would look like 'my_Test.tag'.
737        If this is the case, this method will just return the original
738        test name, i.e. 'my_Test.tag'.
739
740        There are four special cases.
741        1) A test view is for the suite job's SERVER_JOB.
742           In this case, this method will return 'Suite job'.
743
744        2) A test view is of a child job or a solo test run not part of a
745           suite, and for a SERVER_JOB or CLIENT_JOB.
746           In this case, we will take the job name, remove the build/suite
747           prefix from the job name, and append the rest to 'SERVER_JOB'
748           or 'CLIENT_JOB' as a prefix. So the names returned by this
749           method will look like:
750             'experimental_Telemetry Smoothness Measurement_SERVER_JOB'
751             'experimental_dummy_Pass_SERVER_JOB'
752             'dummy_Fail_SERVER_JOB'
753
754        3) A test view is of a suite job and its status is ABORT.
755           In this case, the view['test_name'] is the child job's name.
756           If it is an experimental test, 'experimental' will be part
757           of the name. For instance,
758             'lumpy-release/R35-5712.0.0/perf_v2/
759                   experimental_Telemetry Smoothness Measurement'
760             'lumpy-release/R35-5712.0.0/dummy/experimental_dummy_Pass'
761             'lumpy-release/R35-5712.0.0/dummy/dummy_Fail'
762           The above names will be converted to the following:
763             'experimental_Telemetry Smoothness Measurement'
764             'experimental_dummy_Pass'
765             'dummy_Fail'
766
767        4) A test view's status is of a suite job and its status is TEST_NA.
768           In this case, the view['test_name'] is the NAME field of the control
769           file. If it is an experimental test, 'experimental' will part of
770           the name. For instance,
771             'experimental_Telemetry Smoothness Measurement'
772             'experimental_dummy_Pass'
773             'dummy_Fail'
774           This method will not modify these names.
775
776        @returns: Test name after normalization.
777
778        """
779        if self.testname is not None:
780            return self.testname
781
782        if (self.is_suite_view and
783                self.view['test_name'].startswith('SERVER_JOB')):
784            # Rename suite job's SERVER_JOB to 'Suite job'.
785            self.testname = self.SUITE_JOB
786            return self.testname
787
788        if (self.view['test_name'].startswith('SERVER_JOB') or
789                self.view['test_name'].startswith('CLIENT_JOB')):
790            # Append job name as a prefix for SERVER_JOB and CLIENT_JOB
791            testname= '%s_%s' % (self.view['job_name'], self.view['test_name'])
792        else:
793            testname = self.view['test_name']
794        experimental =  self.is_experimental()
795        # Remove the build and suite name from testname if any.
796        testname = tools.get_test_name(
797                self.build, self.suite_name, testname)
798        # If an experimental test was aborted, testname
799        # would include the 'experimental' prefix already.
800        prefix = constants.EXPERIMENTAL_PREFIX if (
801                experimental and not
802                testname.startswith(constants.EXPERIMENTAL_PREFIX)) else ''
803        self.testname = prefix + testname
804        return self.testname
805
806
807    def is_relevant_suite_view(self):
808        """Checks whether this is a suite view we should care about.
809
810        @returns: True if it is relevant. False otherwise.
811        """
812        return (self.get_testname() == self.SUITE_JOB or
813                (self.is_suite_view and
814                    not self.view['test_name'].startswith('CLIENT_JOB') and
815                    not self.view['subdir']))
816
817
818    def is_test(self):
819        """Return whether the view is for an actual test.
820
821        @returns True if the view is for an actual test.
822                 False if the view is for SERVER_JOB or CLIENT_JOB.
823
824        """
825        return not (self.view['test_name'].startswith('SERVER_JOB') or
826                self.view['test_name'].startswith('CLIENT_JOB'))
827
828
829    def is_retry(self):
830        """Check whether the view is for a retry.
831
832        @returns: True, if the view is for a retry; False otherwise.
833
834        """
835        return self.view['job_keyvals'].get('retry_original_job_id') is not None
836
837
838    def is_experimental(self):
839        """Check whether a test view is for an experimental test.
840
841        @returns: True if it is for an experimental test, False otherwise.
842
843        """
844        return (self.view['job_keyvals'].get('experimental') == 'True' or
845                tools.get_test_name(self.build, self.suite_name,
846                        self.view['test_name']).startswith('experimental'))
847
848
849    def hit_timeout(self):
850        """Check whether the corresponding job has hit its own timeout.
851
852        Note this method should not be called for those test views
853        that belongs to a suite job and are determined as irrelevant
854        by is_relevant_suite_view.  This is because they are associated
855        to the suite job, whose job start/finished time make no sense
856        to an irrelevant test view.
857
858        @returns: True if the corresponding afe job has hit timeout.
859                  False otherwise.
860        """
861        if (self.is_relevant_suite_view() and
862                self.get_testname() != self.SUITE_JOB):
863            # Any relevant suite test view except SUITE_JOB
864            # did not hit its own timeout because it was not ever run.
865            return False
866        start = (datetime.strptime(
867                self.view['job_started_time'], time_utils.TIME_FMT)
868                if self.view['job_started_time'] else None)
869        end = (datetime.strptime(
870                self.view['job_finished_time'], time_utils.TIME_FMT)
871                if self.view['job_finished_time'] else None)
872        if not start or not end:
873            return False
874        else:
875            return ((end - start).total_seconds()/60.0
876                        > self.afe_job.max_runtime_mins)
877
878
879    def is_aborted(self):
880        """Check if the view was aborted.
881
882        For suite job and child job test views, we check job keyval
883        'aborted_by' and test status.
884
885        For relevant suite job test views, we only check test status
886        because the suite job keyval won't make sense to individual
887        test views.
888
889        @returns: True if the test was as aborted, False otherwise.
890
891        """
892
893        if (self.is_relevant_suite_view() and
894                self.get_testname() != self.SUITE_JOB):
895            return self.view['status'] == 'ABORT'
896        else:
897            return (bool(self.view['job_keyvals'].get('aborted_by')) and
898                    self.view['status'] in ['ABORT', 'RUNNING'])
899
900
901    def is_in_fail_status(self):
902        """Check if the given test's status corresponds to a failure.
903
904        @returns: True if the test's status is FAIL or ERROR. False otherwise.
905
906        """
907        # All the statuses tests can have when they fail.
908        return self.view['status'] in ['FAIL', 'ERROR', 'ABORT']
909
910
911    def is_infra_test(self):
912        """Check whether this is a test that only lab infra is concerned.
913
914        @returns: True if only lab infra is concerned, False otherwise.
915
916        """
917        return self.get_testname() in self.INFRA_TESTS
918
919
920    def get_buildbot_link_reason(self):
921        """Generate the buildbot link reason for the test.
922
923        @returns: A string representing the reason.
924
925        """
926        return ('%s: %s' % (self.view['status'], self.view['reason'])
927                if self.view['reason'] else self.view['status'])
928
929
930    def get_job_id_owner_str(self):
931        """Generate the job_id_owner string for a test.
932
933        @returns: A string which looks like 135036-username
934
935        """
936        return '%s-%s' % (self.view['afe_job_id'], self.user)
937
938
939    def get_bug_info(self, suite_job_keyvals):
940        """Get the bug info from suite_job_keyvals.
941
942        If a bug has been filed for the test, its bug info (bug id and counts)
943        will be stored in the suite job's keyvals. This method attempts to
944        retrieve bug info of the test from |suite_job_keyvals|. It will return
945        None if no bug info is found. No need to check bug info if the view is
946        SUITE_JOB.
947
948        @param suite_job_keyvals: The job keyval dictionary of the suite job.
949                All the bug info about child jobs are stored in
950                suite job's keyvals.
951
952        @returns: None if there is no bug info, or a pair with the
953                  id of the bug, and the count of the number of
954                  times the bug has been seen.
955
956        """
957        if self.get_testname() == self.SUITE_JOB:
958            return None
959        if (self.view['test_name'].startswith('SERVER_JOB') or
960                self.view['test_name'].startswith('CLIENT_JOB')):
961            # Append job name as a prefix for SERVER_JOB and CLIENT_JOB
962            testname= '%s_%s' % (self.view['job_name'], self.view['test_name'])
963        else:
964            testname = self.view['test_name']
965
966        return tools.get_test_failure_bug_info(
967                suite_job_keyvals, self.view['afe_job_id'],
968                testname)
969
970
971    def should_display_buildbot_link(self):
972        """Check whether a buildbot link should show for this view.
973
974        For suite job view, show buildbot link if it fails.
975        For normal test view,
976            show buildbot link if it is a retry
977            show buildbot link if it hits its own timeout.
978            show buildbot link if it fails. This doesn't
979            include the case where it was aborted but has
980            not hit its own timeout (most likely it was aborted because
981            suite has timed out).
982
983        @returns: True if we should show the buildbot link.
984                  False otherwise.
985        """
986        is_bad_status = (self.view['status'] != 'GOOD' and
987                         self.view['status'] != 'TEST_NA')
988        if self.get_testname() == self.SUITE_JOB:
989            return is_bad_status
990        else:
991            if self.is_retry():
992                return True
993            if is_bad_status:
994                return not self.is_aborted() or self.hit_timeout()
995
996
997    def get_control_file_attributes(self):
998        """Get the attributes from the control file of the test.
999
1000        @returns: A list of test attribute or None.
1001        """
1002        control_file = self.afe_job.control_file
1003        attributes = None
1004        if control_file:
1005            cd = control_data.parse_control_string(control_file)
1006            attributes = list(cd.attributes)
1007        return attributes
1008
1009
1010    def override_afe_job_id(self, afe_job_id):
1011        """Overrides the AFE job id for the test.
1012
1013        @param afe_job_id: The new AFE job id to use.
1014        """
1015        self.view['afe_job_id'] = afe_job_id
1016
1017
def log_buildbot_links(log_func, links):
    """Send the buildbot annotations of every link to a logging function.

    For each link, all of its generated buildbot links are logged first,
    followed by its wmatrix retry link when it has one.

    @param log_func: Logging function to use.
    @param links: Iterable of LogLink instances.
    """
    for log_link in links:
        for annotation in log_link.GenerateBuildbotLinks():
            log_func(annotation)
        retry_annotation = log_link.GenerateWmatrixRetryLink()
        if not retry_annotation:
            continue
        log_func(retry_annotation)
1030
1031
1032class ResultCollector(object):
1033    """Collect test results of a suite or a single test run.
1034
1035    Once a suite job has finished, use this class to collect test results.
1036    `run` is the core method that is to be called first. Then the caller
1037    could retrieve information like return code, return message, is_aborted,
1038    and timings by accessing the collector's public attributes. And output
1039    the test results and links by calling the 'output_*' methods.
1040
1041    Here is a overview of what `run` method does.
1042
1043    1) Collect the suite job's results from tko_test_view_2.
1044    For the suite job, we only pull test views without a 'subdir'.
1045    A NULL subdir indicates that the test was _not_ executed. This could be
1046    that no child job was scheduled for this test or the child job got
1047    aborted before starts running.
1048    (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
1049
1050    2) Collect the child jobs' results from tko_test_view_2.
1051    For child jobs, we pull all the test views associated with them.
1052    (Note 'SERVER_JOB'/'CLIENT_JOB' are handled specially)
1053
1054    3) Generate web and buildbot links.
1055    4) Compute timings of the suite run.
1056    5) Compute the return code based on test results.
1057
1058    @var _instance_server: The hostname of the server that is used
1059                           to service the suite.
1060    @var _afe: The afe rpc client.
1061    @var _tko: The tko rpc client.
1062    @var _build: The build for which the suite is run,
1063                 e.g. 'lumpy-release/R35-5712.0.0'
1064    @var _board: The target board for which the suite is run,
1065                 e.g., 'lumpy', 'link'.
1066    @var _suite_name: The suite name, e.g. 'bvt', 'dummy'.
1067    @var _suite_job_id: The job id of the suite for which we are going to
1068                        collect results.
1069    @var _original_suite_name: The suite name we record timing would be
1070                               different from _suite_name when running
1071                               suite_attr_wrapper.
1072    @var _suite_views: A list of TestView objects, representing relevant
1073                       test views of the suite job.
1074    @var _child_views: A list of TestView objects, representing test views
1075                       of the child jobs.
1076    @var _test_views: A list of TestView objects, representing all test views
1077                      from _suite_views and _child_views.
1078    @var _web_links: A list of web links pointing to the results of jobs.
1079    @var _buildbot_links: A list of buildbot links for non-passing tests.
1080    @var _solo_test_run: True if this is a single test run.
1081    @var return_code: The exit code that should be returned by run_suite.
1082    @var return_message: Any message that should be displayed to explain
1083                         the return code.
1084    @var is_aborted: Whether the suite was aborted or not.
1085                     True, False or None (aborting status is unknown yet)
1086    @var timings: A Timing object that records the suite's timings.
1087
1088    """
1089
1090
1091    def __init__(self, instance_server, afe, tko, build, board,
1092                 suite_name, suite_job_id, original_suite_name=None,
1093                 user=None, solo_test_run=False):
1094        self._instance_server = instance_server
1095        self._afe = afe
1096        self._tko = tko
1097        self._build = build
1098        self._board = board
1099        self._suite_name = suite_name
1100        self._suite_job_id = suite_job_id
1101        self._original_suite_name = original_suite_name or suite_name
1102        self._suite_views = []
1103        self._child_views = []
1104        self._test_views = []
1105        self._retry_counts = {}
1106        self._missing_results = {}
1107        self._web_links = []
1108        self._buildbot_links = []
1109        self._num_child_jobs = 0
1110        self.return_code = None
1111        self.return_message = ''
1112        self.is_aborted = None
1113        self.timings = None
1114        self._user = user or getpass.getuser()
1115        self._solo_test_run = solo_test_run
1116
1117
1118    @property
1119    def buildbot_links(self):
1120        """Provide public access to buildbot links."""
1121        return self._buildbot_links
1122
1123
1124    def _fetch_relevant_test_views_of_suite(self):
1125        """Fetch relevant test views of the suite job.
1126
1127        For the suite job, there will be a test view for SERVER_JOB, and views
1128        for results of its child jobs. For example, assume we've created
1129        a suite job (afe_job_id: 40) that runs dummy_Pass, dummy_Fail,
1130        dummy_Pass.bluetooth. Assume dummy_Pass was aborted before running while
1131        dummy_Path.bluetooth got TEST_NA as no duts have bluetooth.
1132        So the suite job's test views would look like
1133        _____________________________________________________________________
1134        test_idx| job_idx|test_name           |subdir      |afe_job_id|status
1135        10      | 1000   |SERVER_JOB          |----        |40        |GOOD
1136        11      | 1000   |dummy_Pass          |NULL        |40        |ABORT
1137        12      | 1000   |dummy_Fail.Fail     |41-onwer/...|40        |FAIL
1138        13      | 1000   |dummy_Fail.Error    |42-owner/...|40        |ERROR
1139        14      | 1000   |dummy_Pass.bluetooth|NULL        |40        |TEST_NA
1140
1141        For a suite job, we only care about
1142        a) The test view for the suite job's SERVER_JOB
1143        b) The test views for real tests without a subdir. A NULL subdir
1144           indicates that a test didn't get executed.
1145        So, for the above example, we only keep test views whose test_idxs
1146        are 10, 11, 14.
1147
1148        @returns: A list of TestView objects, representing relevant
1149                  test views of the suite job.
1150
1151        """
1152        suite_job = self._afe.get_jobs(id=self._suite_job_id)[0]
1153        views = self._tko.run(call='get_detailed_test_views',
1154                              afe_job_id=self._suite_job_id)
1155        relevant_views = []
1156        for v in views:
1157            v = TestView(v, suite_job, self._suite_name, self._build, self._user,
1158                         solo_test_run=self._solo_test_run)
1159            if v.is_relevant_suite_view():
1160                # If the test doesn't have results in TKO and is being
1161                # displayed in the suite view instead of the child view,
1162                # then afe_job_id is incorrect and from the suite.
1163                # Override it based on the AFE job id which was missing
1164                # results.
1165                # TODO: This is likely inaccurate if a test has multiple
1166                # tries which all fail TKO parse stage.
1167                if v['test_name'] in self._missing_results:
1168                    v.override_afe_job_id(
1169                            self._missing_results[v['test_name']][0])
1170                relevant_views.append(v)
1171        return relevant_views
1172
1173
1174    def _compute_retry_count(self, view):
1175        """Return how many times the test has been retried.
1176
1177        @param view: A TestView instance.
1178        @returns: An int value indicating the retry count.
1179
1180        """
1181        old_job = view['job_keyvals'].get('retry_original_job_id')
1182        count = 0
1183        while old_job:
1184            count += 1
1185            views = self._tko.run(
1186                call='get_detailed_test_views', afe_job_id=old_job)
1187            old_job = (views[0]['job_keyvals'].get('retry_original_job_id')
1188                       if views else None)
1189        return count
1190
1191
1192    def _fetch_test_views_of_child_jobs(self, jobs=None):
1193        """Fetch test views of child jobs.
1194
1195        @returns: A tuple (child_views, retry_counts, missing_results)
1196                  child_views is list of TestView objects, representing
1197                  all valid views.
1198                  retry_counts is a dictionary that maps test_idx to retry
1199                  counts. It only stores retry counts that are greater than 0.
1200                  missing_results is a dictionary that maps test names to
1201                  lists of job ids.
1202
1203        """
1204        child_views = []
1205        retry_counts = {}
1206        missing_results = {}
1207        child_jobs = jobs or self._afe.get_jobs(parent_job_id=self._suite_job_id)
1208        if child_jobs:
1209            self._num_child_jobs = len(child_jobs)
1210        for job in child_jobs:
1211            views = [TestView(v, job, self._suite_name, self._build, self._user)
1212                     for v in self._tko.run(
1213                         call='get_detailed_test_views', afe_job_id=job.id,
1214                         invalid=0)]
1215            if len(views) == 0:
1216                missing_results.setdefault(job.name, []).append(job.id)
1217            contains_test_failure = any(
1218                    v.is_test() and v['status'] != 'GOOD' for v in views)
1219            for v in views:
1220                if (v.is_test() or
1221                        v['status'] != 'GOOD' and not contains_test_failure):
1222                    # For normal test view, just keep it.
1223                    # For SERVER_JOB or CLIENT_JOB, only keep it
1224                    # if it fails and no other test failure.
1225                    child_views.append(v)
1226                    retry_count = self._compute_retry_count(v)
1227                    if retry_count > 0:
1228                        retry_counts[v['test_idx']] = retry_count
1229        return child_views, retry_counts, missing_results
1230
1231
1232    def _generate_web_and_buildbot_links(self):
1233        """Generate web links and buildbot links."""
1234        # TODO(fdeng): If a job was aborted before it reaches Running
1235        # state, we read the test view from the suite job
1236        # and thus this method generates a link pointing to the
1237        # suite job's page for the aborted job. Need a fix.
1238        self._web_links = []
1239        self._buildbot_links = []
1240        # Bug info are stored in the suite job's keyvals.
1241        if self._solo_test_run:
1242            suite_job_keyvals = {}
1243        else:
1244            suite_job_keyvals = self._suite_views[0]['job_keyvals']
1245        for v in self._test_views:
1246            retry_count = self._retry_counts.get(v['test_idx'], 0)
1247            bug_info = v.get_bug_info(suite_job_keyvals)
1248            job_id_owner = v.get_job_id_owner_str()
1249            link = LogLink(
1250                    anchor=v.get_testname(),
1251                    server=self._instance_server,
1252                    job_string=job_id_owner,
1253                    bug_info=bug_info, retry_count=retry_count,
1254                    testname=v.get_testname())
1255            self._web_links.append(link)
1256
1257            if v.should_display_buildbot_link():
1258                link.reason = v.get_buildbot_link_reason()
1259                self._buildbot_links.append(link)
1260
1261
1262    def _record_timings(self):
1263        """Record suite timings."""
1264        self.timings = Timings(self._suite_job_id)
1265        for v in self._test_views:
1266            self.timings.RecordTiming(v)
1267
1268
1269    def _get_return_msg(self, code, tests_passed_after_retry):
1270        """Return the proper message for a given return code.
1271
1272        @param code: An enum value of RETURN_CODES
1273        @param test_passed_after_retry: True/False, indicating
1274            whether there are test(s) that have passed after retry.
1275
1276        @returns: A string, representing the message.
1277
1278        """
1279        if code == RETURN_CODES.INFRA_FAILURE:
1280            return 'Suite job failed or provisioning failed.'
1281        elif code == RETURN_CODES.SUITE_TIMEOUT:
1282            return ('Some test(s) was aborted before running,'
1283                    ' suite must have timed out.')
1284        elif code == RETURN_CODES.WARNING:
1285            if tests_passed_after_retry:
1286                return 'Some test(s) passed after retry.'
1287            else:
1288                return 'Some test(s) raised a warning.'
1289        elif code == RETURN_CODES.ERROR:
1290            return 'Some test(s) failed.'
1291        else:
1292            return ''
1293
1294
    def _compute_return_code(self):
        """Compute the exit code based on test results.

        Every non-experimental view in self._test_views is classified
        into a return code, and the per-view codes are folded together
        with get_worse_code. The outcome is stored in self.return_code,
        and the matching explanation in self.return_message.
        """
        code = RETURN_CODES.OK
        tests_passed_after_retry = False

        for v in self._test_views:
            # The order of checking each case is important.
            # Experimental tests never influence the return code.
            if v.is_experimental():
                continue
            if v.get_testname() == TestView.SUITE_JOB:
                # The suite job itself: an abort that exceeded the job's
                # own timeout is a suite timeout; any other failing status
                # is treated as an infrastructure failure.
                if v.is_aborted() and v.hit_timeout():
                    current_code = RETURN_CODES.SUITE_TIMEOUT
                elif v.is_in_fail_status():
                    current_code = RETURN_CODES.INFRA_FAILURE
                elif v['status'] == 'WARN':
                    current_code = RETURN_CODES.WARNING
                else:
                    current_code = RETURN_CODES.OK
            else:
                if v.is_aborted() and v.is_relevant_suite_view():
                    # The test was aborted before it started.
                    # This guarantees that the suite has timed out.
                    current_code = RETURN_CODES.SUITE_TIMEOUT
                elif v.is_aborted() and not v.hit_timeout():
                    # The test was aborted, but
                    # not due to a timeout. This is most likely
                    # because the suite has timed out, but may
                    # also because it was aborted by the user.
                    # Since suite timing out is determined by checking
                    # the suite job view, we simply ignore this view here.
                    current_code = RETURN_CODES.OK
                elif v.is_in_fail_status():
                    # The test job failed.
                    if v.is_infra_test():
                        current_code = RETURN_CODES.INFRA_FAILURE
                    else:
                        current_code = RETURN_CODES.ERROR
                elif v['status'] == 'WARN':
                    # The test/suite job raised a warning.
                    current_code = RETURN_CODES.WARNING
                elif v.is_retry():
                    # The test is a passing retry.
                    current_code = RETURN_CODES.WARNING
                    tests_passed_after_retry = True
                else:
                    current_code = RETURN_CODES.OK
            # Keep whichever of the running code and this view's code
            # get_worse_code selects.
            code = get_worse_code(code, current_code)

        self.return_code = code
        self.return_message = self._get_return_msg(
                code, tests_passed_after_retry)
1346
1347
1348    def _make_test_results(self):
1349        """Make TestResults for collected tests.
1350
1351        @returns: List of TestResult instances.
1352        """
1353        test_results = []
1354        for test_view in self._test_views:
1355            test_result = TestResult(
1356                test_view=test_view,
1357                retry_count=self._retry_counts.get(test_view['test_idx'], 0))
1358            test_results.append(test_result)
1359        return test_results
1360
1361
1362    def output_results(self):
1363        """Output test results, timings and web links."""
1364        # Output test results
1365        test_results = self._make_test_results()
1366        max_name_length = max(len(test_result.name)
1367                              for test_result in test_results)
1368        for test_result in test_results:
1369            test_result.log_using(logging.info, max_name_length + 3)
1370        # Output suite timings
1371        logging.info(self.timings)
1372        # Output links to test logs
1373        logging.info('\nLinks to test logs:')
1374        for link in self._web_links:
1375            logging.info(link.text_link)
1376        logging.info('\n')
1377
1378
1379    def get_results_dict(self):
1380        """Write test results, timings and web links into a dict.
1381
1382        @returns: A dict of results in the format like:
1383                  {
1384                  'tests': {
1385                        'test_1': {'status': 'PASSED', 'attributes': [1,2], ...}
1386                        'test_2': {'status': 'FAILED', 'attributes': [1],...}
1387                  }
1388                  'suite_timings': {
1389                        'download_start': '1998-07-17 00:00:00',
1390                        'payload_download_end': '1998-07-17 00:00:05',
1391                        ...
1392                  }
1393                  }
1394        """
1395        output_dict = {}
1396        tests_dict = output_dict.setdefault('tests', {})
1397        for v in self._test_views:
1398            test_name = v.get_testname()
1399            test_info = tests_dict.setdefault(test_name, {})
1400            test_info.update({
1401                'status': v['status'],
1402                'attributes': v.get_control_file_attributes() or list(),
1403                'reason': v['reason'],
1404                'retry_count': self._retry_counts.get(v['test_idx'], 0),
1405                })
1406            # For aborted test, the control file will not be parsed and thus
1407            # fail to get the attributes info. Therefore, the subsystems the
1408            # abort test testing will be missing. For this case, we will assume
1409            # the aborted test will test all subsystems, set subsystem:default.
1410            if (test_info['status'] == 'ABORT' and
1411                not any('subsystem:' in a for a in test_info['attributes'])):
1412                test_info['attributes'].append('subsystem:default')
1413
1414        # Write the links to test logs into the |tests_dict| of |output_dict|.
1415        # For test whose status is not 'GOOD', the link is also buildbot_link.
1416        for link in self._web_links:
1417            test_name = link.anchor.strip()
1418            test_info = tests_dict.get(test_name)
1419            if test_info:
1420                test_info['link_to_logs'] = link.url
1421                # Write the wmatrix link into the dict.
1422                if link in self._buildbot_links and link.testname:
1423                    test_info['wmatrix_link'] \
1424                        = reporting_utils.link_retry_url(link.testname)
1425                # Write the bug url into the dict.
1426                if link.bug_id:
1427                    test_info['bug_url'] = link.bug_url
1428
1429        # Write the suite timings into |output_dict|
1430        timings = self.timings
1431        if timings is not None:
1432            time_dict = output_dict.setdefault('suite_timings', {})
1433            time_dict.update({
1434                'download_start' : str(timings.download_start_time),
1435                'payload_download_end' : str(timings.payload_end_time),
1436                'suite_start' : str(timings.suite_start_time),
1437                'artifact_download_end' : str(timings.artifact_end_time),
1438                'tests_start' : str(timings.tests_start_time),
1439                'tests_end' : str(timings.tests_end_time),
1440                })
1441
1442        output_dict['suite_job_id'] = self._suite_job_id
1443
1444        return output_dict
1445
1446
1447    def run(self):
1448        """Collect test results.
1449
1450        This method goes through the following steps:
1451            Fetch relevent test views of the suite job.
1452            Fetch test views of child jobs
1453            Check whether the suite was aborted.
1454            Generate links.
1455            Calculate suite timings.
1456            Compute return code based on the test result.
1457
1458        """
1459        if self._solo_test_run:
1460            self._test_views, self.retry_count, self._missing_results = (
1461                  self._fetch_test_views_of_child_jobs(
1462                          jobs=self._afe.get_jobs(id=self._suite_job_id)))
1463        else:
1464            self._child_views, self._retry_counts, self._missing_results = (
1465                    self._fetch_test_views_of_child_jobs())
1466            self._suite_views = self._fetch_relevant_test_views_of_suite()
1467            self._test_views = self._suite_views + self._child_views
1468        # For hostless job in Starting status, there is no test view associated.
1469        # This can happen when a suite job in Starting status is aborted. When
1470        # the scheduler hits some limit, e.g., max_hostless_jobs_per_drone,
1471        # max_jobs_started_per_cycle, a suite job can stays in Starting status.
1472        if not self._test_views:
1473            self.return_code = RETURN_CODES.INFRA_FAILURE
1474            self.return_message = 'No test view was found.'
1475            return
1476        self.is_aborted = any([view['job_keyvals'].get('aborted_by')
1477                               for view in self._suite_views])
1478        self._generate_web_and_buildbot_links()
1479        self._record_timings()
1480        self._compute_return_code()
1481
1482
1483    def gather_timing_stats(self):
1484        """Collect timing related statistics."""
1485        # Record suite runtime in metadata db.
1486        # Some failure modes can leave times unassigned, report sentinel value
1487        # in that case.
1488        runtime_in_secs = -1
1489        if (self.timings.tests_end_time is not None and
1490            self.timings.suite_start_time is not None):
1491            runtime_in_secs = (self.timings.tests_end_time -
1492                    self.timings.suite_start_time).total_seconds()
1493
1494        job_overhead.record_suite_runtime(self._suite_job_id, self._suite_name,
1495                self._board, self._build, self._num_child_jobs, runtime_in_secs)
1496
1497
def _make_builds_from_options(options):
    """Build the version-label mapping used to create a suite job.

    Each key is a version label prefix and each value is the build name that
    goes with it; a key-value pair together describes one complete label.
    Options that are unset (falsy) contribute no entry.

    @param options: Parsed command line options; |build|,
                    |firmware_rw_build| and |firmware_ro_build| are read.

    @return: dict mapping version label prefixes to build names
    """
    builds = {}

    # The prefix of the main build is derived from the build name itself.
    main_build = options.build
    if main_build:
        builds[provision.get_version_label_prefix(main_build)] = main_build

    # Firmware builds use fixed prefixes.
    rw_firmware = options.firmware_rw_build
    if rw_firmware:
        builds[provision.FW_RW_VERSION_PREFIX] = rw_firmware

    ro_firmware = options.firmware_ro_build
    if ro_firmware:
        builds[provision.FW_RO_VERSION_PREFIX] = ro_firmware

    return builds
1517
1518
@retry.retry(error.StageControlFileFailure, timeout_min=10)
def create_suite(afe, options):
    """Submit the create_suite_job rpc for the requested suite.

    The call is wrapped by retry.retry for error.StageControlFileFailure
    (timeout_min=10).

    @param afe: The afe object to insert the new suite job into.
    @param options: The options to use in creating the suite.

    @return: The afe_job_id of the new suite job.
    """
    submit_time = diagnosis_utils.JobTimer.format_time(datetime.now())
    logging.info('%s Submitted create_suite_job rpc', submit_time)

    rpc_kwargs = {
        'name': options.name,
        'board': options.board,
        'builds': _make_builds_from_options(options),
        'test_source_build': options.test_source_build,
        # Hosts are only checked when the caller waits for the results.
        'check_hosts': not options.no_wait,
        'pool': options.pool,
        'num': options.num,
        'file_bugs': options.file_bugs,
        'priority': options.priority,
        'suite_args': options.suite_args,
        'wait_for_results': not options.no_wait,
        # The requested delay is added on top of both time limits.
        'timeout_mins': options.timeout_mins + options.delay_minutes,
        'max_runtime_mins': options.max_runtime_mins + options.delay_minutes,
        'job_retry': options.retry,
        'max_retries': options.max_retries,
        'suite_min_duts': options.suite_min_duts,
        'offload_failures_only': options.offload_failures_only,
        'run_prod_code': options.run_prod_code,
        'delay_minutes': options.delay_minutes,
        'job_keyvals': options.job_keyvals,
        'test_args': options.test_args,
    }
    return afe.run('create_suite_job', **rpc_kwargs)
1554
1555
# Result of handling a suite job: the run_suite return code together with the
# dictionary of output produced for it.
SuiteResult = namedtuple('SuiteResult', ('return_code', 'output_dict'))
1557
1558
def main_without_exception_handling(options):
    """
    run_suite script without exception handling.

    Sets up logging, connects to the AFE, then either looks up an existing
    job (options.mock_job_id) or checks DUT availability and creates a new
    suite job, and finally hands the job over to _handle_job_nowait or
    _handle_job_wait.

    @param options: The parsed options.

    @returns: A SuiteResult, i.e. a (return_code, output_dict) tuple that
              contains the return_code of run_suite and the dictionary of
              the output.

    @raises utils.TestLabException: if the job given by options.mock_job_id
            cannot be retrieved.
    @raises diagnosis_utils.NotEnoughDutsError: re-raised after an attempt to
            report a pool health bug for it.

    """
    # If indicate to use the new style suite control file, convert the args
    if options.use_suite_attr:
        options = change_options_for_suite_attr(options)

    # The log is only placed under <autotest_dir>/logs when a build was given
    # and that directory exists; otherwise a relative file name is used.
    log_name = 'run_suite-default.log'
    if options.build:
        # convert build name from containing / to containing only _
        log_name = 'run_suite-%s.log' % options.build.replace('/', '_')
        log_dir = os.path.join(common.autotest_dir, 'logs')
        if os.path.exists(log_dir):
            log_name = os.path.join(log_dir, log_name)

    utils.setup_logging(logfile=log_name)

    if not options.bypass_labstatus and not options.web:
        utils.check_lab_status(options.build)
    instance_server = (options.web if options.web else
                       instance_for_pool(options.pool))
    afe = frontend_wrappers.RetryingAFE(server=instance_server,
                                        timeout_min=options.afe_timeout_mins,
                                        delay_sec=options.delay_sec)
    logging.info('Autotest instance: %s', instance_server)

    rpc_helper = diagnosis_utils.RPCHelper(afe)
    is_real_time = True
    if options.mock_job_id:
        # Report on an existing job instead of creating a new one. A job that
        # is already finished is not handled in real time.
        job_id = int(options.mock_job_id)
        existing_job = afe.get_jobs(id=job_id, finished=True)
        if existing_job:
            is_real_time = False
        else:
            existing_job = afe.get_jobs(id=job_id)
        if existing_job:
            job_created_on = time_utils.date_string_to_epoch_time(
                    existing_job[0].created_on)
        else:
            raise utils.TestLabException('Failed to retrieve job: %d' % job_id)
    else:
        try:
            rpc_helper.check_dut_availability(options.board, options.pool,
                                              options.minimum_duts,
                                              options.skip_duts_check)
            job_id = create_suite(afe, options)
            job_created_on = time.time()
        except diagnosis_utils.NotEnoughDutsError as e:
            # Enrich the error and file a pool health bug before passing the
            # error on to the caller.
            e.add_suite_name(options.name)
            e.add_build(options.test_source_build)
            pool_health_bug = reporting.PoolHealthBug(e)
            bug_id = reporting.Reporter().report(pool_health_bug).bug_id
            if bug_id is not None:
                logging.info(annotations.StepLink(
                    text='Pool Health Bug',
                    url=reporting_utils.link_crbug(bug_id)))
                e.add_bug_id(bug_id)
            # A bare raise re-raises |e| with its original traceback.
            raise
        except (error.CrosDynamicSuiteException,
                error.RPCException, proxy.JSONRPCException) as e:
            logging.exception('Error Message: %s', e)
            return SuiteResult(RETURN_CODES.INFRA_FAILURE,
                               {'return_message': str(e)})
        except AttributeError as e:
            # Reported as invalid options; log the error so that the cause
            # is not silently lost.
            logging.exception('Invalid options: %s', e)
            return SuiteResult(RETURN_CODES.INVALID_OPTIONS, {})

    # NOTE(review): create_suite() passes timeout_mins + delay_minutes to the
    # suite job, whereas this timer only uses timeout_mins. Confirm that
    # leaving the delay out here is intended.
    job_timer = diagnosis_utils.JobTimer(
            job_created_on, float(options.timeout_mins))
    job_url = reporting_utils.link_job(job_id,
                                       instance_server=instance_server)
    logging.info('%s Created suite job: %s',
                 job_timer.format_time(job_timer.job_created_time),
                 job_url)
    logging.info(annotations.StepLink(
        text='Link to suite',
        url=job_url))

    if options.create_and_return:
        msg = '--create_and_return was specified, terminating now.'
        logging.info(msg)
        return SuiteResult(RETURN_CODES.OK, {'return_message': msg})

    if options.no_wait:
        return _handle_job_nowait(job_id, options, instance_server)
    return _handle_job_wait(afe, job_id, options, job_timer, is_real_time)
1651
1652
def _handle_job_wait(afe, job_id, options, job_timer, is_real_time):
    """Handle suite job synchronously.

    Polls the AFE until the suite job is finished, then collects the test
    results through ResultCollector. For real-time runs it additionally
    gathers timing stats, upgrades the return code when the suite job timed
    out, and attempts to display pool info.

    @param afe              AFE instance.
    @param job_id           Suite job id.
    @param options          Parsed options.
    @param job_timer        JobTimer for suite job.
    @param is_real_time     Whether or not to handle job timeout; also gates
                            the timing stats and the pool info display.

    @return SuiteResult of suite job.
    """
    rpc_helper = diagnosis_utils.RPCHelper(afe)
    instance_server = afe.server
    while not afe.get_jobs(id=job_id, finished=True):
        # Note that this call logs output, preventing buildbot's
        # 9000 second silent timeout from kicking in. Let there be no
        # doubt, this is a hack. The timeout is from upstream buildbot and
        # this is the easiest work around.
        if job_timer.first_past_halftime():
            rpc_helper.diagnose_job(job_id, instance_server)
        if job_timer.debug_output_timer.poll():
            logging.info('The suite job has another %s till timeout.',
                            job_timer.timeout_hours - job_timer.elapsed_time())
        time.sleep(10)
    logging.info('%s Suite job is finished.',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))
    # For most cases, ResultCollector should be able to determine whether
    # a suite has timed out by checking information in the test view.
    # However, occasionally tko parser may fail on parsing the
    # job_finished time from the job's keyval file. So we add another
    # layer of timeout check in run_suite. We do the check right after
    # the suite finishes to make it as accurate as possible.
    # There is a minor race condition here where we might have aborted
    # for some reason other than a timeout, and the job_timer thinks
    # it's a timeout because of the jitter in waiting for results.
    # The consequence would be that run_suite exits with code
    # SUITE_TIMEOUT while it should have returned INFRA_FAILURE
    # instead, which should happen very rarely.
    # Note the timeout makes no sense when using the -m option.
    is_suite_timeout = job_timer.is_suite_timeout()

    # Extract the original suite name to record timing.
    original_suite_name = get_original_suite_name(options.name,
                                                    options.suite_args)
    # Start collecting test results.
    logging.info('%s Start collectint test results and dump them to json.',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))
    tko = frontend_wrappers.RetryingTKO(server=instance_server,
                                        timeout_min=options.afe_timeout_mins,
                                        delay_sec=options.delay_sec)
    collector = ResultCollector(instance_server=instance_server,
                                afe=afe, tko=tko, build=options.build,
                                board=options.board,
                                suite_name=options.name,
                                suite_job_id=job_id,
                                original_suite_name=original_suite_name)
    collector.run()
    # Dump test outputs into json.
    output_dict = collector.get_results_dict()
    output_dict['autotest_instance'] = instance_server
    if not options.json_dump:
        collector.output_results()
    code = collector.return_code
    return_message = collector.return_message
    if is_real_time:
        # Do not record stats if the suite was aborted (either by a user
        # or through the golo rpc).
        # Also do not record stats if is_aborted is None, indicating
        # aborting status is unknown yet. The identity checks below keep
        # the three states (True / False / None) apart explicitly.
        if collector.is_aborted is False:
            logging.info('%s Gathering timing stats for the suite job.',
                         diagnosis_utils.JobTimer.format_time(datetime.now()))
            collector.gather_timing_stats()

        if collector.is_aborted is True and is_suite_timeout:
            # There are two possible cases when a suite times out.
            # 1. the suite job was aborted due to timing out
            # 2. the suite job succeeded, but some child jobs
            #    were already aborted before the suite job exited.
            # The case 2 was handled by ResultCollector,
            # here we handle case 1.
            old_code = code
            code = get_worse_code(
                    code, RETURN_CODES.SUITE_TIMEOUT)
            if old_code != code:
                return_message = 'Suite job timed out.'
                logging.info('Upgrade return code from %s to %s '
                                'because suite job has timed out.',
                                RETURN_CODES.get_string(old_code),
                                RETURN_CODES.get_string(code))

        logging.info('\n %s Attempting to display pool info: %s',
                     diagnosis_utils.JobTimer.format_time(datetime.now()),
                     options.pool)
        try:
            # Add some jitter to make up for any latency in
            # aborting the suite or checking for results.
            cutoff = (job_timer.timeout_hours +
                      timedelta(hours=0.3))
            rpc_helper.diagnose_pool(
                    options.board, options.pool, cutoff)
        except proxy.JSONRPCException:
            # Displaying pool info is best effort only.
            logging.warning('Unable to display pool info.')

    # And output return message.
    if return_message:
        logging.info('Reason: %s', return_message)
        output_dict['return_message'] = return_message

    logging.info('\n %s Output below this line is for buildbot consumption:',
                 diagnosis_utils.JobTimer.format_time(datetime.now()))
    log_buildbot_links(logging.info, collector._buildbot_links)
    return SuiteResult(code, output_dict)
1768
1769
def _handle_job_nowait(job_id, options, instance_server):
    """Handle suite job asynchronously.

    Logs the buildbot links generated for the suite job and returns right
    away with RETURN_CODES.OK.

    @param job_id           Suite job id.
    @param options          Parsed options.
    @param instance_server  Autotest instance hostname.

    @return SuiteResult of suite job.
    """
    exit_message = '--no_wait specified; Exiting.'

    logging.info('Created suite job: %r', job_id)
    # The link is built from the job id joined with the current user name.
    job_tag = '%s-%s' % (job_id, getpass.getuser())
    suite_link = LogLink(options.name, instance_server, job_tag)
    for buildbot_link in suite_link.GenerateBuildbotLinks():
        logging.info(buildbot_link)

    logging.info(exit_message)
    return SuiteResult(RETURN_CODES.OK, {'return_message': exit_message})
1787
1788
def main():
    """Entry point.

    Parses the command line, verifies the options, runs the suite and maps
    any exception that is raised to a return code. When options.json_dump is
    set, logging is silenced and the output dictionary is written to stdout
    as JSON between the #JSON_START# and #JSON_END# markers.

    @return: The run_suite return code (one of RETURN_CODES).
    """
    utils.verify_not_root_user()

    parser = make_parser()
    options = parser.parse_args()
    try:
        # Silence the log when dumping outputs into json
        if options.json_dump:
            logging.disable(logging.CRITICAL)

        if not verify_options(options):
            parser.print_help()
            code = RETURN_CODES.INVALID_OPTIONS
            output_dict = {'return_code': RETURN_CODES.INVALID_OPTIONS}
        else:
            code, output_dict = main_without_exception_handling(options)
    except diagnosis_utils.BoardNotAvailableError as e:
        # Format the exception itself, like the handlers below do, rather
        # than reading the deprecated |message| attribute.
        output_dict = {'return_message': 'Skipping testing: %s' % e}
        code = RETURN_CODES.BOARD_NOT_AVAILABLE
        logging.info(output_dict['return_message'])
    except utils.TestLabException as e:
        output_dict = {'return_message': 'TestLabException: %s' % e}
        code = RETURN_CODES.INFRA_FAILURE
        logging.exception(output_dict['return_message'])
    except Exception as e:
        # Top-level boundary: anything unexpected is reported as an
        # infrastructure failure instead of crashing the script.
        output_dict = {
            'return_message': 'Unhandled run_suite exception: %s' % e
        }
        code = RETURN_CODES.INFRA_FAILURE
        logging.exception(output_dict['return_message'])

    # Dump test outputs into json.
    output_dict['return_code'] = code
    if options.json_dump:
        output_json = json.dumps(output_dict, sort_keys=True)
        output_json_marked = '#JSON_START#%s#JSON_END#' % output_json.strip()
        sys.stdout.write(output_json_marked)

    logging.info('Will return from run_suite with status: %s',
                  RETURN_CODES.get_string(code))
    return code
1831
1832
if __name__ == '__main__':
    # Use the return code of main() as the exit status of the process.
    exit_status = main()
    sys.exit(exit_status)
1835