1#!/usr/bin/python
2#
3# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7"""Tool to validate code in prod branch before pushing to lab.
8
9The script runs push_to_prod suite to verify code in prod branch is ready to be
10pushed. Link to design document:
11https://docs.google.com/a/google.com/document/d/1JMz0xS3fZRSHMpFkkKAL_rxsdbNZomhHbC3B8L71uuI/edit
12
To verify whether the prod branch can be pushed to the lab, run the following
command on the chromeos-autotest.cbf server:
15/usr/local/autotest/site_utils/test_push.py -e someone@company.com
16
17The script uses latest stumpy canary build as test build by default.
18
19"""
20
21import argparse
22import getpass
23import multiprocessing
24import os
25import re
26import subprocess
27import sys
28import time
29import traceback
30import urllib2
31
32import common
33try:
34    from autotest_lib.frontend import setup_django_environment
35    from autotest_lib.frontend.afe import models
36except ImportError:
37    # Unittest may not have Django database configured and will fail to import.
38    pass
39from autotest_lib.client.common_lib import global_config
40from autotest_lib.server import site_utils
41from autotest_lib.server.cros import provision
42from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
43from autotest_lib.server.cros.dynamic_suite import reporting
44from autotest_lib.server.hosts import factory
45from autotest_lib.site_utils import gmail_lib
46from autotest_lib.site_utils.suite_scheduler import constants
47
# Handle to the global autotest configuration; used for the config lookups in
# this file.
CONFIG = global_config.global_config

# AFE client shared by the helpers in this file (get_hosts, run, get_jobs,
# reverify_hosts).
# NOTE(review): timeout_min/delay_sec presumably bound the retry window and the
# pause between retries -- confirm against RetryingAFE.
AFE = frontend_wrappers.RetryingAFE(timeout_min=0.5, delay_sec=2)

# Sender address constant; not referenced by the code visible in this file.
MAIL_FROM = 'chromeos-test@google.com'
# Devservers listed under [CROS] dev_server in the config; `default=[]` is
# passed as the fallback value.
DEVSERVERS = CONFIG.get_config_value('CROS', 'dev_server', type=list,
                                     default=[])
# Matches a bare build version such as R36-5881.0.0. Raw strings are used for
# all patterns so that backslash escapes reach the regex engine untouched.
BUILD_REGEX = r'^R[\d]+-[\d]+\.[\d]+\.[\d]+$'
# Script, located next to this file, that do_run_suite() executes.
RUN_SUITE_COMMAND = 'run_suite.py'
# Names of the suites scheduled by main().
PUSH_TO_PROD_SUITE = 'push_to_prod'
DUMMY_SUITE = 'dummy'
AU_SUITE = 'paygen_au_canary'

# Matches the run_suite output line that announces the suite job; group 1
# captures the suite job id.
SUITE_JOB_START_INFO_REGEX = (r'^.*Created suite job:.*'
                              r'tab_id=view_job&object_id=(\d+)$')
63
# Expected status of each test, keyed by a regular expression that is searched
# for in the test name (see test_suite()).

# Expectations for the push_to_prod suite.
EXPECTED_TEST_RESULTS = {
    '^SERVER_JOB$': 'GOOD',
    # This is related to dummy_Fail/control.dependency.
    'dummy_Fail.dependency$': 'TEST_NA',
    'login_LoginSuccess.*': 'GOOD',
    'platform_InstallTestImage_SERVER_JOB$': 'GOOD',
    'provision_AutoUpdate.double': 'GOOD',
    'dummy_Pass.*': 'GOOD',
    'dummy_Fail.Fail$': 'FAIL',
    'dummy_Fail.RetryFail$': 'FAIL',
    'dummy_Fail.RetrySuccess': 'GOOD',
    'dummy_Fail.Error$': 'ERROR',
    'dummy_Fail.Warn$': 'WARN',
    'dummy_Fail.NAError$': 'TEST_NA',
    'dummy_Fail.Crash$': 'GOOD',
}

# Expectations for the dummy suite.
EXPECTED_TEST_RESULTS_DUMMY = {
    '^SERVER_JOB$': 'GOOD',
    'dummy_Pass.*': 'GOOD',
    'dummy_Fail.Fail': 'FAIL',
    'dummy_Fail.Warn': 'WARN',
    'dummy_Fail.Crash': 'GOOD',
    'dummy_Fail.Error': 'ERROR',
    'dummy_Fail.NAError': 'TEST_NA',
}

# Expectations for the paygen_au_canary suite.
EXPECTED_TEST_RESULTS_AU = {
    'SERVER_JOB$': 'GOOD',
    'autoupdate_EndToEndTest.paygen_au_canary_delta.*': 'GOOD',
    'autoupdate_EndToEndTest.paygen_au_canary_full.*': 'GOOD',
}
93
# Anchor for the auto-filed bug for dummy_Fail tests. close_bug() and
# check_bug_filed_and_deduped() look issues up by this marker.
BUG_ANCHOR = 'TestFailure(push_to_prod,dummy_Fail.Fail,always fail)'

# Server hostname and log URL pattern read from the config. test_suite()
# formats URL_PATTERN with (URL_HOST, '<suite job id>-<user name>') to build
# the link to the suite job logs.
URL_HOST = CONFIG.get_config_value('SERVER', 'hostname', type=str)
URL_PATTERN = CONFIG.get_config_value('CROS', 'log_url_pattern', type=str)
99
# Some tests could be missing from the test results for various reasons. Add
# such tests to this list and explain the reason.
# Each entry must be spelled exactly like the corresponding key of the
# expected results dictionary: test_suite() drops ignored entries from the set
# of missing keys by exact string comparison, not by regular expression match.
IGNORE_MISSING_TESTS = [
    # For latest build, npo_test_delta does not exist.
    'autoupdate_EndToEndTest.npo_test_delta.*',
    # For trybot build, nmo_test_delta does not exist.
    'autoupdate_EndToEndTest.nmo_test_delta.*',
    # Older build does not have login_LoginSuccess test in push_to_prod suite.
    # The entry has to match the 'login_LoginSuccess.*' key used in
    # EXPECTED_TEST_RESULTS, otherwise it is never ignored.
    # TODO(dshi): Remove following lines after R41 is stable.
    'login_LoginSuccess.*',
]

# Save all run_suite command output.
run_suite_output = []
113
class TestPushException(Exception):
    """Error raised when a step of the push-to-prod test fails."""
117
118
def powerwash_dut(hostname):
    """Powerwash the dut with the given hostname.

    Writes "fast safe" to /mnt/stateful_partition/factory_install_reset on the
    dut and then reboots it. The host object is closed in all cases, including
    when one of the remote commands raises.

    @param hostname: hostname of the dut.
    """
    host = factory.create_host(hostname)
    try:
        host.run('echo "fast safe" > '
                 '/mnt/stateful_partition/factory_install_reset')
        host.run('reboot')
    finally:
        # Always release the host object; errors from the commands above still
        # propagate to the caller.
        host.close()
129
130
def get_default_build(devserver=None, board='stumpy'):
    """Get the default build to be used for test.

    Each candidate devserver is asked for the latest staged release build of
    the board. The first answer that looks like a build version (BUILD_REGEX)
    wins. A devserver that cannot be reached is skipped.

    @param devserver: devserver used to look for latest staged build. If value
                      is None, all devservers in config will be tried.
    @param board: Name of board to be tested, default is stumpy.
    @return: Build to be tested, e.g., stumpy-release/R36-5881.0.0
    """
    LATEST_BUILD_URL_PATTERN = '%s/latestbuild?target=%s-release'
    # Honor an explicitly requested devserver; otherwise fall back to the list
    # of devservers from the config.
    servers = [devserver] if devserver else DEVSERVERS
    for server in servers:
        url = LATEST_BUILD_URL_PATTERN % (server, board)
        try:
            response = urllib2.urlopen(url)
            try:
                # Strip surrounding whitespace so that a trailing newline does
                # not end up in the returned build name.
                build = response.read().strip()
            finally:
                response.close()
        except urllib2.URLError as e:
            # One unreachable devserver should not prevent trying the others.
            print('Failed to get latest build from %s: %s' % (url, e))
            continue
        if build and re.match(BUILD_REGEX, build):
            return '%s-release/%s' % (board, build)

    # If no devserver has any build staged for the given board, use the stable
    # build in config.
    build = CONFIG.get_config_value('CROS', 'stable_cros_version')
    return '%s-release/%s' % (board, build)
152
153
def parse_arguments():
    """Build the command line parser for test_push and parse sys.argv.

    When no build (or shard build) is given on the command line, the default
    build for the corresponding board is looked up with get_default_build().

    @return: Parsed arguments.

    """
    # Both build options share the same help text.
    canary_build_help = ('Default is the latest canary build of given '
                         'board. Must be a canary build, otherwise AU test '
                         'will fail.')

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
            '-b', '--board', dest='board', default='stumpy',
            help='Default is stumpy.')
    arg_parser.add_argument(
            '-sb', '--shard_board', dest='shard_board', default='quawks',
            help='Default is quawks.')
    arg_parser.add_argument(
            '-i', '--build', dest='build', default=None,
            help=canary_build_help)
    arg_parser.add_argument(
            '-si', '--shard_build', dest='shard_build', default=None,
            help=canary_build_help)
    arg_parser.add_argument(
            '-p', '--pool', dest='pool', default='bvt')
    arg_parser.add_argument(
            '-u', '--num', dest='num', type=int, default=3,
            help='Run on at most NUM machines.')
    arg_parser.add_argument(
            '-f', '--file_bugs', dest='file_bugs', default='True',
            help='File bugs on test failures. Must pass "True" or '
                 '"False" if used.')
    arg_parser.add_argument(
            '-e', '--email', dest='email', default=None,
            help='Email address for the notification to be sent to '
                 'after the script finished running.')
    arg_parser.add_argument(
            '-d', '--devserver', dest='devserver', default=None,
            help="devserver to find what's the latest build.")
    arg_parser.add_argument(
            '-t', '--timeout_min', dest='timeout_min', type=int, default=24,
            help='Time in mins to wait before abort the jobs we '
                 'are waiting on. Only for the asynchronous suites '
                 'triggered by create_and_return flag.')

    # With no explicit list, argparse parses sys.argv[1:].
    options = arg_parser.parse_args()

    # Get latest canary build as default build.
    if not options.build:
        options.build = get_default_build(options.devserver, options.board)
    if not options.shard_build:
        options.shard_build = get_default_build(options.devserver,
                                                options.shard_board)
    return options
203
204
def do_run_suite(suite_name, arguments, use_shard=False,
                 create_and_return=False):
    """Call run_suite to run a suite job, and return the suite job id.

    The script waits the suite job to finish before returning the suite job id.
    Also it will echo the run_suite output to stdout and append it to
    run_suite_output.

    Before the suite is scheduled, the cros-version labels are removed from
    every host of the board. For a shard suite that is not started with
    create_and_return, each host is also powerwashed and reverified.

    @param suite_name: Name of a suite, e.g., dummy.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.

    @return: Suite job ID.
    @raise TestPushException: If a dut can not be powerwashed, the suite job
                              id is not found in the run_suite output, or the
                              asynchronous suite times out.

    """
    if use_shard:
        board = arguments.shard_board
        build = arguments.shard_build
    else:
        board = arguments.board
        build = arguments.build

    # Remove cros-version label to force provision.
    hosts = AFE.get_hosts(label=constants.Labels.BOARD_PREFIX+board)
    for host in hosts:
        for label in [l for l in host.labels
                      if l.startswith(provision.CROS_VERSION_PREFIX)]:
            AFE.run('host_remove_labels', id=host.id, labels=[label])

        if use_shard and not create_and_return:
            # Let's verify the repair flow and powerwash the duts.  We can
            # assume they're all cros hosts (valid assumption?) so powerwash
            # will work.
            try:
                powerwash_dut(host.hostname)
            except Exception as e:
                raise TestPushException('Failed to powerwash dut %s. Make '
                                        'sure the dut is working first. '
                                        'Error: %s' % (host.hostname, e))
            AFE.reverify_hosts(hostnames=[host.hostname])

    current_dir = os.path.dirname(os.path.realpath(__file__))
    cmd = [os.path.join(current_dir, RUN_SUITE_COMMAND),
           '-s', suite_name,
           '-b', board,
           '-i', build,
           '-p', arguments.pool,
           '-u', str(arguments.num),
           '-f', arguments.file_bugs]
    if create_and_return:
        cmd += ['-c']

    suite_job_id = None

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    try:
        while True:
            line = proc.stdout.readline()
            # readline() returns an empty string only at end of output; stop
            # reading there instead of spinning until the process exits.
            if not line:
                break
            print(line.rstrip())
            run_suite_output.append(line.rstrip())

            if suite_job_id is None:
                m = re.match(SUITE_JOB_START_INFO_REGEX, line)
                if m and m.group(1):
                    suite_job_id = int(m.group(1))
    finally:
        proc.stdout.close()
    # Make sure the run_suite process completed before moving on. Its exit
    # status is not inspected, same as before.
    proc.wait()

    if not suite_job_id:
        raise TestPushException('Failed to retrieve suite job ID.')

    # If create_and_return specified, wait for the suite to finish.
    if create_and_return:
        end = time.time() + arguments.timeout_min * 60
        while not AFE.get_jobs(id=suite_job_id, finished=True):
            if time.time() < end:
                time.sleep(10)
            else:
                AFE.run('abort_host_queue_entries', job=suite_job_id)
                raise TestPushException(
                        'Asynchronous suite triggered by create_and_return '
                        'flag has timed out after %d mins. Aborting it.' %
                        arguments.timeout_min)

    print('Suite job %s is completed.' % suite_job_id)
    return suite_job_id
295
296
def check_dut_image(build, suite_job_id):
    """Verify the build reported for every DUT used by the suite.

    The hosts are collected from the first host queue entry of each child job
    of the suite job. The build of each host is looked up through
    site_utils.get_build_from_afe().

    @param build: Expected build to be imaged.
    @param suite_job_id: job ID of the suite job.
    @raise TestPushException: If a DUT does not have expected build imaged.
    """
    print('Checking image installed in DUTs...')
    child_jobs = models.Job.objects.filter(parent_job_id=suite_job_id)
    child_job_ids = [child.id for child in child_jobs]

    # Look up every entry first, then resolve the host names, so that the
    # order of database accesses stays the same.
    first_entries = []
    for child_job_id in child_job_ids:
        entries = models.HostQueueEntry.objects.filter(job_id=child_job_id)
        first_entries.append(entries[0])

    hostnames = set()
    for entry in first_entries:
        hostnames.add(entry.host.hostname)

    for hostname in hostnames:
        found_build = site_utils.get_build_from_afe(hostname, AFE)
        if found_build == build:
            continue
        raise TestPushException('DUT is not imaged properly. Host %s has '
                                'build %s, while build %s is expected.' %
                                (hostname, found_build, build))
316
317
def test_suite(suite_name, expected_results, arguments, use_shard=False,
               create_and_return=False):
    """Call run_suite to start a suite job and verify results.

    After the suite job finished, the status of each test view of the job is
    compared with expected_results, and the link to the suite job logs is
    loaded to confirm that it works. All problems found are collected and
    reported together.

    @param suite_name: Name of a suite, e.g., dummy
    @param expected_results: A dictionary of test name to test result. Each
                             key is a regular expression searched for in the
                             test name.
    @param arguments: Arguments for run_suite command.
    @param use_shard: If true, suite is scheduled for shard board.
    @param create_and_return: If True, run_suite just creates the suite, print
                              the job id, then finish immediately.
    @raise TestPushException: If a test result does not match the expected
                              value, an unexpected test was run, an expected
                              test is missing, or the log link can not be
                              loaded.
    """
    suite_job_id = do_run_suite(suite_name, arguments, use_shard,
                                create_and_return)

    # Confirm all DUTs used for the suite are imaged to expected build.
    # hqe.host_id for jobs running in shard is not synced back to master db,
    # therefore, skip verifying dut build for jobs running in shard.
    if suite_name != AU_SUITE and not use_shard:
        check_dut_image(arguments.build, suite_job_id)

    # Find all tests and their status
    print('Comparing test results...')
    TKO = frontend_wrappers.RetryingTKO(timeout_min=0.1, delay_sec=10)
    test_views = site_utils.get_test_views_from_tko(suite_job_id, TKO)

    mismatch_errors = []
    extra_test_errors = []

    found_keys = set()
    for test_name, test_status in test_views.items():
        print('%s%s' % (test_name.ljust(30), test_status))
        test_found = False
        for key, val in expected_results.items():
            if not re.search(key, test_name):
                continue
            test_found = True
            found_keys.add(key)
            # TODO(dshi): result for this test is ignored until servo is
            # added to a host accessible by cbf server (crbug.com/277109).
            if key == 'platform_InstallTestImage_SERVER_JOB$':
                continue
            if val != test_status:
                error = ('%s Expected: [%s], Actual: [%s]' %
                         (test_name, val, test_status))
                mismatch_errors.append(error)
        if not test_found:
            extra_test_errors.append(test_name)

    # Expected keys that matched no test, minus the ones that are allowed to
    # be missing (exact string comparison). Sorted so that the report is
    # deterministic.
    missing_test_errors = sorted(
            set(expected_results) - found_keys - set(IGNORE_MISSING_TESTS))

    summary = []
    if mismatch_errors:
        summary.append(('Results of %d test(s) do not match expected '
                        'values:') % len(mismatch_errors))
        summary.extend(mismatch_errors)
        summary.append('\n')

    if extra_test_errors:
        summary.append('%d test(s) are not expected to be run:' %
                       len(extra_test_errors))
        summary.extend(extra_test_errors)
        summary.append('\n')

    if missing_test_errors:
        summary.append('%d test(s) are missing from the results:' %
                       len(missing_test_errors))
        summary.extend(missing_test_errors)
        summary.append('\n')

    # Test link to log can be loaded.
    job_name = '%s-%s' % (suite_job_id, getpass.getuser())
    log_link = URL_PATTERN % (URL_HOST, job_name)
    try:
        response = urllib2.urlopen(log_link)
        try:
            response.read()
        finally:
            # Do not leave the connection open after the check.
            response.close()
    except urllib2.URLError:
        summary.append('Failed to load page for link to log: %s.' % log_link)

    if summary:
        raise TestPushException('\n'.join(summary))
401
402
403def test_suite_wrapper(queue, suite_name, expected_results, arguments,
404                       use_shard=False, create_and_return=False):
405    """Wrapper to call test_suite. Handle exception and pipe it to parent
406    process.
407
408    @param queue: Queue to save exception to be accessed by parent process.
409    @param suite_name: Name of a suite, e.g., dummy
410    @param expected_results: A dictionary of test name to test result.
411    @param arguments: Arguments for run_suite command.
412    @param use_shard: If true, suite is scheduled for shard board.
413    @param create_and_return: If True, run_suite just creates the suite, print
414                              the job id, then finish immediately.
415    """
416    try:
417        test_suite(suite_name, expected_results, arguments, use_shard,
418                   create_and_return)
419    except:
420        # Store the whole exc_info leads to a PicklingError.
421        except_type, except_value, tb = sys.exc_info()
422        queue.put((except_type, except_value, traceback.extract_tb(tb)))
423
424
def close_bug():
    """Close every open issue that carries the dummy_Fail bug marker.

    Issues are looked up by BUG_ANCHOR one at a time and set to WontFix until
    the lookup finds nothing.

    @return: A list of issue ids to be used in check_bug_filed_and_deduped.
    @raise TestPushException: If an issue that was already closed by this
                              call is found again.
    """
    closed_ids = []
    reporter = reporting.Reporter()
    issue = reporter.find_issue_by_marker(BUG_ANCHOR)
    while issue:
        # Seeing the same issue twice means the previous update did not
        # close it.
        if issue.id in closed_ids:
            raise TestPushException('Failed to close issue %d' % issue.id)
        closed_ids.append(issue.id)
        reporter.modify_bug_report(
                issue.id,
                comment='Issue closed by test_push script.',
                label_update='',
                status='WontFix')
        issue = reporter.find_issue_by_marker(BUG_ANCHOR)
    return closed_ids
443
444
def check_bug_filed_and_deduped(old_issue_ids):
    """Verify that one new dummy_Fail bug was filed and deduped.

    The issue found by BUG_ANCHOR must not be one of the issues closed earlier
    and must carry the label '<AUTOFILED_COUNT>2'. The issue is then closed,
    and a second lookup by the same marker must find nothing.

    @param old_issue_ids: A list of issue ids that was closed earlier. id of the
        new issue must be not in this list.
    @raise TestPushException: If auto bug file failed to create a new issue or
        dedupe multiple failures.
    """
    reporter = reporting.Reporter()
    issue = reporter.find_issue_by_marker(BUG_ANCHOR)
    if not issue:
        raise TestPushException('Auto bug file failed. Unable to locate bug '
                                'with marker %s' % BUG_ANCHOR)

    if old_issue_ids and issue.id in old_issue_ids:
        raise TestPushException('Auto bug file failed to create a new issue. '
                                'id of the old issue found is %d.' % issue.id)

    dedupe_label = '%s2' % reporter.AUTOFILED_COUNT
    if dedupe_label not in issue.labels:
        raise TestPushException(('Auto bug file failed to dedupe for issue %d '
                                 'with labels of %s.') %
                                (issue.id, issue.labels))

    # Close the bug, and do the search again, which should return None.
    reporter.modify_bug_report(
            issue.id,
            comment='Issue closed by test_push script.',
            label_update='',
            status='WontFix')
    leftover_issue = reporter.find_issue_by_marker(BUG_ANCHOR)
    if leftover_issue:
        ids = '%d, %d' % (issue.id, leftover_issue.id)
        raise TestPushException(('Auto bug file failed. Multiple issues (%s) '
                                 'filed with marker %s') % (ids, BUG_ANCHOR))

    print('Issue %d was filed and deduped successfully.' % issue.id)
476
477
def check_queue(queue):
    """Check the queue for any exception being raised.

    At most one entry is taken from the queue per call. The traceback entries
    recorded in the child process are printed, then the exception is raised in
    the current process.

    @param queue: Queue used to store exception for parent process to access.
                  Each entry is a tuple of (exception type, exception value,
                  traceback entries as returned by traceback.extract_tb).
    @raise: Any exception found in the queue.
    """
    if queue.empty():
        return
    exc_type, exc_value, tb_entries = queue.get()
    # The original traceback object can not cross the process boundary, so
    # print the entries recorded by the child in the usual traceback layout.
    print('Original stack trace of the exception:\n%s' %
          ''.join(traceback.format_list(tb_entries)))
    if isinstance(exc_value, exc_type):
        # Raise the received instance itself. Building a new one with
        # exc_type(exc_value) fails for exception classes whose constructor
        # takes more than one argument.
        raise exc_value
    raise exc_type(exc_value)
490
491
def main():
    """Entry point for test_push script.

    Closes the existing dummy_Fail bugs, then runs four suites in parallel
    child processes: push_to_prod, paygen_au_canary, the dummy suite on the
    shard board, and the dummy suite on the shard board started with
    create_and_return. While they run, the queue is polled for exceptions
    reported by the children. Once the push_to_prod suite is done, the bug
    filed by it is verified.

    A notification email is sent on both success and failure if an email
    address was given. On failure the exception is re-raised after the email
    is sent.
    """
    arguments = parse_arguments()

    try:
        # Close existing bugs. New bug should be filed in dummy_Fail test.
        old_issue_ids = close_bug()

        queue = multiprocessing.Queue()

        push_to_prod_suite = multiprocessing.Process(
                target=test_suite_wrapper,
                args=(queue, PUSH_TO_PROD_SUITE, EXPECTED_TEST_RESULTS,
                      arguments))
        push_to_prod_suite.start()

        # TODO(dshi): Remove following line after crbug.com/267644 is fixed.
        # Also, merge EXPECTED_TEST_RESULTS_AU to EXPECTED_TEST_RESULTS
        au_suite = multiprocessing.Process(
                target=test_suite_wrapper,
                args=(queue, AU_SUITE, EXPECTED_TEST_RESULTS_AU,
                      arguments))
        au_suite.start()

        shard_suite = multiprocessing.Process(
                target=test_suite_wrapper,
                args=(queue, DUMMY_SUITE, EXPECTED_TEST_RESULTS_DUMMY,
                      arguments, True))
        shard_suite.start()

        # suite test with --create_and_return flag
        asynchronous_suite = multiprocessing.Process(
                target=test_suite_wrapper,
                args=(queue, DUMMY_SUITE, EXPECTED_TEST_RESULTS_DUMMY,
                      arguments, True, True))
        asynchronous_suite.start()

        bug_filing_checked = False
        while (push_to_prod_suite.is_alive() or au_suite.is_alive() or
               shard_suite.is_alive() or asynchronous_suite.is_alive()):
            # Sample the state before draining the queue, so that a failure
            # reported by the push_to_prod suite is raised before the bug
            # filing check runs.
            push_to_prod_done = not push_to_prod_suite.is_alive()
            check_queue(queue)
            # Check bug filing results to fail early if bug filing failed.
            if not bug_filing_checked and push_to_prod_done:
                check_bug_filed_and_deduped(old_issue_ids)
                bug_filing_checked = True
            time.sleep(5)

        check_queue(queue)

        # If the push_to_prod suite was the last one to finish, the loop above
        # ended before the bug filing check could run. Do it now.
        if not bug_filing_checked:
            check_bug_filed_and_deduped(old_issue_ids)
            bug_filing_checked = True

        push_to_prod_suite.join()
        au_suite.join()
        shard_suite.join()
        asynchronous_suite.join()
    except Exception as e:
        print('Test for pushing to prod failed:\n')
        print(str(e))
        # Send out email about the test failure.
        if arguments.email:
            # NOTE(review): run_suite_output is appended to by do_run_suite,
            # which runs inside the child processes, so the list in this
            # process is presumably still empty here -- confirm.
            gmail_lib.send_email(
                    arguments.email,
                    'Test for pushing to prod failed. Do NOT push!',
                    ('Errors occurred during the test:\n\n%s\n\n' % str(e) +
                     'run_suite output:\n\n%s' % '\n'.join(run_suite_output)))
        raise

    message = ('\nAll tests are completed successfully, prod branch is ready to'
               ' be pushed.')
    print(message)
    # Send out email about test completed successfully.
    if arguments.email:
        gmail_lib.send_email(
                arguments.email,
                'Test for pushing to prod completed successfully',
                message)
566
567
# Run the push test only when this file is executed as a script. main() has
# no return statement, so sys.exit() receives None on success; a failure
# leaves main() as an exception.
if __name__ == '__main__':
    sys.exit(main())
570