# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Test to generate the AFDO profile for a set of ChromeOS benchmarks.

This will run a pre-determined set of benchmarks on the DUT under
the monitoring of the linux "perf" tool. The resulting perf.data
file will then be copied to Google Storage (GS) where it can be
used by the AFDO optimized build.

Given that the telemetry benchmarks are quite unstable on ChromeOS at
this point, this test also supports a mode where the benchmarks are
executed outside of the telemetry framework. This is not the same as
running the benchmarks under telemetry, because no telemetry
measurements are taken, but for the purposes of profiling Chrome it
should be close enough.

Example invocation:
/usr/bin/test_that --debug --board=lumpy <DUT IP>
  --args="ignore_failures=True local=True gs_test_location=True"
  telemetry_AFDOGenerate
"""

from __future__ import print_function

import bz2
import logging
import os
import time

from contextlib import contextmanager

from autotest_lib.client.common_lib import error
from autotest_lib.server import autotest
from autotest_lib.server import test
from autotest_lib.server import utils
from autotest_lib.server.cros import filesystem_util
from autotest_lib.server.cros import telemetry_runner
from autotest_lib.site_utils import test_runner_utils

# These are arguments to the linux "perf" tool.
# The -e value is processor specific and comes from the Intel SDM vol 3b
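# As a rough guide (not authoritative for every CPU): -a samples system-wide,
# -e r20c4 selects a raw branch-retired event encoded per the SDM, -c 50000
# sets the sample period, and -b records the last branch record (LBR) stack
# with each sample.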
PROFILER_ARGS = 'record -a -e r20c4 -c 50000 -b'

# In practice, it takes >2min to copy the perf.data back from the DUT, so set
# this timeout to 600 secs to be safe.
WAIT_FOR_CMD_TIMEOUT_SECS = 600

# Reuse ssh and scp settings from telemetry_Crosperf
RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH
DUT_SCP_OPTIONS = ' '.join([
        '-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null',
        '-o BatchMode=yes', '-o ConnectTimeout=30',
        '-o ServerAliveInterval=900', '-o ServerAliveCountMax=3',
        '-o ConnectionAttempts=4', '-o Protocol=2'
])
DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src/tools/perf'

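# Shell snippet used by _wait_for_process() below: poll `ps <pid>` once per
# second for up to `timeout` iterations; the final negated `ps` makes the
# whole command exit 0 only once the process has terminated.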
_WAIT_CMD_TEMPLATE = """\
for _ in {1..%(timeout)d}; do \
  ps %(pid)d >/dev/null || break; \
  sleep 1; \
done; \
! ps %(pid)d >/dev/null \
"""


def _wait_for_process(host, pid, timeout=-1):
    """Waits for a process on the DUT to terminate.

    @param host: A host object representing the DUT.
    @param pid: The process ID (integer).
    @param timeout: Number of seconds to wait; default is wait forever.
    """
    wait_cmd = _WAIT_CMD_TEMPLATE % {'pid': pid, 'timeout': timeout}
    return host.run(wait_cmd, ignore_status=True).exit_status


# List of benchmarks to run to capture profile information. This is
# based on the "superhero" list and other telemetry benchmarks. The goal is
# to have a short list that is as representative as possible and takes a
# short time to execute. At this point the list of benchmarks is in flux.
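# Each entry below is either (benchmark_name,) or
# (benchmark_name, (extra_telemetry_args, ...)).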
TELEMETRY_AFDO_BENCHMARKS = (
        # page_cycler tests are deprecated. Replace them with loading.desktop.
        ('loading.desktop', ('--pageset-repeat=1',
                             '--story-tag-filter=typical')),
        ('loading.desktop', ('--pageset-repeat=1',
                             '--story-tag-filter=intl_ja_zh')),
        ('rendering.desktop',
         ('--story-tag-filter=tough_canvas',
          '--story-filter="bouncing\\*\\|canvas\\*\\|microsoft\\*"')),
        ('octane', ),
        ('kraken', ),
        ('speedometer2', ),
)

# Temporarily disable this benchmark because it is failing a
# lot. Filed chromium:590127
# ('smoothness.tough_webgl_cases',)

# Some benchmarks removed from the profile set:
# 'page_cycler.morejs' -> uninteresting, seems to fail frequently.
# 'page_cycler.moz' -> seems very old.
# 'media.tough_video_cases' -> removed this because it does not bring
#                              any benefit and takes more than 12 mins

# List of boards where this test can be run.  Currently, it needs a
# machine with at least 4GB of memory or 2GB of /tmp.
# This must be consistent with chromite.
GCC_BOARDS = ['lumpy']

# Should be disjoint with GCC_BOARDS
LLVM_BOARDS = ['chell']

# FIXME(tcwang): only used for testing Async AFDO generation builders.
# Remove this after testing is done.
# Due to crbug.com/991299 and crbug.com/992539, AFDO profiles generated
# by samus are not suitable for production on either master or the branches.
# So samus is suitable for testing profile generation, but its profiles
# should not actually be used.
LLVM_BOARDS_ASYNC = ['samus']


class telemetry_AFDOGenerate(test.test):
    """
    Run one or more telemetry benchmarks under the "perf" monitoring
    tool, generate a "perf.data" file and upload it to GS for consumption
    by the AFDO optimized build.
    """
    version = 1

    def scp_perf_data(self, dut, host_dir):
        """Copy perf data from the DUT.

        @param dut: The autotest host object representing the DUT.
        @param host_dir: The directory on the host where the file is placed.

        @returns status code for the scp command.
        """
        cmd = []
        src = ('root@%s:%s/%s' % (dut.hostname, DUT_CHROME_RESULTS_DIR,
                                  'perf.data'))
        cmd.extend(['scp', DUT_SCP_OPTIONS, RSA_KEY, '-P', str(dut.port), '-v',
                    src, host_dir])
        command = ' '.join(cmd)

        logging.debug('Retrieving Perf Data: %s', command)
        try:
            result = utils.run(command, timeout=WAIT_FOR_CMD_TIMEOUT_SECS)
            exit_code = result.exit_status
        except Exception as e:
            logging.error('Failed to retrieve results: %s', e)
            raise

        logging.debug('command return value: %d', exit_code)
        return exit_code

    @contextmanager
    def perf_on_dut(self):
        """Context manager that starts perf on the DUT and kills it on exit.
        """
        logging.info('Starting perf process in background.')
        perf_cmd = 'nohup perf %s -o %s/perf.data' \
                    % (PROFILER_ARGS, DUT_CHROME_RESULTS_DIR)
        perf_pid = self._host.run_background(perf_cmd)
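        # run_background returns the PID of the backgrounded command as a
        # string; it is converted to int before being passed to
        # _wait_for_process below.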

        try:
            # Use `kill -0` to check whether the perf process is alive
            verify_cmd = 'kill -0 %s' % perf_pid
            if self._host.run(verify_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process not started correctly on DUT')
                raise RuntimeError
            logging.info('Perf PID: %s\nPerf command: %s', perf_pid, perf_cmd)
            yield
        finally:
            # Check whether the process is still alive after the benchmark
            # run; if it is, kill it with -2 (SIGINT) so perf writes out its
            # data before exiting.
            kill_cmd = 'kill -0 %s && killall -2 perf' % perf_pid
            if self._host.run(kill_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process was not killed correctly on DUT.')
                raise RuntimeError
            # The perf process may not terminate right after the kill command;
            # wait until it finishes.
            status = _wait_for_process(self._host, int(perf_pid),
                                       WAIT_FOR_CMD_TIMEOUT_SECS)
            if status != 0:
                logging.error('Error waiting for perf process to be killed.')
                raise RuntimeError
            logging.info('Perf has been killed on DUT.')

        status = self.scp_perf_data(self._host, self.profdir)
        if status != 0:
            logging.error('Cannot copy perf.data file to host.')
            raise RuntimeError

    def run_once(self, host, args):
        """Run a set of telemetry benchmarks.

        @param host: Host machine where the test is run.
        @param args: A dictionary of the arguments that were passed
                to this test.
        @returns None.
        """
        self._host = host
        host_board = host.get_board().split(':')[1]

        if not (host_board in LLVM_BOARDS or host_board in GCC_BOARDS
                or host_board in LLVM_BOARDS_ASYNC):
            raise error.TestFail(
                    'This test cannot be run on board %s' % host_board)

        self._parse_args(args)

        # Make the rootfs writable up front. Otherwise the telemetry code
        # will try to remove write protection itself during run_benchmark,
        # which forces the machine to remount and reboot; we want to avoid
        # that.
        filesystem_util.make_rootfs_writable(self._host)

        with self.perf_on_dut():
            if self._minimal_telemetry:
                self._run_tests_minimal_telemetry()
            else:
                self._telemetry_runner = telemetry_runner.TelemetryRunner(
                        self._host, self._local, telemetry_on_dut=False)

                for benchmark_info in TELEMETRY_AFDO_BENCHMARKS:
                    benchmark = benchmark_info[0]
                    benchmark_args = (benchmark_info[1]
                                      if len(benchmark_info) > 1 else ())
                    try:
                        self._run_test_with_retry(benchmark, *benchmark_args)
                    except error.TestBaseException:
                        if not self._ignore_failures:
                            raise
                        logging.info('Ignoring failure from benchmark %s.',
                                     benchmark)

    def after_run_once(self):
        """After the profile information has been collected, compress it
        and upload it to GS.
        """
        PERF_FILE = 'perf.data'
        COMP_PERF_FILE = 'chromeos-chrome-%s-%s.perf.data'
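        # With the defaults this produces a name like
        # 'chromeos-chrome-amd64-<chrome version>.perf.data'.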
        perf_data = os.path.join(self.profdir, PERF_FILE)
        comp_data = os.path.join(self.profdir,
                                 COMP_PERF_FILE % (self._arch, self._version))
        compressed = self._compress_file(perf_data, comp_data)
        self._gs_upload(compressed, os.path.basename(compressed))

        # Also create a copy of this file using "LATEST" as the version so
        # it can be found in case the builder is looking for a version
        # number that does not match. It is ok to use a slightly old
        # version of this file for the optimized build.
        latest_data = COMP_PERF_FILE % (self._arch, 'LATEST')
        latest_compressed = self._get_compressed_name(latest_data)
        self._gs_upload(compressed, latest_compressed)

        # So that they are not uploaded along with the logs.
        os.remove(compressed)
        os.remove(perf_data)

    def _parse_args(self, args):
        """Parses input arguments to this autotest.

        @param args: Options->values dictionary.
        @raises error.TestFail if a bad option is passed.
        """

        # Set default values for the options.
        # Architecture for which we are collecting afdo data.
        self._arch = 'amd64'
        # Use an alternate GS location where everyone can write.
        # Set the default depending on whether this is executing in
        # the lab environment or not.
        self._gs_test_location = not utils.host_is_in_lab_zone(
                self._host.hostname)
        # Ignore individual test failures.
        self._ignore_failures = False
        # Use local copy of telemetry instead of using the dev server copy.
        self._local = False
        # Chrome version to which the AFDO data corresponds.
        self._version, _ = self._host.get_chrome_version()
        # Run the benchmarks with only minimal support from Telemetry,
        # outside of the Telemetry framework, since the Telemetry benchmarks
        # on ChromeOS are still flaky at this point. Off by default.
        self._minimal_telemetry = False

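        # Option values arrive as strings from test_that's --args, hence the
        # explicit string comparisons against 'True' below.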
        for option_name, value in args.iteritems():
            if option_name == 'arch':
                self._arch = value
            elif option_name == 'gs_test_location':
                self._gs_test_location = (value == 'True')
            elif option_name == 'ignore_failures':
                self._ignore_failures = (value == 'True')
            elif option_name == 'local':
                self._local = (value == 'True')
            elif option_name == 'minimal_telemetry':
                self._minimal_telemetry = (value == 'True')
            elif option_name == 'version':
                self._version = value
            else:
                raise error.TestFail('Unknown option passed: %s' % option_name)

    def _run_test(self, benchmark, *args):
        """Run the benchmark using Telemetry.

        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                     script.
        @raises error.TestFail if execution of the test failed. Also
                re-raises any exceptions thrown by run_telemetry_benchmark.
        """
        try:
            logging.info('Starting run for Telemetry benchmark %s', benchmark)
            start_time = time.time()
            result = self._telemetry_runner.run_telemetry_benchmark(
                    benchmark, None, *args)
            end_time = time.time()
            logging.info('Completed Telemetry benchmark %s in %f seconds',
                         benchmark, end_time - start_time)
        except error.TestBaseException as e:
            end_time = time.time()
            logging.info(
                    'Got exception from Telemetry benchmark %s '
                    'after %f seconds. Exception: %s', benchmark,
                    end_time - start_time, str(e))
            raise

        # We don't generate any keyvals for this run. This is not
        # an official run of the benchmark. We are just running it to get
        # a profile from it.

        if result.status is telemetry_runner.SUCCESS_STATUS:
            logging.info('Benchmark %s succeeded', benchmark)
        else:
            raise error.TestFail('An error occurred while executing'
                                 ' benchmark: %s' % benchmark)

    def _run_test_with_retry(self, benchmark, *args):
        """Run the benchmark using Telemetry. Retry in case of failure.

        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                     script.
        @raises Re-raises any exceptions thrown by _run_test.
        """

        tried = False
        while True:
            try:
                self._run_test(benchmark, *args)
                logging.info('Benchmark %s succeeded on the %s try', benchmark,
                             'first' if not tried else 'second')
                break
            except error.TestBaseException:
                if not tried:
                    tried = True
                    logging.info('Benchmark %s failed. Retrying ...',
                                 benchmark)
                else:
                    logging.info('Benchmark %s failed twice. Not retrying.',
                                 benchmark)
                    raise

    def _run_tests_minimal_telemetry(self):
        """Run the benchmarks using minimal support from Telemetry.

        The benchmarks are run using a client-side autotest test. This test
        controls Chrome directly through the chrome.Chrome support and asks
        Chrome to display the benchmark pages directly, instead of using the
        "page sets" and "measurements" support from Telemetry. In this way we
        avoid the Telemetry benchmark support, which is not yet stable on
        ChromeOS.
        """
        AFDO_GENERATE_CLIENT_TEST = 'telemetry_AFDOGenerateClient'

        # Execute the client side test.
        client_at = autotest.Autotest(self._host)
        client_at.run_test(AFDO_GENERATE_CLIENT_TEST, args='')

    @staticmethod
    def _get_compressed_name(name):
        """Given a file name, return the bz2 compressed name.

        @param name: Name of uncompressed file.
        @returns name of compressed file.
        """
        return name + '.bz2'

    @staticmethod
    def _compress_file(unc_file, com_file):
        """Compresses the specified file with bz2.

        @param unc_file: name of the file to compress.
        @param com_file: prefix name of the compressed file.
        @raises error.TestFail if compression failed.
        @returns Name of the compressed file.
        """
        dest = ''
        with open(unc_file, 'r') as inp:
            dest = telemetry_AFDOGenerate._get_compressed_name(com_file)
            with bz2.BZ2File(dest, 'w') as out:
                for data in inp:
                    out.write(data)
        if not dest or not os.path.isfile(dest):
            raise error.TestFail('Could not compress %s' % unc_file)
        return dest

    def _gs_upload(self, local_file, remote_basename):
        """Uploads a file to a specific Google Storage location.

        @param local_file: name of the file to upload.
        @param remote_basename: basename of the remote file.
        @raises error.TestFail if the upload failed.
        @returns nothing.
        """
        GS_GCC_DEST = 'gs://chromeos-prebuilt/afdo-job/canonicals/%s'
        GS_LLVM_DEST = 'gs://chromeos-toolchain-artifacts/afdo/unvetted/benchmark/%s'
        GS_LLVM_ASYNC_DEST = (
                'gs://chromeos-throw-away-bucket/afdo-job/llvm/benchmarks/%s')
        GS_TEST_DEST = 'gs://chromeos-throw-away-bucket/afdo-job/canonicals/%s'
        GS_ACL = 'project-private'

        board = self._host.get_board().split(':')[1]

        if self._gs_test_location:
            gs_dest = GS_TEST_DEST
        elif board in GCC_BOARDS:
            gs_dest = GS_GCC_DEST
        elif board in LLVM_BOARDS:
            gs_dest = GS_LLVM_DEST
        elif board in LLVM_BOARDS_ASYNC:
            gs_dest = GS_LLVM_ASYNC_DEST
            GS_ACL = 'public-read'
        else:
            raise error.TestFail('This test cannot be run on board %s' % board)

        remote_file = gs_dest % remote_basename

        logging.info('About to upload to GS: %s', remote_file)
        if not utils.gs_upload(
                local_file, remote_file, GS_ACL, result_dir=self.resultsdir):
            logging.info('Failed upload to GS: %s', remote_file)
            raise error.TestFail(
                    'Unable to gs upload %s to %s' % (local_file, remote_file))

        logging.info('Successful upload to GS: %s', remote_file)
