1# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4""" 5Test to generate the AFDO profile for a set of ChromeOS benchmarks. 6 7This will run a pre-determined set of benchmarks on the DUT under 8the monitoring of the linux "perf" tool. The resulting perf.data 9file will then be copied to Google Storage (GS) where it can be 10used by the AFDO optimized build. 11 12Given that the telemetry benchmarks are quite unstable on ChromeOS at 13this point, this test also supports a mode where the benchmarks are 14executed outside of the telemetry framework. It is not the same as 15executing the benchmarks under telemetry because there is no telemetry 16measurement taken but, for the purposes of profiling Chrome, it should 17be pretty close. 18 19Example invocation: 20/usr/bin/test_that --debug --board=lumpy <DUT IP> 21 --args="ignore_failures=True local=True gs_test_location=True" 22 telemetry_AFDOGenerate 23""" 24 25from __future__ import print_function 26 27import bz2 28import logging 29import os 30import time 31 32from contextlib import contextmanager 33 34from autotest_lib.client.common_lib import error 35from autotest_lib.server import autotest 36from autotest_lib.server import test 37from autotest_lib.server import utils 38from autotest_lib.server.cros import filesystem_util 39from autotest_lib.server.cros import telemetry_runner 40from autotest_lib.site_utils import test_runner_utils 41 42# These are arguments to the linux "perf" tool. 43# The -e value is processor specific and comes from the Intel SDM vol 3b 44PROFILER_ARGS = 'record -a -e r20c4 -c 50000 -b' 45 46# In practice, it takes >2min to copy the perf.data back from the DUT, set 47# this timeout to 600 secs to be safe. 
48WAIT_FOR_CMD_TIMEOUT_SECS = 600 49 50# Reuse ssh and scp settings from telemetry_Crosperf 51RSA_KEY = '-i %s' % test_runner_utils.TEST_KEY_PATH 52DUT_SCP_OPTIONS = ' '.join([ 53 '-o StrictHostKeyChecking=no', '-o UserKnownHostsFile=/dev/null', 54 '-o BatchMode=yes', '-o ConnectTimeout=30', 55 '-o ServerAliveInterval=900', '-o ServerAliveCountMax=3', 56 '-o ConnectionAttempts=4', '-o Protocol=2' 57]) 58DUT_CHROME_RESULTS_DIR = '/usr/local/telemetry/src/tools/perf' 59 60_WAIT_CMD_TEMPLATE = """\ 61for _ in {1..%(timeout)d}; do \ 62 ps %(pid)d >/dev/null || break; \ 63 sleep 1; \ 64done; \ 65! ps %(pid)d >/dev/null \ 66""" 67 68 69def _wait_for_process(host, pid, timeout=-1): 70 """Waits for a process on the DUT to terminate. 71 72 @param host: A host object representing the DUT. 73 @param pid: The process ID (integer). 74 @param timeout: Number of seconds to wait; default is wait forever. 75 """ 76 wait_cmd = _WAIT_CMD_TEMPLATE % {'pid': pid, 'timeout': timeout} 77 return host.run(wait_cmd, ignore_status=True).exit_status 78 79 80# List of benchmarks to run to capture profile information. This is 81# based on the "superhero" list and other telemetry benchmarks. Goal is 82# to have a short list that is as representative as possible and takes a 83# short time to execute. At this point the list of benchmarks is in flux. 84TELEMETRY_AFDO_BENCHMARKS = ( 85 # page_cycler tests are deprecated. Replace them with loading.desktop. 86 ('loading.desktop', ('--pageset-repeat=1', 87 '--story-tag-filter=typical')), 88 ('loading.desktop', ('--pageset-repeat=1', 89 '--story-tag-filter=intl_ja_zh')), 90 ('rendering.desktop', 91 ('--story-tag-filter=tough_canvas', 92 '--story-filter="bouncing\\*\\|canvas\\*\\|microsoft\\*"')), 93 ('octane', ), 94 ('kraken', ), 95 ('speedometer2', ), 96) 97 98# Temporarily disable this benchmark because it is failing a 99# lot. 
# Filed chromium:590127
# ('smoothness.tough_webgl_cases',)

# Some benchmarks removed from the profile set:
# 'page_cycler.morejs' -> uninteresting, seems to fail frequently,
# 'page_cycler.moz' -> seems very old.
# 'media.tough_video_cases' -> removed this because it does not bring
# any benefit and takes more than 12 mins

# List of boards where this test can be run. Currently, it needs a
# machines with at least 4GB of memory or 2GB of /tmp.
# This must be consistent with chromite.
GCC_BOARDS = ['lumpy']

# Should be disjoint with GCC_BOARDS
LLVM_BOARDS = ['chell']

# FIXME(tcwang): only used for testing Async AFDO generation builders.
# Remove this after testing is done.
# Due to crbug.com/991299 and crbug.com/992539, AFDO profiles generated
# by samus is not suitable for production in both master and branch.
# So it's suitable to test generation profiles but not actually use it.
LLVM_BOARDS_ASYNC = ['samus']


class telemetry_AFDOGenerate(test.test):
    """
    Run one or more telemetry benchmarks under the "perf" monitoring
    tool, generate a "perf.data" file and upload to GS for consumption
    by the AFDO optimized build.
    """
    version = 1

    def scp_perf_data(self, dut, host_dir):
        """Copy perf data from dut.

        Builds and runs an scp command on the test server that pulls
        DUT_CHROME_RESULTS_DIR/perf.data from the DUT into host_dir.

        @param dut: The autotest host object representing DUT.
        @param host_dir: The directory on host to put the file.

        @returns status code for scp command (0 on success).
        """
        cmd = []
        src = ('root@%s:%s/%s' % (dut.hostname, DUT_CHROME_RESULTS_DIR,
                                  'perf.data'))
        # DUT_SCP_OPTIONS and RSA_KEY are pre-joined strings, so each is
        # a single element here; the list is flattened by ' '.join below.
        cmd.extend(['scp', DUT_SCP_OPTIONS, RSA_KEY, '-P', str(dut.port), '-v',
                    src, host_dir])
        command = ' '.join(cmd)

        logging.debug('Retrieving Perf Data: %s', command)
        try:
            result = utils.run(command, timeout=WAIT_FOR_CMD_TIMEOUT_SECS)
            exit_code = result.exit_status
        except Exception as e:
            # Log for diagnosis, then let the caller see the failure.
            logging.error('Failed to retrieve results: %s', e)
            raise

        logging.debug('command return value: %d', exit_code)
        return exit_code

    @contextmanager
    def perf_on_dut(self):
        """Start and kill perf process on DUT.

        Context manager: starts a system-wide 'perf record' in the
        background on the DUT, yields while the benchmarks run, then
        stops perf with SIGINT, waits for it to finish writing
        perf.data, and copies the file back into self.profdir.

        @raises RuntimeError if perf cannot be started, stopped, or its
                output copied back.
        """
        logging.info('Starting perf process in background.')
        perf_cmd = 'nohup perf %s -o %s/perf.data' \
                % (PROFILER_ARGS, DUT_CHROME_RESULTS_DIR)
        perf_pid = self._host.run_background(perf_cmd)

        try:
            # Use `kill -0` to check whether the perf process is alive
            verify_cmd = 'kill -0 %s' % perf_pid
            if self._host.run(verify_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process not started correctly on DUT')
                raise RuntimeError
            logging.info('Perf PID: %s\nPerf command: %s', perf_pid, perf_cmd)
            yield
        finally:
            # Check if process is still alive after benchmark run, if yes,
            # then kill it with -2 (which is SIGINT).
            # NOTE(review): if perf exited on its own before this point,
            # the `kill -0` check fails and this raises RuntimeError even
            # though there is nothing to kill; also, raising here will
            # replace any exception already propagating out of the
            # `yield` body.  Confirm both are intended.
            kill_cmd = 'kill -0 %s && killall -2 perf' % perf_pid
            if self._host.run(kill_cmd, ignore_status=True).exit_status != 0:
                logging.error('Perf process is not killed correctly on DUT.')
                raise RuntimeError
            # Perf process may not be terminated right after the kill command,
            # wait until perf process finishes.
            status = _wait_for_process(self._host, int(perf_pid),
                                       WAIT_FOR_CMD_TIMEOUT_SECS)
            if status != 0:
                logging.error('Error waiting for perf process to be killed.')
                raise RuntimeError
            logging.info('Perf has been killed on DUT.')

        # Only reached when the body completed without raising; pull the
        # profile data back to the server-side profile directory.
        status = self.scp_perf_data(self._host, self.profdir)
        if status != 0:
            logging.error('Cannot copy perf.data file to host.')
            raise RuntimeError

    def run_once(self, host, args):
        """Run a set of telemetry benchmarks.

        @param host: Host machine where test is run
        @param args: A dictionary of the arguments that were passed
                to this test.
        @returns None.
        @raises error.TestFail if the DUT board is not supported.
        """
        self._host = host
        # get_board() returns e.g. 'board:lumpy'; keep the name part.
        host_board = host.get_board().split(':')[1]

        if not (host_board in LLVM_BOARDS or host_board in GCC_BOARDS
                or host_board in LLVM_BOARDS_ASYNC):
            raise error.TestFail(
                    'This test cannot be run on board %s' % host_board)

        self._parse_args(args)

        # Remove write protection on host, as now telemetry code will
        # try to remove write protection that causes the machine to
        # reboot and remount during run_benchmark. We want to avoid it.
        filesystem_util.make_rootfs_writable(self._host)

        # Benchmarks execute inside the perf_on_dut() context so the
        # whole run is captured in one perf.data profile.
        with self.perf_on_dut():
            if self._minimal_telemetry:
                self._run_tests_minimal_telemetry()
            else:
                self._telemetry_runner = telemetry_runner.TelemetryRunner(
                        self._host, self._local, telemetry_on_dut=False)

                for benchmark_info in TELEMETRY_AFDO_BENCHMARKS:
                    benchmark = benchmark_info[0]
                    # NOTE(review): this rebinds the method parameter
                    # `args` (the options dict) as the per-benchmark
                    # extra-arguments tuple; harmless here because the
                    # dict is no longer used, but worth renaming.
                    args = (
                    ) if len(benchmark_info) == 1 else benchmark_info[1]
                    try:
                        self._run_test_with_retry(benchmark, *args)
                    except error.TestBaseException:
                        if not self._ignore_failures:
                            raise
                        logging.info('Ignoring failure from benchmark %s.',
                                     benchmark)

    def after_run_once(self):
        """After the profile information has been collected, compress it
        and upload it to GS.

        Uploads the profile twice: once under the exact Chrome version
        and once as a "LATEST" alias, then removes the local copies so
        they are not archived with the test logs.
        """
        PERF_FILE = 'perf.data'
        COMP_PERF_FILE = 'chromeos-chrome-%s-%s.perf.data'
        perf_data = os.path.join(self.profdir, PERF_FILE)
        comp_data = os.path.join(self.profdir,
                                 COMP_PERF_FILE % (self._arch, self._version))
        compressed = self._compress_file(perf_data, comp_data)
        self._gs_upload(compressed, os.path.basename(compressed))

        # Also create copy of this file using "LATEST" as version so
        # it can be found in case the builder is looking for a version
        # number that does not match. It is ok to use a slightly old
        # version of this file for the optimized build.
        latest_data = COMP_PERF_FILE % (self._arch, 'LATEST')
        latest_compressed = self._get_compressed_name(latest_data)
        self._gs_upload(compressed, latest_compressed)

        # So that they are not uploaded along with the logs.
        os.remove(compressed)
        os.remove(perf_data)

    def _parse_args(self, args):
        """Parses input arguments to this autotest.

        All option values arrive as strings; booleans are recognized by
        exact comparison with the string 'True'.

        @param args: Options->values dictionary.
        @raises error.TestFail if a bad option is passed.
        """

        # Set default values for the options.
        # Architecture for which we are collecting afdo data.
        self._arch = 'amd64'
        # Use an alternate GS location where everyone can write.
        # Set default depending on whether this is executing in
        # the lab environment or not
        self._gs_test_location = not utils.host_is_in_lab_zone(
                self._host.hostname)
        # Ignore individual test failures.
        self._ignore_failures = False
        # Use local copy of telemetry instead of using the dev server copy.
        self._local = False
        # Chrome version to which the AFDO data corresponds.
        self._version, _ = self._host.get_chrome_version()
        # Try to use the minimal support from Telemetry. The Telemetry
        # benchmarks in ChromeOS are too flaky at this point. So, initially,
        # this will be set to True by default.
        self._minimal_telemetry = False

        # NOTE(review): dict.iteritems() is Python 2 only, consistent
        # with the rest of this file (print_function import, etc.).
        for option_name, value in args.iteritems():
            if option_name == 'arch':
                self._arch = value
            elif option_name == 'gs_test_location':
                self._gs_test_location = (value == 'True')
            elif option_name == 'ignore_failures':
                self._ignore_failures = (value == 'True')
            elif option_name == 'local':
                self._local = (value == 'True')
            elif option_name == 'minimal_telemetry':
                self._minimal_telemetry = (value == 'True')
            elif option_name == 'version':
                self._version = value
            else:
                raise error.TestFail('Unknown option passed: %s' % option_name)

    def _run_test(self, benchmark, *args):
        """Run the benchmark using Telemetry.

        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                script.
        @raises Raises error.TestFail if execution of test failed.
                Also re-raise any exceptions thrown by run_telemetry benchmark.
        """
        try:
            logging.info('Starting run for Telemetry benchmark %s', benchmark)
            # start_time is assigned before the call that can raise, so
            # it is always defined in the except handler below.
            start_time = time.time()
            result = self._telemetry_runner.run_telemetry_benchmark(
                    benchmark, None, *args)
            end_time = time.time()
            logging.info('Completed Telemetry benchmark %s in %f seconds',
                         benchmark, end_time - start_time)
        except error.TestBaseException as e:
            end_time = time.time()
            logging.info(
                    'Got exception from Telemetry benchmark %s '
                    'after %f seconds. Exception: %s', benchmark,
                    end_time - start_time, str(e))
            raise

        # We dont generate any keyvals for this run. This is not
        # an official run of the benchmark. We are just running it to get
        # a profile from it.

        # NOTE(review): identity (`is`) comparison assumes
        # SUCCESS_STATUS is a module-level singleton constant — confirm
        # against telemetry_runner.
        if result.status is telemetry_runner.SUCCESS_STATUS:
            logging.info('Benchmark %s succeeded', benchmark)
        else:
            raise error.TestFail('An error occurred while executing'
                                 ' benchmark: %s' % benchmark)

    def _run_test_with_retry(self, benchmark, *args):
        """Run the benchmark using Telemetry. Retry in case of failure.

        Runs the benchmark at most twice: a first attempt plus one
        retry after a TestBaseException.

        @param benchmark: Name of the benchmark to run.
        @param args: Additional arguments to pass to the telemetry execution
                script.
        @raises Re-raise any exceptions thrown by _run_test.
        """

        tried = False
        while True:
            try:
                self._run_test(benchmark, *args)
                logging.info('Benchmark %s succeeded on %s try', benchmark,
                             'first' if not tried else 'second')
                break
            except error.TestBaseException:
                if not tried:
                    tried = True
                    logging.info('Benchmark %s failed. Retrying ...',
                                 benchmark)
                else:
                    logging.info('Benchmark %s failed twice. Not retrying',
                                 benchmark)
                    raise

    def _run_tests_minimal_telemetry(self):
        """Run the benchmarks using the minimal support from Telemetry.

        The benchmarks are run using a client side autotest test. This test
        will control Chrome directly using the chrome.Chrome support and it
        will ask Chrome to display the benchmark pages directly instead of
        using the "page sets" and "measurements" support from Telemetry.
        In this way we avoid using Telemetry benchmark support which is not
        stable on ChromeOS yet.
        """
        AFDO_GENERATE_CLIENT_TEST = 'telemetry_AFDOGenerateClient'

        # Execute the client side test.
        client_at = autotest.Autotest(self._host)
        client_at.run_test(AFDO_GENERATE_CLIENT_TEST, args='')

    @staticmethod
    def _get_compressed_name(name):
        """Given a file name, return bz2 compressed name.
        @param name: Name of uncompressed file.
        @returns name of compressed file.
        """
        return name + '.bz2'

    @staticmethod
    def _compress_file(unc_file, com_file):
        """Compresses specified file with bz2.

        @param unc_file: name of file to compress.
        @param com_file: prefix name of compressed file.
        @raises error.TestFail if compression failed
        @returns Name of compressed file.
        """
        dest = ''
        # NOTE(review): perf.data is binary but is opened in text mode
        # 'r' and iterated by "lines"; this is byte-oriented (and safe)
        # on Python 2 but would need 'rb' under Python 3.
        with open(unc_file, 'r') as inp:
            dest = telemetry_AFDOGenerate._get_compressed_name(com_file)
            with bz2.BZ2File(dest, 'w') as out:
                for data in inp:
                    out.write(data)
        if not dest or not os.path.isfile(dest):
            raise error.TestFail('Could not compress %s' % unc_file)
        return dest

    def _gs_upload(self, local_file, remote_basename):
        """Uploads file to google storage specific location.

        The destination bucket depends on whether this is a test run
        and on which toolchain (GCC/LLVM/async-LLVM) the board uses.

        @param local_file: name of file to upload.
        @param remote_basename: basename of remote file.
        @raises error.TestFail if upload failed.
        @returns nothing.
        """
        GS_GCC_DEST = 'gs://chromeos-prebuilt/afdo-job/canonicals/%s'
        GS_LLVM_DEST = 'gs://chromeos-toolchain-artifacts/afdo/unvetted/benchmark/%s'
        GS_LLVM_ASYNC_DEST = \
                'gs://chromeos-throw-away-bucket/afdo-job/llvm/benchmarks/%s'
        GS_TEST_DEST = 'gs://chromeos-throw-away-bucket/afdo-job/canonicals/%s'
        GS_ACL = 'project-private'

        board = self._host.get_board().split(':')[1]

        if self._gs_test_location:
            gs_dest = GS_TEST_DEST
        elif board in GCC_BOARDS:
            gs_dest = GS_GCC_DEST
        elif board in LLVM_BOARDS:
            gs_dest = GS_LLVM_DEST
        elif board in LLVM_BOARDS_ASYNC:
            gs_dest = GS_LLVM_ASYNC_DEST
            # Async-generation profiles go to a throw-away bucket with a
            # world-readable ACL (rebinds the local GS_ACL "constant").
            GS_ACL = 'public-read'
        else:
            raise error.TestFail('This test cannot be run on board %s' % board)

        remote_file = gs_dest % remote_basename

        logging.info('About to upload to GS: %s', remote_file)
        if not utils.gs_upload(
                local_file, remote_file, GS_ACL, result_dir=self.resultsdir):
            logging.info('Failed upload to GS: %s', remote_file)
            raise error.TestFail(
                    'Unable to gs upload %s to %s' % (local_file, remote_file))

        logging.info('Successfull upload to GS: %s', remote_file)