#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Runs a statistical hypothesis test on two given benchmark results.

Evaluates two benchmark results, given as Chart JSON files, to determine
whether they differ in a statistically significant way. This evaluation
should be run using Chart JSON files created by one of the available
benchmarks in tools/perf/run_benchmark.

A "benchmark" (e.g. startup.cold.blank_page) includes several "metrics" (e.g.
first_main_frame_load_time).
"""

from __future__ import print_function
import argparse
import json
import os
import sys

sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                '..')))
from statistical_analysis import results_stats


DEFAULT_SIGNIFICANCE_LEVEL = 0.05
DEFAULT_STATISTICAL_TEST = results_stats.MANN


def LoadJsonFromPath(json_path):
  """Returns the parsed JSON from the specified location."""
  with open(os.path.abspath(json_path)) as data_file:
    return json.load(data_file)


def PrintOutcomeLine(name, max_name_length, outcome, print_p_value):
  """Prints a single output line, e.g. 'metric_1 True 0.03'."""
  print('{:{}}{}'.format(name, max_name_length + 2, outcome[0]), end='')
  if print_p_value:
    print('\t{:.10f}'.format(outcome[1]), end='')
  print()


def PrintTestOutcome(test_outcome_dict, test_name, significance_level,
                     print_p_value):
  """Prints the given test outcomes to the command line.

  Prints the p-value for each metric's outcome if |print_p_value| is True,
  along with the name of the executed statistical test and the significance
  level.
  """
  print('Statistical analysis results (True=Performance difference likely)\n'
        '(Test: {}, Significance Level: {})\n'.format(test_name,
                                                      significance_level))

  max_metric_name_len = max([len(metric_name) for metric_name in
                             test_outcome_dict])

  for metric_name, outcome in test_outcome_dict.iteritems():
    PrintOutcomeLine(metric_name, max_metric_name_len, outcome, print_p_value)


def PrintPagesetTestOutcome(test_outcome_dict, test_name, significance_level,
                            print_p_value, print_details):
  """Prints the given test outcomes to the command line.

  Prints a summary combining the p-values of the pageset for each metric,
  then prints results for each metric/page combination if |print_details| is
  True.
  """
  print('Statistical analysis results (True=Performance difference likely)\n'
        '(Test: {}, Significance Level: {})\n'.format(test_name,
                                                      significance_level))

  # Print summarized version at the top.
  max_metric_name_len = max([len(metric_name) for metric_name in
                             test_outcome_dict])
  print('Summary (combined p-values for all pages in pageset):\n')
  for metric_name, pageset in test_outcome_dict.iteritems():
    combined_p_value = results_stats.CombinePValues([p[1] for p in
                                                     pageset.itervalues()])
    outcome = (combined_p_value < significance_level, combined_p_value)
    PrintOutcomeLine(metric_name, max_metric_name_len, outcome, print_p_value)
  print()

  if not print_details:
    return

  # Print outcome for every metric/page combination.
  for metric_name, pageset in test_outcome_dict.iteritems():
    max_page_name_len = max([len(page_name) for page_name in pageset])
    print('{}:'.format(metric_name))
    for page_name, page_outcome in pageset.iteritems():
      PrintOutcomeLine(page_name, max_page_name_len, page_outcome,
                       print_p_value)
    print()
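

# NOTE: The pageset summary above relies on results_stats.CombinePValues to
# collapse the per-page p-values for one metric into a single pageset-level
# p-value. The commented sketch below is an illustration only, assuming a
# Fisher-style combination (the actual method lives in
# statistical_analysis/results_stats.py and may differ). Fisher's method
# computes X^2 = -2 * sum(ln(p_i)) over k independent p-values; under the
# null hypothesis X^2 follows a chi-squared distribution with 2k degrees of
# freedom.
#
#   from scipy import stats
#
#   per_page_p_values = [0.04, 0.20, 0.11]  # hypothetical example data
#   _, combined_p_value = stats.combine_pvalues(per_page_p_values,
#                                               method='fisher')
#   print(combined_p_value < DEFAULT_SIGNIFICANCE_LEVEL)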


def main(args=None):
  """Sets up the parser and runs a statistical test on benchmark results.

  Sets up the command line parser and its arguments, loads the Chart JSONs
  from the given paths, runs the specified statistical hypothesis test on the
  results and prints the test outcomes.
  """
  if args is None:
    args = sys.argv[1:]

  parser = argparse.ArgumentParser(description="""Runs statistical
                                   significance tests on two given Chart JSON
                                   benchmark results produced by the Telemetry
                                   benchmarks.""")

  parser.add_argument(dest='json_paths', nargs=2,
                      help='Two Chart JSON file locations.')

  parser.add_argument('--significance', dest='significance_level',
                      default=DEFAULT_SIGNIFICANCE_LEVEL, type=float,
                      help="""The significance level is the type I error rate,
                      i.e. the probability of concluding that the benchmark
                      results are different although they're not. Default: {},
                      which is common in statistical hypothesis
                      testing.""".format(DEFAULT_SIGNIFICANCE_LEVEL))

  parser.add_argument('--statistical-test', dest='statistical_test',
                      default=DEFAULT_STATISTICAL_TEST,
                      choices=results_stats.ALL_TEST_OPTIONS,
                      help="""Specifies the statistical hypothesis test that
                      is used. Choices are: Mann-Whitney U-test,
                      Kolmogorov-Smirnov, Welch's t-test. Default:
                      Mann-Whitney U-test.""")

  parser.add_argument('-p', action='store_true', dest='print_p_value',
                      help="""If the -p flag is set, the output will include
                      the p-value for each metric.""")

  parser.add_argument('-d', action='store_true', dest='print_details',
                      help="""If the -d flag is set, the output will be more
                      detailed for benchmarks containing pagesets, giving
                      results for every metric/page combination after a
                      summary at the top.""")

  args = parser.parse_args(args)

  result_jsons = [LoadJsonFromPath(json_path) for json_path in args.json_paths]

  if (results_stats.DoesChartJSONContainPageset(result_jsons[0]) and
      results_stats.DoesChartJSONContainPageset(result_jsons[1])):
    # Benchmark containing a pageset.
    result_dict_1, result_dict_2 = (
        [results_stats.CreatePagesetBenchmarkResultDict(result_json)
         for result_json in result_jsons])
    test_outcome_dict = results_stats.ArePagesetBenchmarkResultsDifferent(
        result_dict_1, result_dict_2, args.statistical_test,
        args.significance_level)

    PrintPagesetTestOutcome(test_outcome_dict, args.statistical_test,
                            args.significance_level, args.print_p_value,
                            args.print_details)

  else:
    # Benchmark not containing a pageset.
    # (If only one JSON contains a pageset, results_stats raises an error.)
    result_dict_1, result_dict_2 = (
        [results_stats.CreateBenchmarkResultDict(result_json)
         for result_json in result_jsons])
    test_outcome_dict = (
        results_stats.AreBenchmarkResultsDifferent(result_dict_1,
                                                   result_dict_2,
                                                   args.statistical_test,
                                                   args.significance_level))

    PrintTestOutcome(test_outcome_dict, args.statistical_test,
                     args.significance_level, args.print_p_value)


if __name__ == '__main__':
  sys.exit(main())
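
# Example invocation (file names below are placeholders; the Chart JSON
# inputs come from runs of one of the benchmarks in tools/perf/run_benchmark):
#
#   ./compare_benchmark_results.py with_patch.json without_patch.json \
#       --significance=0.01 -p -d
#
# The three supported tests roughly correspond to the following scipy calls;
# this mapping is an assumption for illustration only, since results_stats
# owns the actual implementation:
#
#   from scipy import stats
#   _, p = stats.mannwhitneyu(sample_1, sample_2)      # Mann-Whitney U-test
#   _, p = stats.ks_2samp(sample_1, sample_2)          # Kolmogorov-Smirnov
#   _, p = stats.ttest_ind(sample_1, sample_2,
#                          equal_var=False)            # Welch's t-test
#   is_different = p < significance_level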