# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Statistical hypothesis testing for comparing benchmark results."""

try:
  import numpy as np
except ImportError:
  np = None

try:
  from scipy import stats
  import scipy.version
except ImportError:
  stats = None


MANN = 'mann'
KOLMOGOROV = 'kolmogorov'
WELCH = 'welch'
ALL_TEST_OPTIONS = [MANN, KOLMOGOROV, WELCH]


class DictMismatchError(Exception):
  """Provides exception for result dicts with mismatching keys/metrics."""
  def __str__(self):
    return ("Provided benchmark result dicts' keys/metrics do not match. "
            "Check if they have been created by the same benchmark.")


class SampleSizeError(Exception):
  """Provides exception for sample sizes too small for Mann-Whitney U-test."""
  def __str__(self):
    return ('At least one sample size is smaller than 20, which is too small '
            'for Mann-Whitney U-test.')


class NonNormalSampleError(Exception):
  """Provides exception for samples that are not normally distributed."""
  def __str__(self):
    return ("At least one sample is not normally distributed as required by "
            "Welch's t-test.")


def IsScipyMannTestOneSided():
  """Checks if the installed Scipy version is < 0.17.0.

  Scipy 0.17.0 changed stats.mannwhitneyu(...) from returning a one-sided to
  returning a two-sided p-value.
  """
  scipy_version = [int(num) for num in scipy.version.version.split('.')]
  return scipy_version[0] < 1 and scipy_version[1] < 17


def GetChartsFromBenchmarkResultJson(benchmark_result_json):
  """Returns the 'charts' element from a given Chart-JSON.

  Excludes entries that are not list_of_scalar_values and empty entries. Also
  raises errors for an invalid JSON format or an empty 'charts' element.

  Raises:
    ValueError: Provided Chart-JSON is either not valid or 'charts' is empty.
  """
  try:
    charts = benchmark_result_json['charts']
  except KeyError:
    raise ValueError('Invalid benchmark result format. Make sure input is a '
                     'Chart-JSON.\nProvided JSON:\n',
                     repr(benchmark_result_json))
  if not charts:
    raise ValueError("Invalid benchmark result format. Dict entry 'charts' is "
                     "empty.")

  def IsValidPageContent(page_content):
    return (page_content['type'] == 'list_of_scalar_values' and
            'values' in page_content)

  def CreatePageDict(metric_content):
    return {page_name: page_content
            for page_name, page_content in metric_content.iteritems()
            if IsValidPageContent(page_content)}

  charts_valid_entries_only = {}
  for metric_name, metric_content in charts.iteritems():
    inner_page_dict = CreatePageDict(metric_content)
    if not inner_page_dict:
      continue
    charts_valid_entries_only[metric_name] = inner_page_dict

  return charts_valid_entries_only


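# Illustrative sketch of the Chart-JSON 'charts' structure the functions here
# operate on (the metric, page, and value entries are hypothetical, assembled
# from the docstring examples below rather than a real benchmark run):
#
#   {
#     'charts': {
#       'tab_load_time': {
#         'Gmail.com': {'type': 'list_of_scalar_values',
#                       'values': [650, 700]},
#         'summary': {'type': 'list_of_scalar_values',
#                     'values': [650, 700]},
#       }
#     }
#   }
#
# GetChartsFromBenchmarkResultJson() would return the 'charts' mapping with
# both page entries kept, since both are valid list_of_scalar_values entries.

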
100 """ 101 charts = GetChartsFromBenchmarkResultJson(benchmark_result_json) 102 103 arbitrary_metric_in_charts = charts.itervalues().next() 104 return len(arbitrary_metric_in_charts) > 2 105 106 107def CreateBenchmarkResultDict(benchmark_result_json): 108 """Creates a dict of format {metric_name: list of benchmark results}. 109 110 Takes a raw result Chart-JSON produced when using '--output-format=chartjson' 111 for 'run_benchmark'. 112 113 Args: 114 benchmark_result_json: Benchmark result Chart-JSON produced by Telemetry. 115 116 Returns: 117 Dictionary of benchmark results. 118 Example dict entry: 'tab_load_time': [650, 700, ...]. 119 """ 120 charts = GetChartsFromBenchmarkResultJson(benchmark_result_json) 121 122 benchmark_result_dict = {} 123 for metric_name, metric_content in charts.iteritems(): 124 benchmark_result_dict[metric_name] = metric_content['summary']['values'] 125 126 return benchmark_result_dict 127 128 129def CreatePagesetBenchmarkResultDict(benchmark_result_json): 130 """Creates a dict of format {metric_name: {page_name: list of page results}}. 131 132 Takes a raw result Chart-JSON produced by 'run_benchmark' when using 133 '--output-format=chartjson' and when specifying a benchmark that has a 134 pageset (e.g. top25mobile). Run 'DoesChartJSONContainPageset' to check if 135 your Chart-JSON contains a pageset. 136 137 Args: 138 benchmark_result_json: Benchmark result Chart-JSON produced by Telemetry. 139 140 Returns: 141 Dictionary of benchmark results. 142 Example dict entry: 'tab_load_time': 'Gmail.com': [650, 700, ...]. 143 """ 144 charts = GetChartsFromBenchmarkResultJson(benchmark_result_json) 145 146 benchmark_result_dict = {} 147 for metric_name, metric_content in charts.iteritems(): 148 benchmark_result_dict[metric_name] = {} 149 for page_name, page_content in metric_content.iteritems(): 150 if page_name == 'summary': 151 continue 152 benchmark_result_dict[metric_name][page_name] = page_content['values'] 153 154 return benchmark_result_dict 155 156 157def CombinePValues(p_values): 158 """Combines p-values from a number of tests using Fisher's Method. 159 160 The tests the p-values result from must test the same null hypothesis and be 161 independent. 162 163 Args: 164 p_values: List of p-values. 165 166 Returns: 167 combined_p_value: Combined p-value according to Fisher's method. 168 """ 169 # TODO (wierichs): Update to use scipy.stats.combine_pvalues(p_values) when 170 # Scipy v0.15.0 becomes available as standard version. 171 if not np: 172 raise ImportError('This function requires Numpy.') 173 174 if not stats: 175 raise ImportError('This function requires Scipy.') 176 177 test_statistic = -2 * np.sum(np.log(p_values)) 178 p_value = stats.chi2.sf(test_statistic, 2 * len(p_values)) 179 return p_value 180 181 182def IsNormallyDistributed(sample, significance_level=0.05): 183 """Calculates Shapiro-Wilk test for normality for a single sample. 184 185 Note that normality is a requirement for Welch's t-test. 186 187 Args: 188 sample: List of values. 189 significance_level: The significance level the p-value is compared against. 190 191 Returns: 192 is_normally_distributed: Returns True or False. 193 p_value: The calculated p-value. 
194 """ 195 if not stats: 196 raise ImportError('This function requires Scipy.') 197 198 # pylint: disable=unbalanced-tuple-unpacking 199 _, p_value = stats.shapiro(sample) 200 201 is_normally_distributed = p_value >= significance_level 202 return is_normally_distributed, p_value 203 204 205def AreSamplesDifferent(sample_1, sample_2, test=MANN, 206 significance_level=0.05): 207 """Calculates the specified statistical test for the given samples. 208 209 The null hypothesis for each test is that the two populations that the 210 samples are taken from are not significantly different. Tests are two-tailed. 211 212 Raises: 213 ImportError: Scipy is not installed. 214 SampleSizeError: Sample size is too small for MANN. 215 NonNormalSampleError: Sample is not normally distributed as required by 216 WELCH. 217 218 Args: 219 sample_1: First list of values. 220 sample_2: Second list of values. 221 test: Statistical test that is used. 222 significance_level: The significance level the p-value is compared against. 223 224 Returns: 225 is_different: True or False, depending on the test outcome. 226 p_value: The p-value the test has produced. 227 """ 228 if not stats: 229 raise ImportError('This function requires Scipy.') 230 231 if test == MANN: 232 if len(sample_1) < 20 or len(sample_2) < 20: 233 raise SampleSizeError() 234 try: 235 _, p_value = stats.mannwhitneyu(sample_1, sample_2, use_continuity=True) 236 except ValueError: 237 # If sum of ranks of values in |sample_1| and |sample_2| is equal, 238 # scipy.stats.mannwhitneyu raises ValueError. Treat this as a 1.0 p-value 239 # (indistinguishable). 240 return (False, 1.0) 241 242 if IsScipyMannTestOneSided(): 243 p_value = p_value * 2 if p_value < 0.5 else 1 244 245 elif test == KOLMOGOROV: 246 _, p_value = stats.ks_2samp(sample_1, sample_2) 247 248 elif test == WELCH: 249 if not (IsNormallyDistributed(sample_1, significance_level)[0] and 250 IsNormallyDistributed(sample_2, significance_level)[0]): 251 raise NonNormalSampleError() 252 _, p_value = stats.ttest_ind(sample_1, sample_2, equal_var=False) 253 # TODO: Add k sample anderson darling test 254 255 is_different = p_value <= significance_level 256 return is_different, p_value 257 258 259def AssertThatKeysMatch(result_dict_1, result_dict_2): 260 """Raises an exception if benchmark dicts do not contain the same metrics.""" 261 if result_dict_1.viewkeys() != result_dict_2.viewkeys(): 262 raise DictMismatchError() 263 264 265def AreBenchmarkResultsDifferent(result_dict_1, result_dict_2, test=MANN, 266 significance_level=0.05): 267 """Runs the given test on the results of each metric in the benchmarks. 268 269 Checks if the dicts have been created from the same benchmark, i.e. if 270 metric names match (e.g. first_non_empty_paint_time). Then runs the specified 271 statistical test on each metric's samples to find if they vary significantly. 272 273 Args: 274 result_dict_1: Benchmark result dict of format {metric: list of values}. 275 result_dict_2: Benchmark result dict of format {metric: list of values}. 276 test: Statistical test that is used. 277 significance_level: The significance level the p-value is compared against. 278 279 Returns: 280 test_outcome_dict: Format {metric: (bool is_different, p-value)}. 
281 """ 282 AssertThatKeysMatch(result_dict_1, result_dict_2) 283 284 test_outcome_dict = {} 285 for metric in result_dict_1: 286 is_different, p_value = AreSamplesDifferent(result_dict_1[metric], 287 result_dict_2[metric], 288 test, significance_level) 289 test_outcome_dict[metric] = (is_different, p_value) 290 291 return test_outcome_dict 292 293 294def ArePagesetBenchmarkResultsDifferent(result_dict_1, result_dict_2, test=MANN, 295 significance_level=0.05): 296 """Runs the given test on the results of each metric/page combination. 297 298 Checks if the dicts have been created from the same benchmark, i.e. if metric 299 names and pagesets match (e.g. metric first_non_empty_paint_time and page 300 Google.com). Then runs the specified statistical test on each metric/page 301 combination's sample to find if they vary significantly. 302 303 Args: 304 result_dict_1: Benchmark result dict 305 result_dict_2: Benchmark result dict 306 test: Statistical test that is used. 307 significance_level: The significance level the p-value is compared against. 308 309 Returns: 310 test_outcome_dict: Format {metric: {page: (bool is_different, p-value)}} 311 """ 312 AssertThatKeysMatch(result_dict_1, result_dict_2) 313 314 # Pagesets should also match. 315 for metric in result_dict_1.iterkeys(): 316 AssertThatKeysMatch(result_dict_1[metric], result_dict_2[metric]) 317 318 test_outcome_dict = {} 319 for metric in result_dict_1.iterkeys(): 320 test_outcome_dict[metric] = {} 321 for page in result_dict_1[metric]: 322 is_different, p_value = AreSamplesDifferent(result_dict_1[metric][page], 323 result_dict_2[metric][page], 324 test, significance_level) 325 test_outcome_dict[metric][page] = (is_different, p_value) 326 327 return test_outcome_dict 328