# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Statistical hypothesis testing for comparing benchmark results."""

try:
  import numpy as np
except ImportError:
  np = None

try:
  from scipy import stats
  import scipy.version
except ImportError:
  stats = None


MANN = 'mann'
KOLMOGOROV = 'kolmogorov'
WELCH = 'welch'
ALL_TEST_OPTIONS = [MANN, KOLMOGOROV, WELCH]


class DictMismatchError(Exception):
  """Provides exception for result dicts with mismatching keys/metrics."""
  def __str__(self):
    return ("Provided benchmark result dicts' keys/metrics do not match. "
            "Check if they have been created by the same benchmark.")


class SampleSizeError(Exception):
  """Provides exception for sample sizes too small for Mann-Whitney U-test."""
  def __str__(self):
    return ('At least one sample size is smaller than 20, which is too small '
            'for the Mann-Whitney U-test.')


class NonNormalSampleError(Exception):
  """Provides exception for samples that are not normally distributed."""
  def __str__(self):
    return ("At least one sample is not normally distributed, as required by "
            "Welch's t-test.")


def IsScipyMannTestOneSided():
  """Checks if the installed Scipy version is < 0.17.0.

  Scipy 0.17.0 changed stats.mannwhitneyu(...) from returning a one-sided to
  returning a two-sided p-value.
  """
  scipy_version = [int(num) for num in scipy.version.version.split('.')]
  return scipy_version[0] < 1 and scipy_version[1] < 17


def GetChartsFromBenchmarkResultJson(benchmark_result_json):
  """Returns the 'charts' element from a given Chart-JSON.

  Excludes entries that are not list_of_scalar_values and empty entries. Also
  raises errors for an invalid JSON format or an empty 'charts' element.

  Raises:
    ValueError: Provided Chart-JSON is either not valid or 'charts' is empty.
  """
  try:
    charts = benchmark_result_json['charts']
  except KeyError:
    raise ValueError('Invalid benchmark result format. Make sure input is a '
                     'Chart-JSON.\nProvided JSON:\n%s' %
                     repr(benchmark_result_json))
  if not charts:
    raise ValueError("Invalid benchmark result format. Dict entry 'charts' is "
                     "empty.")

  def IsValidPageContent(page_content):
    return (page_content['type'] == 'list_of_scalar_values' and
            'values' in page_content)

  def CreatePageDict(metric_content):
    return {page_name: page_content
            for page_name, page_content in metric_content.iteritems()
            if IsValidPageContent(page_content)}

  charts_valid_entries_only = {}
  for metric_name, metric_content in charts.iteritems():
    inner_page_dict = CreatePageDict(metric_content)
    if not inner_page_dict:
      continue
    charts_valid_entries_only[metric_name] = inner_page_dict

  return charts_valid_entries_only


def DoesChartJSONContainPageset(benchmark_result_json):
  """Checks if the given Chart-JSON contains results for a pageset.

  A metric in a benchmark NOT containing a pageset contains only two elements
  ("Only_page_in_this_benchmark" and "Summary"), as opposed to "Ex_page_1",
  "Ex_page_2", ..., and "Summary".
  """
  charts = GetChartsFromBenchmarkResultJson(benchmark_result_json)

  arbitrary_metric_in_charts = charts.itervalues().next()
  return len(arbitrary_metric_in_charts) > 2
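
# The comment below sketches the Chart-JSON shape the helpers above expect.
# It is an illustrative, hand-written example (the metric and page names are
# hypothetical), not output from a real benchmark run:
#
#   chart_json = {
#       'charts': {
#           'tab_load_time': {
#               'Gmail.com': {'type': 'list_of_scalar_values',
#                             'values': [650, 700, 720]},
#               'summary': {'type': 'list_of_scalar_values',
#                           'values': [650, 700, 720]},
#           }
#       }
#   }
#   GetChartsFromBenchmarkResultJson(chart_json)
#   # -> {'tab_load_time': {'Gmail.com': {...}, 'summary': {...}}}
#   DoesChartJSONContainPageset(chart_json)
#   # -> False (only two entries per metric, so no multi-page pageset).
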

def CreateBenchmarkResultDict(benchmark_result_json):
  """Creates a dict of format {metric_name: list of benchmark results}.

  Takes a raw result Chart-JSON produced when using '--output-format=chartjson'
  for 'run_benchmark'.

  Args:
    benchmark_result_json: Benchmark result Chart-JSON produced by Telemetry.

  Returns:
    Dictionary of benchmark results.
    Example dict entry: 'tab_load_time': [650, 700, ...].
  """
  charts = GetChartsFromBenchmarkResultJson(benchmark_result_json)

  benchmark_result_dict = {}
  for metric_name, metric_content in charts.iteritems():
    benchmark_result_dict[metric_name] = metric_content['summary']['values']

  return benchmark_result_dict


def CreatePagesetBenchmarkResultDict(benchmark_result_json):
  """Creates a dict of format {metric_name: {page_name: list of page results}}.

  Takes a raw result Chart-JSON produced by 'run_benchmark' when using
  '--output-format=chartjson' and when specifying a benchmark that has a
  pageset (e.g. top25mobile). Run 'DoesChartJSONContainPageset' to check if
  your Chart-JSON contains a pageset.

  Args:
    benchmark_result_json: Benchmark result Chart-JSON produced by Telemetry.

  Returns:
    Dictionary of benchmark results.
    Example dict entry: 'tab_load_time': {'Gmail.com': [650, 700, ...]}.
  """
  charts = GetChartsFromBenchmarkResultJson(benchmark_result_json)

  benchmark_result_dict = {}
  for metric_name, metric_content in charts.iteritems():
    benchmark_result_dict[metric_name] = {}
    for page_name, page_content in metric_content.iteritems():
      if page_name == 'summary':
        continue
      benchmark_result_dict[metric_name][page_name] = page_content['values']

  return benchmark_result_dict


def CombinePValues(p_values):
  """Combines p-values from a number of tests using Fisher's Method.

  The tests the p-values result from must test the same null hypothesis and
  be independent.

  Args:
    p_values: List of p-values.

  Returns:
    combined_p_value: Combined p-value according to Fisher's method.
  """
  # TODO(wierichs): Update to use scipy.stats.combine_pvalues(p_values) when
  # Scipy v0.15.0 becomes available as the standard version.
  if not np:
    raise ImportError('This function requires Numpy.')

  if not stats:
    raise ImportError('This function requires Scipy.')

  test_statistic = -2 * np.sum(np.log(p_values))
  p_value = stats.chi2.sf(test_statistic, 2 * len(p_values))
  return p_value


def IsNormallyDistributed(sample, significance_level=0.05):
  """Calculates the Shapiro-Wilk test for normality for a single sample.

  Note that normality is a requirement for Welch's t-test.

  Args:
    sample: List of values.
    significance_level: The significance level the p-value is compared against.

  Returns:
    is_normally_distributed: True or False.
    p_value: The calculated p-value.
  """
  if not stats:
    raise ImportError('This function requires Scipy.')

  # pylint: disable=unbalanced-tuple-unpacking
  _, p_value = stats.shapiro(sample)

  is_normally_distributed = p_value >= significance_level
  return is_normally_distributed, p_value
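
# Illustrative example for the two helpers above (the numbers are made up and
# Scipy is assumed to be installed):
#
#   CombinePValues([0.01, 0.20, 0.50])
#   # Fisher's method: X = -2 * (ln(0.01) + ln(0.20) + ln(0.50)) ~= 13.82,
#   # which is compared against a chi-squared distribution with
#   # 2 * 3 = 6 degrees of freedom, giving a combined p-value of ~0.032.
#
#   IsNormallyDistributed([650, 700, 720, 660, 690, 710])
#   # -> (True, p_value) when the Shapiro-Wilk p-value is >= 0.05.
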
""" if not stats: raise ImportError('This function requires Scipy.') if test == MANN: if len(sample_1) < 20 or len(sample_2) < 20: raise SampleSizeError() try: _, p_value = stats.mannwhitneyu(sample_1, sample_2, use_continuity=True) except ValueError: # If sum of ranks of values in |sample_1| and |sample_2| is equal, # scipy.stats.mannwhitneyu raises ValueError. Treat this as a 1.0 p-value # (indistinguishable). return (False, 1.0) if IsScipyMannTestOneSided(): p_value = p_value * 2 if p_value < 0.5 else 1 elif test == KOLMOGOROV: _, p_value = stats.ks_2samp(sample_1, sample_2) elif test == WELCH: if not (IsNormallyDistributed(sample_1, significance_level)[0] and IsNormallyDistributed(sample_2, significance_level)[0]): raise NonNormalSampleError() _, p_value = stats.ttest_ind(sample_1, sample_2, equal_var=False) # TODO: Add k sample anderson darling test is_different = p_value <= significance_level return is_different, p_value def AssertThatKeysMatch(result_dict_1, result_dict_2): """Raises an exception if benchmark dicts do not contain the same metrics.""" if result_dict_1.viewkeys() != result_dict_2.viewkeys(): raise DictMismatchError() def AreBenchmarkResultsDifferent(result_dict_1, result_dict_2, test=MANN, significance_level=0.05): """Runs the given test on the results of each metric in the benchmarks. Checks if the dicts have been created from the same benchmark, i.e. if metric names match (e.g. first_non_empty_paint_time). Then runs the specified statistical test on each metric's samples to find if they vary significantly. Args: result_dict_1: Benchmark result dict of format {metric: list of values}. result_dict_2: Benchmark result dict of format {metric: list of values}. test: Statistical test that is used. significance_level: The significance level the p-value is compared against. Returns: test_outcome_dict: Format {metric: (bool is_different, p-value)}. """ AssertThatKeysMatch(result_dict_1, result_dict_2) test_outcome_dict = {} for metric in result_dict_1: is_different, p_value = AreSamplesDifferent(result_dict_1[metric], result_dict_2[metric], test, significance_level) test_outcome_dict[metric] = (is_different, p_value) return test_outcome_dict def ArePagesetBenchmarkResultsDifferent(result_dict_1, result_dict_2, test=MANN, significance_level=0.05): """Runs the given test on the results of each metric/page combination. Checks if the dicts have been created from the same benchmark, i.e. if metric names and pagesets match (e.g. metric first_non_empty_paint_time and page Google.com). Then runs the specified statistical test on each metric/page combination's sample to find if they vary significantly. Args: result_dict_1: Benchmark result dict result_dict_2: Benchmark result dict test: Statistical test that is used. significance_level: The significance level the p-value is compared against. Returns: test_outcome_dict: Format {metric: {page: (bool is_different, p-value)}} """ AssertThatKeysMatch(result_dict_1, result_dict_2) # Pagesets should also match. for metric in result_dict_1.iterkeys(): AssertThatKeysMatch(result_dict_1[metric], result_dict_2[metric]) test_outcome_dict = {} for metric in result_dict_1.iterkeys(): test_outcome_dict[metric] = {} for page in result_dict_1[metric]: is_different, p_value = AreSamplesDifferent(result_dict_1[metric][page], result_dict_2[metric][page], test, significance_level) test_outcome_dict[metric][page] = (is_different, p_value) return test_outcome_dict