# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Statistical hypothesis testing for comparing benchmark results."""

try:
  import numpy as np
except ImportError:
  np = None

try:
  from scipy import stats
  import scipy.version
except ImportError:
  stats = None


MANN = 'mann'
KOLMOGOROV = 'kolmogorov'
WELCH = 'welch'
ALL_TEST_OPTIONS = [MANN, KOLMOGOROV, WELCH]


class DictMismatchError(Exception):
  """Provides exception for result dicts with mismatching keys/metrics."""
  def __str__(self):
    return ("Provided benchmark result dicts' keys/metrics do not match. "
            "Check if they have been created by the same benchmark.")


class SampleSizeError(Exception):
  """Provides exception for sample sizes too small for Mann-Whitney U-test."""
  def __str__(self):
    return ('At least one sample size is smaller than 20, which is too small '
            'for the Mann-Whitney U-test.')


class NonNormalSampleError(Exception):
  """Provides exception for samples that are not normally distributed."""
  def __str__(self):
    return ("At least one sample is not normally distributed, as required by "
            "Welch's t-test.")


def IsScipyMannTestOneSided():
  """Checks if the installed Scipy version is older than 0.17.0.

  Scipy 0.17.0 is the version in which stats.mannwhitneyu(...) changed from
  returning a one-sided to returning a two-sided p-value.
  """
  scipy_version = [int(num) for num in scipy.version.version.split('.')]
  return scipy_version[0] < 1 and scipy_version[1] < 17
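
# For example, with a pre-0.17.0 Scipy installed this returns True and
# AreSamplesDifferent below doubles the one-sided p-value it gets from
# stats.mannwhitneyu; with 0.17.0 or newer the p-value is used as-is.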


def GetChartsFromBenchmarkResultJson(benchmark_result_json):
  """Returns the 'charts' element from a given Chart JSON.

  Excludes entries that are not of type 'list_of_scalar_values' as well as
  empty entries. Also raises errors for an invalid JSON format or an empty
  'charts' element.

  Raises:
    ValueError: Provided chart JSON is either not valid or 'charts' is empty.
  """
  try:
    charts = benchmark_result_json['charts']
  except KeyError:
    raise ValueError('Invalid benchmark result format. Make sure input is a '
                     'Chart-JSON.\nProvided JSON:\n%s'
                     % repr(benchmark_result_json))
  if not charts:
    raise ValueError("Invalid benchmark result format. Dict entry 'charts' is "
                     "empty.")

  def IsValidPageContent(page_content):
    return (page_content['type'] == 'list_of_scalar_values' and
            'values' in page_content)

  def CreatePageDict(metric_content):
    return {page_name: page_content
            for page_name, page_content in metric_content.iteritems()
            if IsValidPageContent(page_content)}

  charts_valid_entries_only = {}
  for metric_name, metric_content in charts.iteritems():
    inner_page_dict = CreatePageDict(metric_content)
    if not inner_page_dict:
      continue
    charts_valid_entries_only[metric_name] = inner_page_dict

  return charts_valid_entries_only
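
# Illustrative sketch of the Chart-JSON 'charts' structure this module consumes
# (metric, page and value names below are made up; a real Chart-JSON is
# produced by Telemetry's run_benchmark with --output-format=chartjson):
#
#   benchmark_result_json = {
#       'charts': {
#           'tab_load_time': {
#               'http://www.example.com': {
#                   'type': 'list_of_scalar_values', 'values': [650, 700]},
#               'summary': {
#                   'type': 'list_of_scalar_values', 'values': [650, 700]},
#           },
#       },
#   }
#
# GetChartsFromBenchmarkResultJson(benchmark_result_json) would return the
# inner 'charts' dict, dropping any entries that are not list_of_scalar_values.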


def DoesChartJSONContainPageset(benchmark_result_json):
  """Checks if the given Chart JSON contains results for a pageset.

  A metric in a benchmark NOT containing a pageset has only two entries
  ('Only_page_in_this_benchmark' and 'summary', as opposed to 'Ex_page_1',
  'Ex_page_2', ..., and 'summary').
  """
  charts = GetChartsFromBenchmarkResultJson(benchmark_result_json)

  arbitrary_metric_in_charts = charts.itervalues().next()
  return len(arbitrary_metric_in_charts) > 2
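
# For example, a metric whose page dict looks like
# {'Only_page_in_this_benchmark': {...}, 'summary': {...}} has two entries, so
# this returns False; a pageset run with several pages plus 'summary' returns
# True. (Entry names are illustrative.)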


def CreateBenchmarkResultDict(benchmark_result_json):
  """Creates a dict of format {metric_name: list of benchmark results}.

  Takes a raw result Chart-JSON produced by 'run_benchmark' when using
  '--output-format=chartjson'.

  Args:
    benchmark_result_json: Benchmark result Chart-JSON produced by Telemetry.

  Returns:
    Dictionary of benchmark results.
    Example dict entry: 'tab_load_time': [650, 700, ...].
  """
  charts = GetChartsFromBenchmarkResultJson(benchmark_result_json)

  benchmark_result_dict = {}
  for metric_name, metric_content in charts.iteritems():
    benchmark_result_dict[metric_name] = metric_content['summary']['values']

  return benchmark_result_dict
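
# Illustrative usage (assumes 'import json' and that the file below is a
# Chart-JSON produced by run_benchmark; the path and values are hypothetical):
#
#   with open('/tmp/benchmark_result.json') as f:
#     result_dict = CreateBenchmarkResultDict(json.load(f))
#   # result_dict == {'tab_load_time': [650, 700, ...], ...}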


def CreatePagesetBenchmarkResultDict(benchmark_result_json):
  """Creates a dict of format {metric_name: {page_name: list of page results}}.

  Takes a raw result Chart-JSON produced by 'run_benchmark' when using
  '--output-format=chartjson' and when specifying a benchmark that has a
  pageset (e.g. top25mobile). Run 'DoesChartJSONContainPageset' to check if
  your Chart-JSON contains a pageset.

  Args:
    benchmark_result_json: Benchmark result Chart-JSON produced by Telemetry.

  Returns:
    Dictionary of benchmark results.
    Example dict entry: 'tab_load_time': {'Gmail.com': [650, 700, ...]}.
  """
  charts = GetChartsFromBenchmarkResultJson(benchmark_result_json)

  benchmark_result_dict = {}
  for metric_name, metric_content in charts.iteritems():
    benchmark_result_dict[metric_name] = {}
    for page_name, page_content in metric_content.iteritems():
      if page_name == 'summary':
        continue
      benchmark_result_dict[metric_name][page_name] = page_content['values']

  return benchmark_result_dict
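
# Illustrative usage for a benchmark that ran over a pageset (page names and
# values are made up):
#
#   pageset_dict = CreatePagesetBenchmarkResultDict(benchmark_result_json)
#   # pageset_dict == {'tab_load_time': {'Gmail.com': [650, 700, ...],
#   #                                    'Google.com': [430, 480, ...]}}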


def CombinePValues(p_values):
  """Combines p-values from a number of tests using Fisher's method.

  The tests from which the p-values originate must test the same null
  hypothesis and must be independent.

  Args:
    p_values: List of p-values.

  Returns:
    combined_p_value: The combined p-value according to Fisher's method.
  """
  # TODO (wierichs): Update to use scipy.stats.combine_pvalues(p_values) when
  # Scipy v0.15.0 becomes available as the standard version.
  if not np:
    raise ImportError('This function requires Numpy.')

  if not stats:
    raise ImportError('This function requires Scipy.')

  test_statistic = -2 * np.sum(np.log(p_values))
  p_value = stats.chi2.sf(test_statistic, 2 * len(p_values))
  return p_value
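
# Fisher's method computes the statistic X^2 = -2 * sum(ln(p_i)), which under
# the joint null hypothesis follows a chi-squared distribution with 2*k degrees
# of freedom for k independent p-values. Illustrative usage (input values are
# made up):
#
#   CombinePValues([0.04, 0.10, 0.50])  # => approximately 0.05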


def IsNormallyDistributed(sample, significance_level=0.05):
  """Runs the Shapiro-Wilk test for normality on a single sample.

  Note that normality is a requirement for Welch's t-test.

  Args:
    sample: List of values.
    significance_level: The significance level the p-value is compared against.

  Returns:
    is_normally_distributed: True if the hypothesis of normality cannot be
        rejected at the given significance level, False otherwise.
    p_value: The calculated p-value.
  """
  if not stats:
    raise ImportError('This function requires Scipy.')

  # pylint: disable=unbalanced-tuple-unpacking
  _, p_value = stats.shapiro(sample)

  is_normally_distributed = p_value >= significance_level
  return is_normally_distributed, p_value
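
# Illustrative usage (requires Scipy; sample values are made up):
#
#   is_normal, p = IsNormallyDistributed([10.1, 9.8, 10.3, 9.9, 10.0, 10.2])
#   # is_normal is False only if the Shapiro-Wilk p-value drops below 0.05.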


def AreSamplesDifferent(sample_1, sample_2, test=MANN,
                        significance_level=0.05):
  """Calculates the specified statistical test for the given samples.

  The null hypothesis for each test is that the two samples are drawn from the
  same population, i.e. that they do not differ significantly. All tests are
  two-tailed.

  Raises:
    ImportError: Scipy is not installed.
    SampleSizeError: Sample size is too small for MANN.
    NonNormalSampleError: Sample is not normally distributed as required by
        WELCH.

  Args:
    sample_1: First list of values.
    sample_2: Second list of values.
    test: Statistical test that is used.
    significance_level: The significance level the p-value is compared against.

  Returns:
    is_different: True or False, depending on the test outcome.
    p_value: The p-value the test has produced.
  """
  if not stats:
    raise ImportError('This function requires Scipy.')

  if test == MANN:
    if len(sample_1) < 20 or len(sample_2) < 20:
      raise SampleSizeError()
    try:
      _, p_value = stats.mannwhitneyu(sample_1, sample_2, use_continuity=True)
    except ValueError:
      # If the sum of ranks of values in |sample_1| and |sample_2| is equal,
      # scipy.stats.mannwhitneyu raises ValueError. Treat this as a 1.0 p-value
      # (indistinguishable).
      return (False, 1.0)

    if IsScipyMannTestOneSided():
      p_value = p_value * 2 if p_value < 0.5 else 1

  elif test == KOLMOGOROV:
    _, p_value = stats.ks_2samp(sample_1, sample_2)

  elif test == WELCH:
    if not (IsNormallyDistributed(sample_1, significance_level)[0] and
            IsNormallyDistributed(sample_2, significance_level)[0]):
      raise NonNormalSampleError()
    _, p_value = stats.ttest_ind(sample_1, sample_2, equal_var=False)
  # TODO: Add k-sample Anderson-Darling test.
  else:
    raise ValueError('Unknown test: %s. Must be one of %s.'
                     % (test, ALL_TEST_OPTIONS))

  is_different = p_value <= significance_level
  return is_different, p_value
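
# Illustrative usage (requires Scipy; the Mann-Whitney U-test needs at least 20
# values per sample, so the lists below stand in for real benchmark results):
#
#   is_different, p = AreSamplesDifferent(results_with_patch,
#                                         results_without_patch, test=MANN)
#   if is_different:
#     print 'Samples differ significantly (p = %.4f)' % p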


def AssertThatKeysMatch(result_dict_1, result_dict_2):
  """Raises an exception if benchmark dicts do not contain the same metrics."""
  if result_dict_1.viewkeys() != result_dict_2.viewkeys():
    raise DictMismatchError()


def AreBenchmarkResultsDifferent(result_dict_1, result_dict_2, test=MANN,
                                 significance_level=0.05):
  """Runs the given test on the results of each metric in the benchmarks.

  Checks if the dicts have been created from the same benchmark, i.e. if
  metric names match (e.g. first_non_empty_paint_time). Then runs the specified
  statistical test on each metric's samples to find if they vary significantly.

  Args:
    result_dict_1: Benchmark result dict of format {metric: list of values}.
    result_dict_2: Benchmark result dict of format {metric: list of values}.
    test: Statistical test that is used.
    significance_level: The significance level the p-value is compared against.

  Returns:
    test_outcome_dict: Format {metric: (bool is_different, p-value)}.
  """
  AssertThatKeysMatch(result_dict_1, result_dict_2)

  test_outcome_dict = {}
  for metric in result_dict_1:
    is_different, p_value = AreSamplesDifferent(result_dict_1[metric],
                                                result_dict_2[metric],
                                                test, significance_level)
    test_outcome_dict[metric] = (is_different, p_value)

  return test_outcome_dict
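
# Illustrative usage comparing two runs of the same benchmark (assumes
# 'import json'; file paths and the resulting numbers are hypothetical):
#
#   with open('/tmp/with_patch.json') as f_1, \
#        open('/tmp/without_patch.json') as f_2:
#     dict_1 = CreateBenchmarkResultDict(json.load(f_1))
#     dict_2 = CreateBenchmarkResultDict(json.load(f_2))
#   outcome = AreBenchmarkResultsDifferent(dict_1, dict_2, test=MANN)
#   # outcome == {'tab_load_time': (True, 0.004), ...}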


def ArePagesetBenchmarkResultsDifferent(result_dict_1, result_dict_2, test=MANN,
                                        significance_level=0.05):
  """Runs the given test on the results of each metric/page combination.

  Checks if the dicts have been created from the same benchmark, i.e. if metric
  names and pagesets match (e.g. metric first_non_empty_paint_time and page
  Google.com). Then runs the specified statistical test on each metric/page
  combination's samples to find if they vary significantly.

  Args:
    result_dict_1: Benchmark result dict of format
        {metric: {page: list of values}}.
    result_dict_2: Benchmark result dict of the same format as result_dict_1.
    test: Statistical test that is used.
    significance_level: The significance level the p-value is compared against.

  Returns:
    test_outcome_dict: Format {metric: {page: (bool is_different, p-value)}}.
  """
  AssertThatKeysMatch(result_dict_1, result_dict_2)

  # Pagesets should also match.
  for metric in result_dict_1.iterkeys():
    AssertThatKeysMatch(result_dict_1[metric], result_dict_2[metric])

  test_outcome_dict = {}
  for metric in result_dict_1.iterkeys():
    test_outcome_dict[metric] = {}
    for page in result_dict_1[metric]:
      is_different, p_value = AreSamplesDifferent(result_dict_1[metric][page],
                                                  result_dict_2[metric][page],
                                                  test, significance_level)
      test_outcome_dict[metric][page] = (is_different, p_value)

  return test_outcome_dict
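
# Illustrative usage, analogous to AreBenchmarkResultsDifferent but with
# per-page results (metric, page and p-values below are made up):
#
#   outcome = ArePagesetBenchmarkResultsDifferent(pageset_dict_1,
#                                                 pageset_dict_2, test=MANN)
#   # outcome == {'tab_load_time': {'Gmail.com': (False, 0.67), ...}, ...}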