#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Runs a statistical hypothesis test on given benchmark results.

Evaluates two benchmark results, given as Chart JSON files, to determine
whether they are statistically significantly different. The Chart JSON files
should be created by one of the available benchmarks in
tools/perf/run_benchmark.

A "benchmark" (e.g. startup.cold.blank_page) includes several "metrics" (e.g.
first_main_frame_load_time).
"""

from __future__ import print_function
import argparse
import json
import os
import sys

sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                '..')))
from statistical_analysis import results_stats


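# results_stats.MANN selects the Mann-Whitney U test, the default for
# --statistical-test below; 0.05 is the conventional significance level.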
DEFAULT_SIGNIFICANCE_LEVEL = 0.05
DEFAULT_STATISTICAL_TEST = results_stats.MANN


def LoadJsonFromPath(json_path):
  """Returns the parsed JSON from the specified file location."""
  with open(os.path.abspath(json_path)) as data_file:
    return json.load(data_file)


def PrintOutcomeLine(name, max_name_length, outcome, print_p_value):
  """Prints a single output line, e.g. 'metric_1  True  0.03'."""
  print('{:{}}{}'.format(name, max_name_length + 2, outcome[0]), end='')
  if print_p_value:
    print('\t{:.10f}'.format(outcome[1]), end='')
  print()


def PrintTestOutcome(test_outcome_dict, test_name, significance_level,
                     print_p_value):
  """Prints the given test outcomes to the command line.

  Prints the name of the executed statistical test and the significance
  level, and includes each metric's p-value if |print_p_value| is True.
  """
  print('Statistical analysis results (True=Performance difference likely)\n'
        '(Test: {}, Significance Level: {})\n'.format(test_name,
                                                      significance_level))

  max_metric_name_len = max(len(metric_name) for metric_name in
                            test_outcome_dict)

  for metric_name, outcome in test_outcome_dict.items():
    PrintOutcomeLine(metric_name, max_metric_name_len, outcome, print_p_value)


def PrintPagesetTestOutcome(test_outcome_dict, test_name, significance_level,
                            print_p_value, print_details):
  """Prints the given test outcomes to the command line.

  Prints a summary that combines the p-values of all pages in the pageset for
  each metric, followed by the results for each metric/page combination if
  |print_details| is True.
  """
  print('Statistical analysis results (True=Performance difference likely)\n'
        '(Test: {}, Significance Level: {})\n'.format(test_name,
                                                      significance_level))

  # Print summarized version at the top.
  max_metric_name_len = max(len(metric_name) for metric_name in
                            test_outcome_dict)
  print('Summary (combined p-values for all pages in pageset):\n')
  for metric_name, pageset in test_outcome_dict.items():
    combined_p_value = results_stats.CombinePValues(
        [p[1] for p in pageset.values()])
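    # Combined outcome mirrors the per-page format: (is_different, p_value).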
    outcome = (combined_p_value < significance_level, combined_p_value)
    PrintOutcomeLine(metric_name, max_metric_name_len, outcome, print_p_value)
  print()

  if not print_details:
    return

  # Print outcome for every metric/page combination.
  for metric_name, pageset in test_outcome_dict.items():
    max_page_name_len = max(len(page_name) for page_name in pageset)
    print('{}:'.format(metric_name))
    for page_name, page_outcome in pageset.items():
      PrintOutcomeLine(page_name, max_page_name_len, page_outcome,
                       print_p_value)
    print()


def main(args=None):
  """Set up the parser and run a statistical test on given benchmark results.

  Set up the command line parser and its arguments, then load the Chart JSONs
  from the given paths, run the specified statistical hypothesis test on the
  results and print the test outcomes.
  """
  if args is None:
    args = sys.argv[1:]

  parser = argparse.ArgumentParser(description="""Runs statistical significance
                                   tests on two given Chart JSON benchmark
                                   results produced by the Telemetry
                                   benchmarks.""")

  parser.add_argument(dest='json_paths', nargs=2, help='JSON file location')

  parser.add_argument('--significance', dest='significance_level',
                      default=DEFAULT_SIGNIFICANCE_LEVEL, type=float,
                      help="""The significance level is the type I error rate,
                      which is the probability of determining that the
                      benchmark results are different when they are not.
                      Default: {}, which is common in statistical hypothesis
                      testing.""".format(DEFAULT_SIGNIFICANCE_LEVEL))

  parser.add_argument('--statistical-test', dest='statistical_test',
                      default=DEFAULT_STATISTICAL_TEST,
                      choices=results_stats.ALL_TEST_OPTIONS,
                      help="""Specifies the statistical hypothesis test that is
                      used. Choices are: Mann-Whitney U-test,
                      Kolmogorov-Smirnov, Welch's t-test. Default: Mann-Whitney
                      U-test.""")

  parser.add_argument('-p', action='store_true', dest='print_p_value',
                      help="""If the -p flag is set, the output will include
                      the p-value for each metric.""")

  parser.add_argument('-d', action='store_true', dest='print_details',
                      help="""If the -d flag is set, the output will be more
                      detailed for benchmarks containing pagesets, giving
                      results for every metric/page combination after a summary
                      at the top.""")

  args = parser.parse_args(args)

  result_jsons = [LoadJsonFromPath(json_path) for json_path in args.json_paths]

  if (results_stats.DoesChartJSONContainPageset(result_jsons[0]) and
      results_stats.DoesChartJSONContainPageset(result_jsons[1])):
    # Benchmark containing a pageset.
    result_dict_1, result_dict_2 = (
        [results_stats.CreatePagesetBenchmarkResultDict(result_json)
         for result_json in result_jsons])
    test_outcome_dict = results_stats.ArePagesetBenchmarkResultsDifferent(
        result_dict_1, result_dict_2, args.statistical_test,
        args.significance_level)

    PrintPagesetTestOutcome(test_outcome_dict, args.statistical_test,
                            args.significance_level, args.print_p_value,
                            args.print_details)

  else:
    # Benchmark not containing a pageset.
    # (If only one JSON contains a pageset, results_stats raises an error.)
    result_dict_1, result_dict_2 = (
        [results_stats.CreateBenchmarkResultDict(result_json)
         for result_json in result_jsons])
    test_outcome_dict = (
        results_stats.AreBenchmarkResultsDifferent(result_dict_1, result_dict_2,
                                                   args.statistical_test,
                                                   args.significance_level))

    PrintTestOutcome(test_outcome_dict, args.statistical_test,
                     args.significance_level, args.print_p_value)


if __name__ == '__main__':
  sys.exit(main())