#!/usr/bin/env python
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Runs a statistical hypothesis test on two given benchmark results.

Evaluates two benchmark results, given as Chart JSON files, to determine
whether they differ in a statistically significant way. This evaluation
should be run using Chart JSON files created by one of the available
benchmarks in tools/perf/run_benchmark.

A "benchmark" (e.g. startup.cold.blank_page) includes several "metrics" (e.g.
first_main_frame_load_time).
"""

from __future__ import print_function
import argparse
import json
import os
import sys

sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                '..')))
from statistical_analysis import results_stats


DEFAULT_SIGNIFICANCE_LEVEL = 0.05
DEFAULT_STATISTICAL_TEST = results_stats.MANN


def LoadJsonFromPath(json_path):
  """Returns the parsed JSON from the specified location."""
  with open(os.path.abspath(json_path)) as data_file:
    return json.load(data_file)


def PrintOutcomeLine(name, max_name_length, outcome, print_p_value):
  """Prints a single output line, e.g. 'metric_1 True 0.03'."""
  print('{:{}}{}'.format(name, max_name_length + 2, outcome[0]), end='')
  if print_p_value:
    print('\t{:.10f}'.format(outcome[1]), end='')
  print()


def PrintTestOutcome(test_outcome_dict, test_name, significance_level,
                     print_p_value):
  """Prints the given test outcomes to the command line.

  Prints the p-value for each metric's outcome if |print_p_value| is True,
  along with the name of the executed statistical test and the significance
  level.
  """
  print('Statistical analysis results (True=Performance difference likely)\n'
        '(Test: {}, Significance Level: {})\n'.format(test_name,
                                                      significance_level))

  max_metric_name_len = max([len(metric_name) for metric_name in
                             test_outcome_dict])

  for metric_name, outcome in test_outcome_dict.iteritems():
    PrintOutcomeLine(metric_name, max_metric_name_len, outcome, print_p_value)


def PrintPagesetTestOutcome(test_outcome_dict, test_name, significance_level,
                            print_p_value, print_details):
  """Prints the given test outcomes to the command line.

  Prints a summary combining the p-values of the pageset for each metric,
  then prints results for each metric/page combination if |print_details| is
  True.
  """
  print('Statistical analysis results (True=Performance difference likely)\n'
        '(Test: {}, Significance Level: {})\n'.format(test_name,
                                                      significance_level))

  # Print summarized version at the top.
  max_metric_name_len = max([len(metric_name) for metric_name in
                             test_outcome_dict])
  print('Summary (combined p-values for all pages in pageset):\n')
  for metric_name, pageset in test_outcome_dict.iteritems():
    combined_p_value = results_stats.CombinePValues([p[1] for p in
                                                     pageset.itervalues()])
    outcome = (combined_p_value < significance_level, combined_p_value)
    PrintOutcomeLine(metric_name, max_metric_name_len, outcome, print_p_value)
  print()

  if not print_details:
    return

  # Print outcome for every metric/page combination.
  for metric_name, pageset in test_outcome_dict.iteritems():
    max_page_name_len = max([len(page_name) for page_name in pageset])
    print('{}:'.format(metric_name))
    for page_name, page_outcome in pageset.iteritems():
      PrintOutcomeLine(page_name, max_page_name_len, page_outcome,
                       print_p_value)
    print()
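

# NOTE: The pageset summary above relies on results_stats.CombinePValues to
# collapse the per-page p-values for one metric into a single pageset-level
# p-value. The commented sketch below is an illustration only, assuming a
# Fisher-style combination (the actual method lives in
# statistical_analysis/results_stats.py and may differ). Fisher's method
# computes X^2 = -2 * sum(ln(p_i)) over k independent p-values; under the
# null hypothesis X^2 follows a chi-squared distribution with 2k degrees of
# freedom.
#
#   from scipy import stats
#
#   per_page_p_values = [0.04, 0.20, 0.11]  # hypothetical example data
#   _, combined_p_value = stats.combine_pvalues(per_page_p_values,
#                                               method='fisher')
#   print(combined_p_value < DEFAULT_SIGNIFICANCE_LEVEL)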


def main(args=None):
  """Sets up the parser and runs a statistical test on benchmark results.

  Sets up the command line parser and its arguments, loads the Chart JSONs
  from the given paths, runs the specified statistical hypothesis test on the
  results and prints the test outcomes.
  """
  if args is None:
    args = sys.argv[1:]

  parser = argparse.ArgumentParser(description="""Runs statistical
                                   significance tests on two given Chart JSON
                                   benchmark results produced by the Telemetry
                                   benchmarks.""")

  parser.add_argument(dest='json_paths', nargs=2,
                      help='Two Chart JSON file locations.')

  parser.add_argument('--significance', dest='significance_level',
                      default=DEFAULT_SIGNIFICANCE_LEVEL, type=float,
                      help="""The significance level is the type I error rate,
                      i.e. the probability of concluding that the benchmark
                      results are different although they're not. Default: {},
                      which is common in statistical hypothesis
                      testing.""".format(DEFAULT_SIGNIFICANCE_LEVEL))

  parser.add_argument('--statistical-test', dest='statistical_test',
                      default=DEFAULT_STATISTICAL_TEST,
                      choices=results_stats.ALL_TEST_OPTIONS,
                      help="""Specifies the statistical hypothesis test that
                      is used. Choices are: Mann-Whitney U-test,
                      Kolmogorov-Smirnov, Welch's t-test. Default:
                      Mann-Whitney U-test.""")

  parser.add_argument('-p', action='store_true', dest='print_p_value',
                      help="""If the -p flag is set, the output will include
                      the p-value for each metric.""")

  parser.add_argument('-d', action='store_true', dest='print_details',
                      help="""If the -d flag is set, the output will be more
                      detailed for benchmarks containing pagesets, giving
                      results for every metric/page combination after a
                      summary at the top.""")

  args = parser.parse_args(args)

  result_jsons = [LoadJsonFromPath(json_path) for json_path in args.json_paths]

  if (results_stats.DoesChartJSONContainPageset(result_jsons[0]) and
      results_stats.DoesChartJSONContainPageset(result_jsons[1])):
    # Benchmark containing a pageset.
    result_dict_1, result_dict_2 = (
        [results_stats.CreatePagesetBenchmarkResultDict(result_json)
         for result_json in result_jsons])
    test_outcome_dict = results_stats.ArePagesetBenchmarkResultsDifferent(
        result_dict_1, result_dict_2, args.statistical_test,
        args.significance_level)

    PrintPagesetTestOutcome(test_outcome_dict, args.statistical_test,
                            args.significance_level, args.print_p_value,
                            args.print_details)

  else:
    # Benchmark not containing a pageset.
    # (If only one JSON contains a pageset, results_stats raises an error.)
    result_dict_1, result_dict_2 = (
        [results_stats.CreateBenchmarkResultDict(result_json)
         for result_json in result_jsons])
    test_outcome_dict = (
        results_stats.AreBenchmarkResultsDifferent(result_dict_1,
                                                   result_dict_2,
                                                   args.statistical_test,
                                                   args.significance_level))

    PrintTestOutcome(test_outcome_dict, args.statistical_test,
                     args.significance_level, args.print_p_value)


if __name__ == '__main__':
  sys.exit(main())
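
# Example invocation (file names below are placeholders; the Chart JSON
# inputs come from runs of one of the benchmarks in tools/perf/run_benchmark):
#
#   ./compare_benchmark_results.py with_patch.json without_patch.json \
#       --significance=0.01 -p -d
#
# The three supported tests roughly correspond to the following scipy calls;
# this mapping is an assumption for illustration only, since results_stats
# owns the actual implementation:
#
#   from scipy import stats
#   _, p = stats.mannwhitneyu(sample_1, sample_2)      # Mann-Whitney U-test
#   _, p = stats.ks_2samp(sample_1, sample_2)          # Kolmogorov-Smirnov
#   _, p = stats.ttest_ind(sample_1, sample_2,
#                          equal_var=False)            # Welch's t-test
#   is_different = p < significance_level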