1# Copyright 2015 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""A simplified change-point detection algorithm.
6
7Historically, the performance dashboard has used the GASP service for
8detection. Brett Schein wrote a simplified version of this algorithm
9for the dashboard in Matlab, and this was ported to Python by Dave Tu.
10
11The general goal is to find any increase or decrease which is likely to
12represent a real change in the underlying data source.
13
14See: http://en.wikipedia.org/wiki/Step_detection
15"""
16
17import collections
18
19from dashboard import find_step
20from dashboard import math_utils
21from dashboard import ttest
22
23# Maximum number of points to consider at one time.
24_MAX_WINDOW_SIZE = 50
25
26# Minimum number of points in a segment. This can help filter out erroneous
27# results by ignoring results that were found from looking at too few points.
28MIN_SEGMENT_SIZE = 6
29
30# Minimum absolute difference between medians before and after.
31_MIN_ABSOLUTE_CHANGE = 0
32
33# Minimum relative difference between medians before and after.
34_MIN_RELATIVE_CHANGE = 0.01
35
36# "Steppiness" is a number between 0 and 1 that indicates how similar the
37# shape is to a perfect step function, where 1 represents a step function.
38_MIN_STEPPINESS = 0.5
39
40# The "standard deviation" is based on a subset of points in the series.
41# This parameter is the minimum acceptable ratio of the relative change
42# and this standard deviation.
43_MULTIPLE_OF_STD_DEV = 2.5
44
45
46class ChangePoint(collections.namedtuple(
47    'ChangePoint', (
48        # The x-value of the first point after a step.
49        'x_value',
50        # Median of the segments before and after the change point.
51        'median_before', 'median_after',
52        # Number of points before and after the change point.
53        'size_before', 'size_after',
54        # X-values of the first and last point in the series window used.
55        'window_start', 'window_end',
56        # Relative change from before to after.
57        'relative_change',
58        # Standard deviation of points before.
59        'std_dev_before',
60        # Results of the Welch's t-test for values before and after.
61        't_statistic', 'degrees_of_freedom', 'p_value'
62    ))):
63  """A ChangePoint represents a change in a series -- a potential alert."""
64
65  def AsDict(self):
66    """Returns a dictionary mapping attributes to values."""
67    return self._asdict()
68
69
70def FindChangePoints(
71    series,
72    max_window_size=_MAX_WINDOW_SIZE,
73    min_segment_size=MIN_SEGMENT_SIZE,
74    min_absolute_change=_MIN_ABSOLUTE_CHANGE,
75    min_relative_change=_MIN_RELATIVE_CHANGE,
76    min_steppiness=_MIN_STEPPINESS,
77    multiple_of_std_dev=_MULTIPLE_OF_STD_DEV):
78  """Finds at most one change point in the given series.
79
80  Only the last |max_window_size| points are examined, regardless of
81  how many points are passed in. The reason why it might make sense to
82  limit the number of points to look at is that if there are multiple
83  change-points in the window that's looked at, then this function will
84  be less likely to find any of them.
85
86  First, the "most likely" split is found. The most likely split is defined as
87  a split that minimizes the sum of the variances on either side. Then the
88  split point is checked to see whether it passes the thresholds.
89
90  Args:
91    series: A list of (x, y) pairs.
92    max_window_size: Number of points to analyze.
93    min_segment_size: Min size of segments before or after change point.
94    min_absolute_change: Absolute change threshold.
95    min_relative_change: Relative change threshold.
96    min_steppiness: Threshold for how similar to a step a change point must be.
97    multiple_of_std_dev: Threshold for change as multiple of std. deviation.
98
99  Returns:
100    A list with one ChangePoint object, or an empty list.
101  """
102  if len(series) < 2:
103    return []  # Not enough points to possibly contain a valid split point.
104  series = series[-max_window_size:]
105  _, y_values = zip(*series)
106  split_index = _FindSplit(y_values)
107  if _PassesThresholds(
108      y_values, split_index,
109      min_segment_size=min_segment_size,
110      min_absolute_change=min_absolute_change,
111      min_relative_change=min_relative_change,
112      min_steppiness=min_steppiness,
113      multiple_of_std_dev=multiple_of_std_dev):
114    return [MakeChangePoint(series, split_index)]
115  return []
116
117
118def MakeChangePoint(series, split_index):
119  """Makes a ChangePoint object for the given series at the given point.
120
121  Args:
122    series: A list of (x, y) pairs.
123    split_index: Index of the first point after the split.
124
125  Returns:
126    A ChangePoint object.
127  """
128  assert 0 <= split_index < len(series)
129  x_values, y_values = zip(*series)
130  left, right = y_values[:split_index], y_values[split_index:]
131  left_median, right_median = math_utils.Median(left), math_utils.Median(right)
132  ttest_results = ttest.WelchsTTest(left, right)
133  return ChangePoint(
134      x_value=x_values[split_index],
135      median_before=left_median,
136      median_after=right_median,
137      size_before=len(left),
138      size_after=len(right),
139      window_start=x_values[0],
140      window_end=x_values[-1],  # inclusive bound
141      relative_change=math_utils.RelativeChange(left_median, right_median),
142      std_dev_before=math_utils.StandardDeviation(left),
143      t_statistic=ttest_results.t,
144      degrees_of_freedom=ttest_results.df,
145      p_value=ttest_results.p)
146
147
148def _FindSplit(values):
149  """Finds the index of the "most interesting" split of a sample of data.
150
151  Currently, the most interesting split is considered to be the split that
152  minimizes the standard deviation of the two sides concatenated together
153  (after modifying both sides by shifting all the numbers in the left and
154  right sides by the median of the left and right sides respectively).
155
156  The reason why this is done is that normalizing the two segments on either
157  side of a point so that both have the same center essentially removes any
158  jump or step that occurs at that point.
159
160  Args:
161    values: A list of numbers.
162
163  Returns:
164    The index of the "most interesting" point.
165  """
166  def StdDevOfTwoNormalizedSides(index):
167    left, right = values[:index], values[index:]
168    return math_utils.StandardDeviation(_ZeroMedian(left) + _ZeroMedian(right))
169  return min(range(1, len(values)), key=StdDevOfTwoNormalizedSides)
170
171
172def _ZeroMedian(values):
173  """Subtracts the median value in the list from all values in the list."""
174  median = math_utils.Median(values)
175  return [val - median for val in values]
176
177
178def _PassesThresholds(
179    values, split_index, min_segment_size, min_absolute_change,
180    min_relative_change, min_steppiness, multiple_of_std_dev):
181  """Checks whether a point in a series appears to be an change point.
182
183  Args:
184    values: A list of numbers.
185    split_index: An index in the list of numbers.
186    min_segment_size: Threshold for size of segments before or after a point.
187    min_absolute_change: Minimum absolute median change threshold.
188    min_relative_change: Minimum relative median change threshold.
189    min_steppiness: Threshold for how similar to a step a change point must be.
190    multiple_of_std_dev: Threshold for change as multiple of std. deviation.
191
192  Returns:
193    True if it passes all of the thresholds, False otherwise.
194  """
195  left, right = values[:split_index], values[split_index:]
196  left_median, right_median = math_utils.Median(left), math_utils.Median(right)
197
198  # 1. Segment size filter.
199  if len(left) < min_segment_size or len(right) < min_segment_size:
200    return False
201
202  # 2. Absolute change filter.
203  absolute_change = abs(left_median - right_median)
204  if absolute_change < min_absolute_change:
205    return False
206
207  # 3. Relative change filter.
208  relative_change = math_utils.RelativeChange(left_median, right_median)
209  if relative_change < min_relative_change:
210    return False
211
212  # 4. Multiple of standard deviation filter.
213  min_std_dev = min(
214      math_utils.StandardDeviation(left),
215      math_utils.StandardDeviation(right))
216  if absolute_change < multiple_of_std_dev * min_std_dev:
217    return False
218
219  # 5. Steppiness filter.
220  steppiness = find_step.Steppiness(values, split_index)
221  if steppiness < min_steppiness:
222    return False
223
224  # Passed all filters!
225  return True
226