1# Copyright 2015 The Chromium Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5"""A simplified change-point detection algorithm. 6 7Historically, the performance dashboard has used the GASP service for 8detection. Brett Schein wrote a simplified version of this algorithm 9for the dashboard in Matlab, and this was ported to Python by Dave Tu. 10 11The general goal is to find any increase or decrease which is likely to 12represent a real change in the underlying data source. 13 14See: http://en.wikipedia.org/wiki/Step_detection 15""" 16 17import collections 18 19from dashboard import find_step 20from dashboard import math_utils 21from dashboard import ttest 22 23# Maximum number of points to consider at one time. 24_MAX_WINDOW_SIZE = 50 25 26# Minimum number of points in a segment. This can help filter out erroneous 27# results by ignoring results that were found from looking at too few points. 28MIN_SEGMENT_SIZE = 6 29 30# Minimum absolute difference between medians before and after. 31_MIN_ABSOLUTE_CHANGE = 0 32 33# Minimum relative difference between medians before and after. 34_MIN_RELATIVE_CHANGE = 0.01 35 36# "Steppiness" is a number between 0 and 1 that indicates how similar the 37# shape is to a perfect step function, where 1 represents a step function. 38_MIN_STEPPINESS = 0.5 39 40# The "standard deviation" is based on a subset of points in the series. 41# This parameter is the minimum acceptable ratio of the relative change 42# and this standard deviation. 43_MULTIPLE_OF_STD_DEV = 2.5 44 45 46class ChangePoint(collections.namedtuple( 47 'ChangePoint', ( 48 # The x-value of the first point after a step. 49 'x_value', 50 # Median of the segments before and after the change point. 51 'median_before', 'median_after', 52 # Number of points before and after the change point. 53 'size_before', 'size_after', 54 # X-values of the first and last point in the series window used. 55 'window_start', 'window_end', 56 # Relative change from before to after. 57 'relative_change', 58 # Standard deviation of points before. 59 'std_dev_before', 60 # Results of the Welch's t-test for values before and after. 61 't_statistic', 'degrees_of_freedom', 'p_value' 62 ))): 63 """A ChangePoint represents a change in a series -- a potential alert.""" 64 65 def AsDict(self): 66 """Returns a dictionary mapping attributes to values.""" 67 return self._asdict() 68 69 70def FindChangePoints( 71 series, 72 max_window_size=_MAX_WINDOW_SIZE, 73 min_segment_size=MIN_SEGMENT_SIZE, 74 min_absolute_change=_MIN_ABSOLUTE_CHANGE, 75 min_relative_change=_MIN_RELATIVE_CHANGE, 76 min_steppiness=_MIN_STEPPINESS, 77 multiple_of_std_dev=_MULTIPLE_OF_STD_DEV): 78 """Finds at most one change point in the given series. 79 80 Only the last |max_window_size| points are examined, regardless of 81 how many points are passed in. The reason why it might make sense to 82 limit the number of points to look at is that if there are multiple 83 change-points in the window that's looked at, then this function will 84 be less likely to find any of them. 85 86 First, the "most likely" split is found. The most likely split is defined as 87 a split that minimizes the sum of the variances on either side. Then the 88 split point is checked to see whether it passes the thresholds. 89 90 Args: 91 series: A list of (x, y) pairs. 92 max_window_size: Number of points to analyze. 93 min_segment_size: Min size of segments before or after change point. 94 min_absolute_change: Absolute change threshold. 95 min_relative_change: Relative change threshold. 96 min_steppiness: Threshold for how similar to a step a change point must be. 97 multiple_of_std_dev: Threshold for change as multiple of std. deviation. 98 99 Returns: 100 A list with one ChangePoint object, or an empty list. 101 """ 102 if len(series) < 2: 103 return [] # Not enough points to possibly contain a valid split point. 104 series = series[-max_window_size:] 105 _, y_values = zip(*series) 106 split_index = _FindSplit(y_values) 107 if _PassesThresholds( 108 y_values, split_index, 109 min_segment_size=min_segment_size, 110 min_absolute_change=min_absolute_change, 111 min_relative_change=min_relative_change, 112 min_steppiness=min_steppiness, 113 multiple_of_std_dev=multiple_of_std_dev): 114 return [MakeChangePoint(series, split_index)] 115 return [] 116 117 118def MakeChangePoint(series, split_index): 119 """Makes a ChangePoint object for the given series at the given point. 120 121 Args: 122 series: A list of (x, y) pairs. 123 split_index: Index of the first point after the split. 124 125 Returns: 126 A ChangePoint object. 127 """ 128 assert 0 <= split_index < len(series) 129 x_values, y_values = zip(*series) 130 left, right = y_values[:split_index], y_values[split_index:] 131 left_median, right_median = math_utils.Median(left), math_utils.Median(right) 132 ttest_results = ttest.WelchsTTest(left, right) 133 return ChangePoint( 134 x_value=x_values[split_index], 135 median_before=left_median, 136 median_after=right_median, 137 size_before=len(left), 138 size_after=len(right), 139 window_start=x_values[0], 140 window_end=x_values[-1], # inclusive bound 141 relative_change=math_utils.RelativeChange(left_median, right_median), 142 std_dev_before=math_utils.StandardDeviation(left), 143 t_statistic=ttest_results.t, 144 degrees_of_freedom=ttest_results.df, 145 p_value=ttest_results.p) 146 147 148def _FindSplit(values): 149 """Finds the index of the "most interesting" split of a sample of data. 150 151 Currently, the most interesting split is considered to be the split that 152 minimizes the standard deviation of the two sides concatenated together 153 (after modifying both sides by shifting all the numbers in the left and 154 right sides by the median of the left and right sides respectively). 155 156 The reason why this is done is that normalizing the two segments on either 157 side of a point so that both have the same center essentially removes any 158 jump or step that occurs at that point. 159 160 Args: 161 values: A list of numbers. 162 163 Returns: 164 The index of the "most interesting" point. 165 """ 166 def StdDevOfTwoNormalizedSides(index): 167 left, right = values[:index], values[index:] 168 return math_utils.StandardDeviation(_ZeroMedian(left) + _ZeroMedian(right)) 169 return min(range(1, len(values)), key=StdDevOfTwoNormalizedSides) 170 171 172def _ZeroMedian(values): 173 """Subtracts the median value in the list from all values in the list.""" 174 median = math_utils.Median(values) 175 return [val - median for val in values] 176 177 178def _PassesThresholds( 179 values, split_index, min_segment_size, min_absolute_change, 180 min_relative_change, min_steppiness, multiple_of_std_dev): 181 """Checks whether a point in a series appears to be an change point. 182 183 Args: 184 values: A list of numbers. 185 split_index: An index in the list of numbers. 186 min_segment_size: Threshold for size of segments before or after a point. 187 min_absolute_change: Minimum absolute median change threshold. 188 min_relative_change: Minimum relative median change threshold. 189 min_steppiness: Threshold for how similar to a step a change point must be. 190 multiple_of_std_dev: Threshold for change as multiple of std. deviation. 191 192 Returns: 193 True if it passes all of the thresholds, False otherwise. 194 """ 195 left, right = values[:split_index], values[split_index:] 196 left_median, right_median = math_utils.Median(left), math_utils.Median(right) 197 198 # 1. Segment size filter. 199 if len(left) < min_segment_size or len(right) < min_segment_size: 200 return False 201 202 # 2. Absolute change filter. 203 absolute_change = abs(left_median - right_median) 204 if absolute_change < min_absolute_change: 205 return False 206 207 # 3. Relative change filter. 208 relative_change = math_utils.RelativeChange(left_median, right_median) 209 if relative_change < min_relative_change: 210 return False 211 212 # 4. Multiple of standard deviation filter. 213 min_std_dev = min( 214 math_utils.StandardDeviation(left), 215 math_utils.StandardDeviation(right)) 216 if absolute_change < multiple_of_std_dev * min_std_dev: 217 return False 218 219 # 5. Steppiness filter. 220 steppiness = find_step.Steppiness(values, split_index) 221 if steppiness < min_steppiness: 222 return False 223 224 # Passed all filters! 225 return True 226